diff --git a/config b/config index 66566be..80519ae 100644 --- a/config +++ b/config @@ -151,7 +151,6 @@ CONFIG_SCHED_CORE=y # # CPU/Task time and stats accounting # -CONFIG_TICK_CPU_ACCOUNTING=n CONFIG_VIRT_CPU_ACCOUNTING=y CONFIG_VIRT_CPU_ACCOUNTING_GEN=y CONFIG_IRQ_TIME_ACCOUNTING=y @@ -542,7 +541,7 @@ CONFIG_X86_INTEL_TSX_MODE_AUTO=y CONFIG_X86_SGX=y CONFIG_EFI=y CONFIG_EFI_STUB=y -# CONFIG_EFI_HANDOVER_PROTOCOL is not set +CONFIG_EFI_HANDOVER_PROTOCOL=y CONFIG_EFI_MIXED=y # CONFIG_EFI_FAKE_MEMMAP is not set CONFIG_EFI_RUNTIME_MAP=y @@ -3409,6 +3408,7 @@ CONFIG_MICROSOFT_MANA=m CONFIG_NET_VENDOR_MYRI=y CONFIG_MYRI10GE=m CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m CONFIG_NET_VENDOR_NI=y CONFIG_NI_XGE_MANAGEMENT_ENET=m CONFIG_NET_VENDOR_NATSEMI=y @@ -5880,6 +5880,7 @@ CONFIG_VIDEO_V4L2_SUBDEV_API=y # CONFIG_VIDEO_ADV_DEBUG is not set # CONFIG_VIDEO_FIXED_MINOR_RANGES is not set CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_LOOPBACK=m CONFIG_V4L2_MEM2MEM_DEV=m CONFIG_V4L2_FLASH_LED_CLASS=m CONFIG_V4L2_FWNODE=m @@ -7105,6 +7106,7 @@ CONFIG_SND_HDA_CODEC_SI3054=m CONFIG_SND_HDA_GENERIC=m CONFIG_SND_HDA_POWER_SAVE_DEFAULT=1 CONFIG_SND_HDA_INTEL_HDMI_SILENT_STREAM=y +# CONFIG_SND_HDA_CTL_DEV_ID is not set # end of HD-Audio CONFIG_SND_HDA_CORE=m @@ -9002,7 +9004,6 @@ CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y CONFIG_THINKPAD_ACPI_VIDEO=y CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y CONFIG_THINKPAD_LMI=m -CONFIG_LEGION_LAPTOP=m CONFIG_INTEL_ATOMISP2_PDX86=y CONFIG_INTEL_ATOMISP2_LED=m CONFIG_INTEL_IFS=m @@ -11441,4 +11442,4 @@ CONFIG_MEMTEST=y # Rust hacking # # end of Rust hacking -# end of Kernel hacking +# end of Kernel hacking \ No newline at end of file diff --git a/patches/0001-cachy-all.patch b/patches/0001-cachy-all.patch index cf8faa9..c1c40d6 100644 --- a/patches/0001-cachy-all.patch +++ b/patches/0001-cachy-all.patch @@ -1,7 +1,7 @@ -From 349ff8d17d3501ab92ba911463a539cdaa50faa7 Mon Sep 17 00:00:00 2001 +From d7322fe0d4d120555d7dd3c2a6167f7f726b8738 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 15 Jan 2023 16:50:23 +0100 -Subject: [PATCH 01/15] bbr2 +Date: Fri, 10 Mar 2023 17:59:47 +0100 +Subject: [PATCH 01/16] bbr2 Signed-off-by: Peter Jung --- @@ -3281,18 +3281,18 @@ index cb79127f45c3..70e4de876a7f 100644 event = icsk->icsk_pending; -- -2.39.2 +2.40.0.rc2 -From 867183d5c6eadbbff94e6b03e03e9959787d47a6 Mon Sep 17 00:00:00 2001 +From 87439b08ac56036539528efb6da691914f41ca76 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 13 Feb 2023 09:23:53 +0100 -Subject: [PATCH 02/15] bfq +Date: Fri, 10 Mar 2023 18:00:04 +0100 +Subject: [PATCH 02/16] bfq Signed-off-by: Peter Jung --- block/bfq-cgroup.c | 101 ++++--- - block/bfq-iosched.c | 629 ++++++++++++++++++++++++++++-------------- - block/bfq-iosched.h | 144 +++++++--- + block/bfq-iosched.c | 637 ++++++++++++++++++++++++++++-------------- + block/bfq-iosched.h | 144 ++++++++-- block/bfq-wf2q.c | 2 +- block/blk-cgroup.c | 122 ++++---- block/blk-cgroup.h | 10 +- @@ -3301,7 +3301,7 @@ Signed-off-by: Peter Jung block/blk-rq-qos.h | 2 +- block/blk-throttle.c | 16 +- block/blk.h | 6 - - 11 files changed, 743 insertions(+), 386 deletions(-) + 11 files changed, 747 insertions(+), 390 deletions(-) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 0fbde0fc0628..59929dfd559b 100644 @@ -3448,7 +3448,7 @@ index 0fbde0fc0628..59929dfd559b 100644 } diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 380e9bda2e57..c330ff5fde4c 100644 +index 380e9bda2e57..aa644973d260 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -377,20 +377,23 @@ static 
const unsigned long bfq_late_stable_merging = 600; @@ -3756,7 +3756,7 @@ index 380e9bda2e57..c330ff5fde4c 100644 list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) bfq_bfqq_end_wr(bfqq); bfq_end_wr_async(bfqd); -@@ -2794,6 +2847,35 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, +@@ -2794,6 +2847,40 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, struct bfq_queue *bfqq); @@ -3767,11 +3767,11 @@ index 380e9bda2e57..c330ff5fde4c 100644 +{ + int proc_ref = min(bfqq_process_refs(bfqq), + bfqq_process_refs(stable_merge_bfqq)); -+ struct bfq_queue *new_bfqq; ++ struct bfq_queue *new_bfqq = NULL; + -+ if (idling_boosts_thr_without_issues(bfqd, bfqq) || -+ proc_ref == 0) -+ return NULL; ++ bfqq_data->stable_merge_bfqq = NULL; ++ if (idling_boosts_thr_without_issues(bfqd, bfqq) || proc_ref == 0) ++ goto out; + + /* next function will take at least one ref */ + new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); @@ -3786,13 +3786,18 @@ index 380e9bda2e57..c330ff5fde4c 100644 + new_bfqq_data->stably_merged = true; + } + } ++ ++out: ++ /* deschedule stable merge, because done or aborted here */ ++ bfq_put_stable_ref(stable_merge_bfqq); ++ + return new_bfqq; +} + /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the scheduled queues. Return -@@ -2819,6 +2901,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, +@@ -2819,6 +2906,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, void *io_struct, bool request, struct bfq_io_cq *bic) { struct bfq_queue *in_service_bfqq, *new_bfqq; @@ -3801,7 +3806,7 @@ index 380e9bda2e57..c330ff5fde4c 100644 /* if a merge has already been setup, then proceed with that first */ if (bfqq->new_bfqq) -@@ -2840,37 +2924,23 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, +@@ -2840,37 +2929,18 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, * stable merging) also if bic is associated with a * sync queue, but this bfqq is async */ @@ -3816,11 +3821,10 @@ index 380e9bda2e57..c330ff5fde4c 100644 - bic->stable_merge_bfqq; - int proc_ref = min(bfqq_process_refs(bfqq), - bfqq_process_refs(stable_merge_bfqq)); -+ bfqq_data->stable_merge_bfqq; - - /* deschedule stable merge, because done or aborted here */ - bfq_put_stable_ref(stable_merge_bfqq); - +- +- /* deschedule stable merge, because done or aborted here */ +- bfq_put_stable_ref(stable_merge_bfqq); +- - bic->stable_merge_bfqq = NULL; - - if (!idling_boosts_thr_without_issues(bfqd, bfqq) && @@ -3838,7 +3842,7 @@ index 380e9bda2e57..c330ff5fde4c 100644 - return new_bfqq; - } else - return NULL; -+ bfqq_data->stable_merge_bfqq = NULL; ++ bfqq_data->stable_merge_bfqq; + + return bfq_setup_stable_merge(bfqd, bfqq, + stable_merge_bfqq, @@ -4032,10 +4036,10 @@ index 380e9bda2e57..c330ff5fde4c 100644 } } + } -+ -+ return NULL; -+} -+ + + return NULL; + } + +static struct bfq_queue * +bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) +{ @@ -4052,10 +4056,10 @@ index 380e9bda2e57..c330ff5fde4c 100644 + return bfqq; + } + } - - return NULL; - } - ++ ++ return NULL; ++} ++ +/* + * Perform a linear scan of each actuator, until an actuator is found + * for which the following three conditions hold: the load of the @@ -5250,10 +5254,10 @@ index 1e94e404eaa8..fe09e8b4c2a8 100644 /** * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg diff --git 
a/block/blk-iocost.c b/block/blk-iocost.c -index 6955605629e4..22a3639a7a05 100644 +index ec7219caea16..c31d57e29bf8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c -@@ -3091,9 +3091,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, +@@ -3096,9 +3096,11 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, return nbytes; } @@ -5267,7 +5271,7 @@ index 6955605629e4..22a3639a7a05 100644 iocg = blkg_to_iocg(ctx.blkg); -@@ -3112,12 +3114,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, +@@ -3117,12 +3119,14 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); @@ -5285,7 +5289,7 @@ index 6955605629e4..22a3639a7a05 100644 } static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, -@@ -3172,19 +3176,22 @@ static const match_table_t qos_tokens = { +@@ -3177,19 +3181,22 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { @@ -5314,7 +5318,7 @@ index 6955605629e4..22a3639a7a05 100644 ioc = q_to_ioc(disk->queue); if (!ioc) { ret = blk_iocost_init(disk); -@@ -3201,7 +3208,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, +@@ -3206,7 +3213,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, enable = ioc->enabled; user = ioc->user_qos_params; @@ -5323,7 +5327,7 @@ index 6955605629e4..22a3639a7a05 100644 substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; -@@ -3290,7 +3297,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, +@@ -3295,7 +3302,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); @@ -5332,7 +5336,7 @@ index 6955605629e4..22a3639a7a05 100644 return nbytes; einval: spin_unlock_irq(&ioc->lock); -@@ -3300,7 +3307,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, +@@ -3305,7 +3312,7 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ret = -EINVAL; err: @@ -5341,7 +5345,7 @@ index 6955605629e4..22a3639a7a05 100644 return ret; } -@@ -3351,22 +3358,25 @@ static const match_table_t i_lcoef_tokens = { +@@ -3356,22 +3363,25 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { @@ -5374,7 +5378,7 @@ index 6955605629e4..22a3639a7a05 100644 if (ret) goto err; ioc = q_to_ioc(q); -@@ -3379,7 +3389,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, +@@ -3384,7 +3394,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; @@ -5383,7 +5387,7 @@ index 6955605629e4..22a3639a7a05 100644 substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; -@@ -3426,7 +3436,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, +@@ -3431,7 +3441,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); @@ -5392,7 +5396,7 @@ index 6955605629e4..22a3639a7a05 100644 return nbytes; einval: -@@ -3437,7 +3447,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, +@@ -3442,7 +3452,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ret = -EINVAL; err: 
@@ -5561,12 +5565,12 @@ index 4c3b3325219a..78f1706cddca 100644 void disk_free_zone_bitmaps(struct gendisk *disk); void disk_clear_zone_settings(struct gendisk *disk); -- -2.39.2 +2.40.0.rc2 -From 6f60a56132a8b4f7d72e8b720cd16e76b4afbe0d Mon Sep 17 00:00:00 2001 +From e44295cea72d5cefc97900011495f89f000873ac Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 13 Feb 2023 11:26:20 +0100 -Subject: [PATCH 03/15] bitmap +Subject: [PATCH 03/16] bitmap Signed-off-by: Peter Jung --- @@ -6912,12 +6916,12 @@ index bb0ee80526b2..8c04254c5284 100644 #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) int w, bit; -- -2.39.2 +2.40.0.rc2 -From 6410241f76741f457037edfe776d47fff19f7d8c Mon Sep 17 00:00:00 2001 +From 5d1ae6ec70d7e64ac75501503e3dcf229e0942fb Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 25 Feb 2023 14:40:50 +0100 -Subject: [PATCH 04/15] cachy +Date: Sat, 11 Mar 2023 14:42:34 +0100 +Subject: [PATCH 04/16] cachy Signed-off-by: Peter Jung --- @@ -6950,7 +6954,7 @@ Signed-off-by: Peter Jung drivers/i2c/busses/i2c-nct6775.c | 647 ++++++++++++++++++ drivers/i2c/busses/i2c-piix4.c | 4 +- drivers/md/dm-crypt.c | 5 + - drivers/pci/quirks.c | 101 +++ + drivers/pci/quirks.c | 103 ++- drivers/platform/x86/Kconfig | 14 + drivers/platform/x86/Makefile | 3 + drivers/platform/x86/steamdeck.c | 523 ++++++++++++++ @@ -6982,7 +6986,7 @@ Signed-off-by: Peter Jung net/ipv4/tcp_ipv4.c | 2 + scripts/Makefile.lib | 13 +- scripts/Makefile.modinst | 7 +- - 61 files changed, 2200 insertions(+), 74 deletions(-) + 61 files changed, 2200 insertions(+), 76 deletions(-) create mode 100644 arch/x86/Makefile.postlink create mode 100644 drivers/i2c/busses/i2c-nct6775.c create mode 100644 drivers/platform/x86/steamdeck.c @@ -7041,7 +7045,7 @@ index 352ff53a2306..7c210744d84c 100644 vmlinuz voffset.h diff --git a/Makefile b/Makefile -index f26824f367a9..0fe8877f9616 100644 +index 1a1d63f2a9ed..9caed88238ab 100644 --- a/Makefile +++ b/Makefile @@ -834,6 +834,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) @@ -8734,7 +8738,7 @@ index 2653516bcdef..973fe8f80051 100644 if (ret < 0) goto bad; diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index 285acc4aaccc..492e88a99c07 100644 +index 494fa46f5767..bcdfc072cbfb 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3612,6 +3612,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) @@ -8844,10 +8848,12 @@ index 285acc4aaccc..492e88a99c07 100644 /* * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be * prevented for those affected devices. -@@ -4980,6 +5080,7 @@ static const struct pci_dev_acs_enabled { +@@ -5000,8 +5100,7 @@ static const struct pci_dev_acs_enabled { { PCI_VENDOR_ID_NXP, 0x8d9b, pci_quirk_nxp_rp_acs }, /* Zhaoxin Root/Downstream Ports */ { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, +- /* Wangxun nics */ +- { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, + { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, { 0 } }; @@ -9828,7 +9834,7 @@ index 0f8736991427..86a988c830ef 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 137d4abe3eda..98e2d9cc8491 100644 +index 1c240d2c99bc..98e1a7472fd2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -96,6 +96,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); @@ -9841,7 +9847,7 @@ index 137d4abe3eda..98e2d9cc8491 100644 static const int ngroups_max = NGROUPS_MAX; -@@ -1640,6 +1643,15 @@ static struct ctl_table kern_table[] = { +@@ -1645,6 +1648,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, @@ -10229,37 +10235,50 @@ index 4815a8e32227..6a3c36713045 100644 $(dst)/%.ko.gz: $(dst)/%.ko FORCE $(call cmd,gzip) -- -2.39.2 +2.40.0.rc2 -From 993543a17f59dc2ef259242455c5d2d0810a76df Mon Sep 17 00:00:00 2001 +From 0e45a02aaaa398cc0465a407331459f28cdb1ae9 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sun, 15 Jan 2023 16:51:11 +0100 -Subject: [PATCH 05/15] clr +Date: Fri, 10 Mar 2023 18:00:48 +0100 +Subject: [PATCH 05/16] clr Signed-off-by: Peter Jung --- - arch/x86/kernel/tsc.c | 3 ++ - arch/x86/mm/fault.c | 4 +- - drivers/cpufreq/intel_pstate.c | 7 ++++ - drivers/idle/intel_idle.c | 50 ++++++++++++------------ - drivers/input/serio/i8042.c | 10 ++--- - drivers/net/dummy.c | 2 +- - drivers/pci/pci.c | 2 +- - drivers/powercap/intel_rapl_common.c | 2 +- - drivers/thermal/intel/intel_powerclamp.c | 10 +++++ - fs/xattr.c | 15 +++---- - include/linux/jbd2.h | 2 +- - include/linux/wait.h | 2 + - include/uapi/linux/if_bonding.h | 2 +- - init/do_mounts.c | 16 +++++++- - kernel/locking/rwsem.c | 4 +- - kernel/sched/wait.c | 24 ++++++++++++ - kernel/watchdog.c | 2 +- - lib/raid6/algos.c | 4 +- - mm/ksm.c | 11 ++++-- - net/ipv4/inet_connection_sock.c | 2 +- - net/ipv4/tcp.c | 4 +- - 21 files changed, 123 insertions(+), 55 deletions(-) + arch/x86/kernel/tsc.c | 3 + + arch/x86/mm/fault.c | 4 +- + drivers/cpufreq/intel_pstate.c | 7 + + drivers/idle/intel_idle.c | 50 ++-- + drivers/input/serio/i8042.c | 10 +- + drivers/net/dummy.c | 2 +- + drivers/pci/pci.c | 2 +- + drivers/powercap/intel_rapl_common.c | 2 +- + drivers/thermal/intel/intel_powerclamp.c | 10 + + fs/xattr.c | 15 +- + include/linux/jbd2.h | 2 +- + include/linux/rcuref.h | 89 +++++++ + include/linux/types.h | 6 + + include/linux/wait.h | 2 + + include/net/dst.h | 21 +- + include/net/sock.h | 2 +- + include/uapi/linux/if_bonding.h | 2 +- + init/do_mounts.c | 16 +- + kernel/locking/rwsem.c | 4 +- + kernel/sched/wait.c | 24 ++ + kernel/watchdog.c | 2 +- + lib/Makefile | 2 +- + lib/raid6/algos.c | 4 +- + lib/rcuref.c | 311 +++++++++++++++++++++++ + mm/ksm.c | 11 +- + net/bridge/br_nf_core.c | 2 +- + net/core/dst.c | 26 +- + net/core/rtnetlink.c | 2 +- + net/ipv4/inet_connection_sock.c | 2 +- + net/ipv4/tcp.c | 4 +- + net/ipv6/route.c | 6 +- + net/netfilter/ipvs/ip_vs_xmit.c | 4 +- + 32 files changed, 559 insertions(+), 90 deletions(-) + create mode 100644 include/linux/rcuref.h + create mode 100644 lib/rcuref.c diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index a78e73da4a74..bab8a98080cf 100644 @@ -10310,10 +10329,10 @@ index fd73d6d2b808..0c0071ab3966 100644 if (max_highest_perf <= min_highest_perf) { if (cppc_perf.highest_perf > max_highest_perf) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c -index cfeb24d40d37..8d1945afa973 100644 +index f060ac7376e6..1cd277c8f77f 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c -@@ -578,7 +578,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -572,7 +572,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = 
"MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, @@ -10322,7 +10341,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -586,7 +586,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -580,7 +580,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 33, @@ -10331,7 +10350,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -594,7 +594,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -588,7 +588,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, @@ -10340,7 +10359,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -602,7 +602,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -596,7 +596,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, @@ -10349,7 +10368,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -610,7 +610,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -604,7 +604,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, @@ -10358,7 +10377,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -618,7 +618,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -612,7 +612,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, @@ -10367,7 +10386,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -626,7 +626,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { +@@ -620,7 +620,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, @@ -10376,7 +10395,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -646,7 +646,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -640,7 +640,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, @@ -10385,7 +10404,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -654,7 +654,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -648,7 +648,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 40, @@ -10394,7 +10413,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -662,7 +662,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -656,7 +656,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 133, @@ -10403,7 +10422,7 @@ index cfeb24d40d37..8d1945afa973 100644 
.enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -670,7 +670,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -664,7 +664,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x32", .flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 166, @@ -10412,7 +10431,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -678,7 +678,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -672,7 +672,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 300, @@ -10421,7 +10440,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -686,7 +686,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -680,7 +680,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 600, @@ -10430,7 +10449,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -694,7 +694,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { +@@ -688,7 +688,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 2600, @@ -10439,7 +10458,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -715,7 +715,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -709,7 +709,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, @@ -10448,7 +10467,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -723,7 +723,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -717,7 +717,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x10", .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 70, @@ -10457,7 +10476,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -731,7 +731,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -725,7 +725,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 85, @@ -10466,7 +10485,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -739,7 +739,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -733,7 +733,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x33", .flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 124, @@ -10475,7 +10494,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -747,7 +747,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -741,7 +741,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x40", .flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 200, @@ -10484,7 +10503,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -755,7 +755,7 @@ static 
struct cpuidle_state skl_cstates[] __initdata = { +@@ -749,7 +749,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x50", .flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 480, @@ -10493,7 +10512,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -763,7 +763,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { +@@ -757,7 +757,7 @@ static struct cpuidle_state skl_cstates[] __initdata = { .desc = "MWAIT 0x60", .flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED | CPUIDLE_FLAG_IBRS, .exit_latency = 890, @@ -10502,7 +10521,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -784,7 +784,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { +@@ -778,7 +778,7 @@ static struct cpuidle_state skx_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 10, @@ -10511,7 +10530,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -813,7 +813,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { +@@ -807,7 +807,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 4, @@ -10520,7 +10539,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -821,7 +821,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { +@@ -815,7 +815,7 @@ static struct cpuidle_state icx_cstates[] __initdata = { .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, .exit_latency = 170, @@ -10529,7 +10548,7 @@ index cfeb24d40d37..8d1945afa973 100644 .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { -@@ -987,7 +987,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { +@@ -981,7 +981,7 @@ static struct cpuidle_state spr_cstates[] __initdata = { .desc = "MWAIT 0x01", .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE, .exit_latency = 2, @@ -10601,7 +10620,7 @@ index c4b1b0aa438a..06b00f7a8eab 100644 /* fake multicast ability */ static void set_multicast_list(struct net_device *dev) diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c -index 5641786bd020..0ef504e909db 100644 +index 7a67611dc5f4..48b350fe09d8 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -62,7 +62,7 @@ struct pci_pme_device { @@ -10627,10 +10646,10 @@ index 26d00b1853b4..3e239d6548b5 100644 return -ENODEV; diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c -index b80e25ec1261..187b4ee6e9f5 100644 +index 2f4cbfdf26a0..2d297a1cfa34 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c -@@ -627,6 +627,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { +@@ -636,6 +636,11 @@ static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { .set_cur_state = powerclamp_set_cur_state, }; @@ -10642,7 +10661,7 @@ index b80e25ec1261..187b4ee6e9f5 100644 static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), {} -@@ -636,6 +641,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); +@@ -645,6 +650,11 @@ MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); static int __init powerclamp_probe(void) { @@ -10696,6 +10715,118 @@ index 
2170e0cc279d..e8fa79f5bb34 100644 #ifdef CONFIG_JBD2_DEBUG /* +diff --git a/include/linux/rcuref.h b/include/linux/rcuref.h +new file mode 100644 +index 000000000000..57ffb3c02ace +--- /dev/null ++++ b/include/linux/rcuref.h +@@ -0,0 +1,89 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++#ifndef _LINUX_RCUREF_H ++#define _LINUX_RCUREF_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define RCUREF_NOREF 0x00000000 ++#define RCUREF_ONEREF 0x00000001 ++#define RCUREF_MAXREF 0x7FFFFFFF ++#define RCUREF_SATURATED 0xA0000000 ++#define RCUREF_RELEASED 0xC0000000 ++#define RCUREF_DEAD 0xE0000000 ++ ++/** ++ * rcuref_init - Initialize a rcuref reference count with the given reference count ++ * @ref: Pointer to the reference count ++ * @cnt: The initial reference count typically '1' ++ */ ++static inline void rcuref_init(rcuref_t *ref, unsigned int cnt) ++{ ++ atomic_set(&ref->refcnt, cnt); ++} ++ ++/** ++ * rcuref_read - Read the number of held reference counts of a rcuref ++ * @ref: Pointer to the reference count ++ * ++ * Return: The number of held references (0 ... N) ++ */ ++static inline unsigned int rcuref_read(rcuref_t *ref) ++{ ++ unsigned int c = atomic_read(&ref->refcnt); ++ ++ /* Return 0 if within the DEAD zone. */ ++ return c >= RCUREF_RELEASED ? 0 : c; ++} ++ ++extern __must_check bool rcuref_get_slowpath(rcuref_t *ref, unsigned int new); ++ ++/** ++ * rcuref_get - Acquire one reference on a rcuref reference count ++ * @ref: Pointer to the reference count ++ * ++ * Similar to atomic_inc_not_zero() but saturates at RCUREF_MAXREF. ++ * ++ * Provides no memory ordering, it is assumed the caller has guaranteed the ++ * object memory to be stable (RCU, etc.). It does provide a control dependency ++ * and thereby orders future stores. See documentation in lib/rcuref.c ++ * ++ * Return: ++ * False if the attempt to acquire a reference failed. This happens ++ * when the last reference has been put already ++ * ++ * True if a reference was successfully acquired ++ */ ++static inline __must_check bool rcuref_get(rcuref_t *ref) ++{ ++ /* ++ * Unconditionally increase the reference count. The saturation and ++ * dead zones provide enough tolerance for this. ++ */ ++ unsigned int old = atomic_fetch_add_relaxed(1, &ref->refcnt); ++ ++ /* ++ * If the old value is less than RCUREF_MAXREF, this is a valid ++ * reference. ++ * ++ * In case the original value was RCUREF_NOREF the above ++ * unconditional increment raced with a concurrent put() operation ++ * dropping the last reference. That racing put() operation ++ * subsequently fails to mark the reference count dead because the ++ * count is now elevated again and the concurrent caller is ++ * therefore not allowed to deconstruct the object. 
++ */ ++ if (likely(old < RCUREF_MAXREF)) ++ return true; ++ ++ /* Handle the cases inside the saturation and dead zones */ ++ return rcuref_get_slowpath(ref, old); ++} ++ ++extern __must_check bool rcuref_put(rcuref_t *ref); ++ ++#endif +diff --git a/include/linux/types.h b/include/linux/types.h +index ea8cf60a8a79..419baa980529 100644 +--- a/include/linux/types.h ++++ b/include/linux/types.h +@@ -175,6 +175,12 @@ typedef struct { + } atomic64_t; + #endif + ++typedef struct { ++ atomic_t refcnt; ++} rcuref_t; ++ ++#define RCUREF_INIT(i) { .refcnt = ATOMIC_INIT(i) } ++ + struct list_head { + struct list_head *next, *prev; + }; diff --git a/include/linux/wait.h b/include/linux/wait.h index a0307b516b09..edc21128f387 100644 --- a/include/linux/wait.h @@ -10716,6 +10847,82 @@ index a0307b516b09..edc21128f387 100644 long prepare_to_wait_event(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry, int state); void finish_wait(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); long wait_woken(struct wait_queue_entry *wq_entry, unsigned mode, long timeout); +diff --git a/include/net/dst.h b/include/net/dst.h +index d67fda89cd0f..0909a3306902 100644 +--- a/include/net/dst.h ++++ b/include/net/dst.h +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -65,19 +66,29 @@ struct dst_entry { + * input/output/ops or performance tanks badly + */ + #ifdef CONFIG_64BIT +- atomic_t __refcnt; /* 64-bit offset 64 */ ++ rcuref_t __refcnt; /* 64-bit offset 64 */ + #endif + int __use; + unsigned long lastuse; +- struct lwtunnel_state *lwtstate; + struct rcu_head rcu_head; + short error; + short __pad; + __u32 tclassid; + #ifndef CONFIG_64BIT +- atomic_t __refcnt; /* 32-bit offset 64 */ ++ struct lwtunnel_state *lwtstate; ++ rcuref_t __refcnt; /* 32-bit offset 64 */ + #endif + netdevice_tracker dev_tracker; ++#ifdef CONFIG_64BIT ++ /* ++ * Ensure that lwtstate is not in the same cache line as __refcnt, ++ * because that would lead to false sharing under high contention ++ * of __refcnt. This also ensures that rtable::rt_genid is not ++ * sharing the same cache-line. 
++ */ ++ int pad2[6]; ++ struct lwtunnel_state *lwtstate; ++#endif + }; + + struct dst_metrics { +@@ -228,7 +239,7 @@ static inline void dst_hold(struct dst_entry *dst) + * the placement of __refcnt in struct dst_entry + */ + BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63); +- WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0); ++ WARN_ON(!rcuref_get(&dst->__refcnt)); + } + + static inline void dst_use_noref(struct dst_entry *dst, unsigned long time) +@@ -292,7 +303,7 @@ static inline void skb_dst_copy(struct sk_buff *nskb, const struct sk_buff *oskb + */ + static inline bool dst_hold_safe(struct dst_entry *dst) + { +- return atomic_inc_not_zero(&dst->__refcnt); ++ return rcuref_get(&dst->__refcnt); + } + + /** +diff --git a/include/net/sock.h b/include/net/sock.h +index c6584a352463..dbf85161c0c7 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -2159,7 +2159,7 @@ sk_dst_get(struct sock *sk) + + rcu_read_lock(); + dst = rcu_dereference(sk->sk_dst_cache); +- if (dst && !atomic_inc_not_zero(&dst->__refcnt)) ++ if (dst && !rcuref_get(&dst->__refcnt)) + dst = NULL; + rcu_read_unlock(); + return dst; diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index d174914a837d..bf8e2af101a3 100644 --- a/include/uapi/linux/if_bonding.h @@ -10765,10 +10972,10 @@ index 811e94daf0a8..06fef7f97c02 100644 md_run_setup(); diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c -index 44873594de03..fe62d59f2bdc 100644 +index 84d5b649b95f..e341ca8731f7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c -@@ -755,6 +755,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) +@@ -754,6 +754,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; @@ -10776,7 +10983,7 @@ index 44873594de03..fe62d59f2bdc 100644 lockdep_assert_preemption_disabled(); -@@ -791,7 +792,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) +@@ -790,7 +791,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) break; } @@ -10841,6 +11048,19 @@ index 8e61f21e7e33..be1439d38f26 100644 static int __read_mostly nmi_watchdog_available; struct cpumask watchdog_cpumask __read_mostly; +diff --git a/lib/Makefile b/lib/Makefile +index 4d9461bfea42..71c9627153b8 100644 +--- a/lib/Makefile ++++ b/lib/Makefile +@@ -47,7 +47,7 @@ obj-y += bcd.o sort.o parser.o debug_locks.o random32.o \ + list_sort.o uuid.o iov_iter.o clz_ctz.o \ + bsearch.o find_bit.o llist.o memweight.o kfifo.o \ + percpu-refcount.o rhashtable.o base64.o \ +- once.o refcount.o usercopy.o errseq.o bucket_locks.o \ ++ once.o refcount.o rcuref.o usercopy.o errseq.o bucket_locks.o \ + generic-radix-tree.o + obj-$(CONFIG_STRING_SELFTEST) += test_string.o + obj-y += string_helpers.o diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c index a22a05c9af8a..a70bcbbd1673 100644 --- a/lib/raid6/algos.c @@ -10857,6 +11077,323 @@ index a22a05c9af8a..a70bcbbd1673 100644 if (best) { raid6_2data_recov = best->data2; +diff --git a/lib/rcuref.c b/lib/rcuref.c +new file mode 100644 +index 000000000000..34fa40618fca +--- /dev/null ++++ b/lib/rcuref.c +@@ -0,0 +1,311 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++ ++/* ++ * rcuref - A scalable reference count implementation for RCU managed objects ++ * ++ * rcuref is provided to replace open coded reference count implementations ++ * based on atomic_t. It protects explicitely RCU managed objects which can ++ * be visible even after the last reference has been dropped and the object ++ * is heading towards destruction. 
++ * ++ * A common usage pattern is: ++ * ++ * get() ++ * rcu_read_lock(); ++ * p = get_ptr(); ++ * if (p && !atomic_inc_not_zero(&p->refcnt)) ++ * p = NULL; ++ * rcu_read_unlock(); ++ * return p; ++ * ++ * put() ++ * if (!atomic_dec_return(&->refcnt)) { ++ * remove_ptr(p); ++ * kfree_rcu((p, rcu); ++ * } ++ * ++ * atomic_inc_not_zero() is implemented with a try_cmpxchg() loop which has ++ * O(N^2) behaviour under contention with N concurrent operations. ++ * ++ * rcuref uses atomic_fetch_add_relaxed() and atomic_fetch_sub_release() ++ * for the fast path, which scale better under contention. ++ * ++ * Why not refcount? ++ * ================= ++ * ++ * In principle it should be possible to make refcount use the rcuref ++ * scheme, but the destruction race described below cannot be prevented ++ * unless the protected object is RCU managed. ++ * ++ * Theory of operation ++ * =================== ++ * ++ * rcuref uses an unsigned integer reference counter. As long as the ++ * counter value is greater than or equal to RCUREF_ONEREF and not larger ++ * than RCUREF_MAXREF the reference is alive: ++ * ++ * NOREF ONEREF MAXREF SATURATED RELEASED DEAD ++ * 0 1 0x7FFFFFFF 0x8000000 0xA0000000 0xBFFFFFFF 0xC0000000 0xE0000000 0xFFFFFFFF ++ * <---valid ------------> <-------saturation zone-------> <-----------dead zone----------> ++ * ++ * The get() and put() operations do unconditional increments and ++ * decrements. The result is checked after the operation. This optimizes ++ * for the fast path. ++ * ++ * If the reference count is saturated or dead, then the increments and ++ * decrements are not harmful as the reference count still stays in the ++ * respective zones and is always set back to STATURATED resp. DEAD. The ++ * zones have room for 2^28 racing operations in each direction, which ++ * makes it practically impossible to escape the zones. ++ * ++ * Once the last reference is dropped the reference count becomes ++ * RCUREF_NOREF which forces rcuref_put() into the slowpath operation. The ++ * slowpath then tries to set the reference count from RCUREF_NOREF to ++ * RCUREF_DEAD via a cmpxchg(). This opens a small window where a ++ * concurrent rcuref_get() can acquire the reference count and bring it ++ * back to RCUREF_ONEREF or even drop the reference again and mark it DEAD. ++ * ++ * If the cmpxchg() succeeds then a concurrent rcuref_get() will result in ++ * DEAD + 1, which is inside the dead zone. If that happens the reference ++ * count is put back to DEAD. ++ * ++ * The actual race is possible due to the unconditional increment and ++ * decrements in rcuref_get() and rcuref_put(): ++ * ++ * T1 T2 ++ * get() put() ++ * if (atomic_fetch_sub(1, &ref->refcnt) >= 0) ++ * succeeds-> atomic_try_cmpxchg(&ref->refcnt, -1, DEAD); ++ * ++ * old = atomic_fetch_add(1, &ref->refcnt); <- Elevates refcount to DEAD + 1 ++ * ++ * As @old observed by T1 is within the dead zone the T1 get() fails. ++ * ++ * Possible critical states: ++ * ++ * Context Counter References Operation ++ * T1 1 1 init() ++ * T2 2 2 get() ++ * T1 1 1 put() ++ * T2 0 0 put() tries to mark dead ++ * T1 1 1 get() ++ * T2 1 1 put() mark dead fails ++ * T1 0 0 put() tries to mark dead ++ * T1 DEAD 0 put() mark dead succeeds ++ * T2 DEAD+1 0 get() fails and puts it back to DEAD ++ * ++ * Of course there are more complex scenarios, but the above illustrates ++ * the working principle. The rest is left to the imagination of the ++ * reader. 
++ * ++ * Deconstruction race ++ * =================== ++ * ++ * The release operation must be protected by prohibiting a grace period in ++ * order to prevent a possible use after free: ++ * ++ * T1 T2 ++ * put() get() ++ * // ref->refcnt = ONEREF ++ * if (atomic_fetch_sub(1, &ref->cnt) > ONEREF) ++ * return false; <- Not taken ++ * ++ * // ref->refcnt == NOREF ++ * --> preemption ++ * // Elevates ref->c to ONEREF ++ * if (!atomic_fetch_add(1, &ref->refcnt) >= NOREF) ++ * return true; <- taken ++ * ++ * if (put(&p->ref)) { <-- Succeeds ++ * remove_pointer(p); ++ * kfree_rcu(p, rcu); ++ * } ++ * ++ * RCU grace period ends, object is freed ++ * ++ * atomic_cmpxchg(&ref->refcnt, NONE, DEAD); <- UAF ++ * ++ * This is prevented by disabling preemption around the put() operation as ++ * that's in most kernel configurations cheaper than a rcu_read_lock() / ++ * rcu_read_unlock() pair and in many cases even a NOOP. In any case it ++ * prevents the grace period which keeps the object alive until all put() ++ * operations complete. ++ * ++ * Saturation protection ++ * ===================== ++ * ++ * The reference count has a saturation limit RCUREF_MAXREF (INT_MAX). ++ * Once this is exceedded the reference count becomes stale by setting it ++ * to RCUREF_SATURATED, which will cause a memory leak, but it prevents ++ * wrap arounds which obviously cause worse problems than a memory ++ * leak. When saturation is reached a warning is emitted. ++ * ++ * Race conditions ++ * =============== ++ * ++ * All reference count increment/decrement operations are unconditional and ++ * only verified after the fact. This optimizes for the good case and takes ++ * the occasional race vs. a dead or already saturated refcount into ++ * account. The saturation and dead zones are large enough to accomodate ++ * for that. ++ * ++ * Memory ordering ++ * =============== ++ * ++ * Memory ordering rules are slightly relaxed wrt regular atomic_t functions ++ * and provide only what is strictly required for refcounts. ++ * ++ * The increments are fully relaxed; these will not provide ordering. The ++ * rationale is that whatever is used to obtain the object to increase the ++ * reference count on will provide the ordering. For locked data ++ * structures, its the lock acquire, for RCU/lockless data structures its ++ * the dependent load. ++ * ++ * rcuref_get() provides a control dependency ordering future stores which ++ * ensures that the object is not modified when acquiring a reference ++ * fails. ++ * ++ * rcuref_put() provides release order, i.e. all prior loads and stores ++ * will be issued before. It also provides a control dependency ordering ++ * against the subsequent destruction of the object. ++ * ++ * If rcuref_put() successfully dropped the last reference and marked the ++ * object DEAD it also provides acquire ordering. ++ */ ++ ++#include ++#include ++ ++/** ++ * rcuref_get_slowpath - Slowpath of rcuref_get() ++ * @ref: Pointer to the reference count ++ * @old: The reference count before the unconditional increment ++ * operation in rcuref_get() ++ * ++ * Invoked when the reference count is outside of the valid zone. ++ * ++ * Return: ++ * False if the reference count was already marked dead ++ * ++ * True if the reference count is saturated, which prevents the ++ * object from being deconstructed ever. 
++ */ ++bool rcuref_get_slowpath(rcuref_t *ref, unsigned int old) ++{ ++ /* ++ * If the reference count was already marked dead, undo the ++ * increment so it stays in the middle of the dead zone and return ++ * fail. ++ */ ++ if (old >= RCUREF_RELEASED) { ++ atomic_set(&ref->refcnt, RCUREF_DEAD); ++ return false; ++ } ++ ++ /* ++ * If it was saturated, warn and mark it so. In case the increment ++ * was already on a saturated value restore the saturation ++ * marker. This keeps it in the middle of the saturation zone and ++ * prevents the reference count from overflowing. This leaks the ++ * object memory, but prevents the obvious reference count overflow ++ * damage. ++ */ ++ WARN_ONCE(old >= RCUREF_MAXREF, "rcuref saturated - leaking memory"); ++ atomic_set(&ref->refcnt, RCUREF_SATURATED); ++ return true; ++} ++EXPORT_SYMBOL_GPL(rcuref_get_slowpath); ++ ++static __must_check bool __rcuref_put(rcuref_t *ref) ++{ ++ /* ++ * Unconditionally decrement the reference count. The saturation and ++ * dead zones provide enough tolerance for this. ++ */ ++ unsigned int old = atomic_fetch_sub_release(1, &ref->refcnt); ++ ++ /* ++ * If the old value is in the valid range and is greater than ++ * RCUREF_ONEREF, nothing to do. ++ */ ++ if (likely(old > RCUREF_ONEREF && old <= RCUREF_MAXREF)) ++ return false; ++ ++ /* Did this drop the last reference? */ ++ if (likely(old == RCUREF_ONEREF)) { ++ /* ++ * Carefully try to set the reference count to RCUREF_DEAD. ++ * ++ * This can fail if a concurrent get() operation has ++ * elevated it again or the corresponding put() even marked ++ * it dead already. Both are valid situations and do not ++ * require a retry. If this fails the caller is not ++ * allowed to deconstruct the object. ++ */ ++ if (atomic_cmpxchg_release(&ref->refcnt, RCUREF_NOREF, RCUREF_DEAD) != RCUREF_NOREF) ++ return false; ++ ++ /* ++ * The caller can safely schedule the object for ++ * deconstruction. Provide acquire ordering. ++ */ ++ smp_acquire__after_ctrl_dep(); ++ return true; ++ } ++ ++ /* ++ * If the reference count was already in the dead zone, then this ++ * put() operation is imbalanced. Warn, put the reference count back to ++ * DEAD and tell the caller to not deconstruct the object. ++ */ ++ if (WARN_ONCE(old >= RCUREF_RELEASED, "rcuref - imbalanced put()")) { ++ atomic_set(&ref->refcnt, RCUREF_DEAD); ++ return false; ++ } ++ ++ /* ++ * This is a put() operation on a saturated refcount. Restore the ++ * mean saturation value and tell the caller to not deconstruct the ++ * object. ++ */ ++ atomic_set(&ref->refcnt, RCUREF_SATURATED); ++ return false; ++} ++ ++/** ++ * rcuref_put -- Release one reference for a rcuref reference count ++ * @ref: Pointer to the reference count ++ * ++ * Can be invoked from any context. ++ * ++ * Provides release memory ordering, such that prior loads and stores are done ++ * before, and provides an acquire ordering on success such that free() ++ * must come after. ++ * ++ * Return: ++ * ++ * True if this was the last reference with no future references ++ * possible. This signals the caller that it can safely schedule the ++ * object, which is protected by the reference counter, for ++ * deconstruction. ++ * ++ * False if there are still active references or the put() raced ++ * with a concurrent get()/put() pair. Caller is not allowed to ++ * deconstruct the protected object. 
++ */ ++bool rcuref_put(rcuref_t *ref) ++{ ++ bool released; ++ ++ /* ++ * Protect against a concurrent get()/put() pair which marks the ++ * reference count DEAD and schedules it for RCU free. This ++ * prevents a grace period and is cheaper than ++ * rcu_read_lock()/unlock(). ++ */ ++ preempt_disable(); ++ released = __rcuref_put(ref); ++ preempt_enable(); ++ return released; ++} ++EXPORT_SYMBOL_GPL(rcuref_put); diff --git a/mm/ksm.c b/mm/ksm.c index addf490da146..a92c9594a2d3 100644 --- a/mm/ksm.c @@ -10879,6 +11416,81 @@ index addf490da146..a92c9594a2d3 100644 } else { wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); +diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c +index 8c69f0c95a8e..c2b628e3cc7f 100644 +--- a/net/bridge/br_nf_core.c ++++ b/net/bridge/br_nf_core.c +@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net_bridge *br) + { + struct rtable *rt = &br->fake_rtable; + +- atomic_set(&rt->dst.__refcnt, 1); ++ rcuref_init(&rt->dst.__refcnt, 1); + rt->dst.dev = br->dev; + dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; +diff --git a/net/core/dst.c b/net/core/dst.c +index 6d2dd03dafa8..750440803883 100644 +--- a/net/core/dst.c ++++ b/net/core/dst.c +@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, + dst->tclassid = 0; + #endif + dst->lwtstate = NULL; +- atomic_set(&dst->__refcnt, initial_ref); ++ rcuref_init(&dst->__refcnt, initial_ref); + dst->__use = 0; + dst->lastuse = jiffies; + dst->flags = flags; +@@ -166,31 +166,15 @@ EXPORT_SYMBOL(dst_dev_put); + + void dst_release(struct dst_entry *dst) + { +- if (dst) { +- int newrefcnt; +- +- newrefcnt = atomic_dec_return(&dst->__refcnt); +- if (WARN_ONCE(newrefcnt < 0, "dst_release underflow")) +- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", +- __func__, dst, newrefcnt); +- if (!newrefcnt) +- call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); +- } ++ if (dst && rcuref_put(&dst->__refcnt)) ++ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu); + } + EXPORT_SYMBOL(dst_release); + + void dst_release_immediate(struct dst_entry *dst) + { +- if (dst) { +- int newrefcnt; +- +- newrefcnt = atomic_dec_return(&dst->__refcnt); +- if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow")) +- net_warn_ratelimited("%s: dst:%p refcnt:%d\n", +- __func__, dst, newrefcnt); +- if (!newrefcnt) +- dst_destroy(dst); +- } ++ if (dst && rcuref_put(&dst->__refcnt)) ++ dst_destroy(dst); + } + EXPORT_SYMBOL(dst_release_immediate); + +diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c +index 64289bc98887..228c54bbdecc 100644 +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -840,7 +840,7 @@ int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, + if (dst) { + ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse); + ci.rta_used = dst->__use; +- ci.rta_clntref = atomic_read(&dst->__refcnt); ++ ci.rta_clntref = rcuref_read(&dst->__refcnt); + } + if (expires) { + unsigned long clock; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f2c43f67187d..9885bfb429a2 100644 --- a/net/ipv4/inet_connection_sock.c @@ -10907,13 +11519,66 @@ index e9e8040d6491..f9b56123b3b8 100644 init_net.ipv4.sysctl_tcp_wmem[0] = PAGE_SIZE; init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024; +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index a6983a13dd20..8b5e3d57b08d 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -293,7 +293,7 @@ static const struct fib6_info 
fib6_null_entry_template = { + + static const struct rt6_info ip6_null_entry_template = { + .dst = { +- .__refcnt = ATOMIC_INIT(1), ++ .__refcnt = RCUREF_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -ENETUNREACH, +@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_entry_template = { + + static const struct rt6_info ip6_prohibit_entry_template = { + .dst = { +- .__refcnt = ATOMIC_INIT(1), ++ .__refcnt = RCUREF_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -EACCES, +@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibit_entry_template = { + + static const struct rt6_info ip6_blk_hole_entry_template = { + .dst = { +- .__refcnt = ATOMIC_INIT(1), ++ .__refcnt = RCUREF_INIT(1), + .__use = 1, + .obsolete = DST_OBSOLETE_FORCE_CHK, + .error = -EINVAL, +diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c +index 029171379884..bc9dc51828f7 100644 +--- a/net/netfilter/ipvs/ip_vs_xmit.c ++++ b/net/netfilter/ipvs/ip_vs_xmit.c +@@ -339,7 +339,7 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n", + &dest->addr.ip, &dest_dst->dst_saddr.ip, +- atomic_read(&rt->dst.__refcnt)); ++ rcuref_read(&rt->dst.__refcnt)); + } + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.ip; +@@ -507,7 +507,7 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, + spin_unlock_bh(&dest->dst_lock); + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", + &dest->addr.in6, &dest_dst->dst_saddr.in6, +- atomic_read(&rt->dst.__refcnt)); ++ rcuref_read(&rt->dst.__refcnt)); + } + if (ret_saddr) + *ret_saddr = dest_dst->dst_saddr.in6; -- -2.39.2 +2.40.0.rc2 -From ca9964f0e4522dd46497aaa1736c860ebff85d2e Mon Sep 17 00:00:00 2001 +From ed2979f1636e3197b42234c8acac4d20f4e2ed8e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 25 Feb 2023 14:41:15 +0100 -Subject: [PATCH 06/15] fixes +Date: Fri, 10 Mar 2023 18:03:29 +0100 +Subject: [PATCH 06/16] fixes Signed-off-by: Peter Jung --- @@ -10923,41 +11588,41 @@ Signed-off-by: Peter Jung Documentation/leds/index.rst | 1 + Documentation/leds/ledtrig-blkdev.rst | 158 +++ arch/x86/boot/compressed/Makefile | 2 +- - arch/x86/kernel/acpi/boot.c | 19 +- + arch/x86/events/rapl.c | 20 +- + arch/x86/kernel/cpu/amd.c | 9 + arch/x86/mm/tlb.c | 2 +- - drivers/acpi/acpica/Makefile | 2 +- - drivers/bluetooth/btusb.c | 9 + - drivers/char/tpm/tpm-chip.c | 62 +- + arch/x86/net/bpf_jit_comp.c | 5 +- + drivers/bluetooth/btusb.c | 2 +- + drivers/char/tpm/tpm-chip.c | 60 +- drivers/char/tpm/tpm.h | 73 + - drivers/hwmon/nct6775-core.c | 2 +- drivers/leds/trigger/Kconfig | 9 + drivers/leds/trigger/Makefile | 1 + - drivers/leds/trigger/ledtrig-blkdev.c | 1220 +++++++++++++++++ + drivers/leds/trigger/ledtrig-blkdev.c | 1221 +++++++++++++++++ fs/eventpoll.c | 2 +- - fs/nfsd/filecache.c | 44 +- - fs/nfsd/trace.h | 31 - fs/proc/base.c | 1 + include/linux/mm_types.h | 7 +- include/linux/pageblock-flags.h | 2 +- + kernel/kheaders.c | 10 +- kernel/kthread.c | 5 + kernel/padata.c | 4 +- lib/string.c | 10 +- lib/zstd/decompress/huf_decompress.c | 2 +- mm/compaction.c | 75 +- mm/internal.h | 6 +- - mm/ksm.c | 185 ++- + mm/ksm.c | 196 ++- + mm/page_alloc.c | 22 +- mm/z3fold.c | 2 - mm/zsmalloc.c | 3 - scripts/Makefile.vmlinux_o | 2 +- sound/pci/hda/cs35l41_hda.c | 2 +- .../selftests/vm/ksm_functional_tests.c | 96 +- - 34 files changed, 1975 insertions(+), 159 deletions(-) + 34 files changed, 
1995 insertions(+), 110 deletions(-) create mode 100644 Documentation/ABI/testing/sysfs-class-led-trigger-blkdev create mode 100644 Documentation/leds/ledtrig-blkdev.rst create mode 100644 drivers/leds/trigger/ledtrig-blkdev.c diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block -index cd14ecb3c9a5..853cb2601242 100644 +index cd14ecb3c9a5..ad47337ac75a 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -101,6 +101,16 @@ Description: @@ -10965,7 +11630,7 @@ index cd14ecb3c9a5..853cb2601242 100644 +What: /sys/block//linked_leds -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Directory that contains symbolic links to all LEDs that @@ -10979,19 +11644,19 @@ index cd14ecb3c9a5..853cb2601242 100644 Contact: Martin K. Petersen diff --git a/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev new file mode 100644 -index 000000000000..45275eb0bad3 +index 000000000000..28ce8c814fb7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-led-trigger-blkdev @@ -0,0 +1,78 @@ +What: /sys/class/leds//blink_time -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Time (in milliseconds) that the LED will be on during a single + "blink". + +What: /sys/class/leds//check_interval -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Interval (in milliseconds) between checks of the block devices @@ -11001,35 +11666,35 @@ index 000000000000..45275eb0bad3 + check. + +What: /sys/class/leds//blink_on_read -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to read activity on any of its linked block devices. + +What: /sys/class/leds//blink_on_write -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to write activity on any of its linked block devices. + +What: /sys/class/leds//blink_on_discard -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to discard activity on any of its linked block devices. + +What: /sys/class/leds//blink_on_flush -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Boolean that determines whether the LED will blink in response + to cache flush activity on any of its linked block devices. + +What: /sys/class/leds//link_dev_by_path -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Associate a block device with this LED by writing the path to @@ -11037,7 +11702,7 @@ index 000000000000..45275eb0bad3 + Symbolic links are followed. + +What: /sys/class/leds//unlink_dev_by_path -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Remove the association between this LED and a block device by @@ -11045,7 +11710,7 @@ index 000000000000..45275eb0bad3 + this attribute. Symbolic links are followed. + +What: /sys/class/leds//unlink_dev_by_name -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Remove the association between this LED and a block device by @@ -11053,7 +11718,7 @@ index 000000000000..45275eb0bad3 + attribute. 
+ +What: /sys/class/leds//linked_devices -+Date: October 2022 ++Date: January 2023 +Contact: Ian Pilcher +Description: + Directory containing links to all block devices that are @@ -11268,50 +11933,84 @@ index d995595394bb..19d1fb601796 100644 KBUILD_CFLAGS += -include $(srctree)/include/linux/hidden.h # sev.c indirectly inludes inat-table.h which is generated during -diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c -index 907cc98b1938..518bda50068c 100644 ---- a/arch/x86/kernel/acpi/boot.c -+++ b/arch/x86/kernel/acpi/boot.c -@@ -188,6 +188,17 @@ static int acpi_register_lapic(int id, u32 acpiid, u8 enabled) - return cpu; +diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c +index 52e6e7ed4f78..f000cc16d128 100644 +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -343,14 +343,15 @@ static int rapl_pmu_event_init(struct perf_event *event) + if (event->cpu < 0) + return -EINVAL; + +- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; +- + if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) + return -EINVAL; + + cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); + bit = cfg - 1; + ++ if (bit != PERF_RAPL_PP0) ++ event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; ++ + /* check event supported */ + if (!(rapl_cntr_mask & (1 << bit))) + return -EINVAL; +@@ -363,7 +364,15 @@ static int rapl_pmu_event_init(struct perf_event *event) + pmu = cpu_to_rapl_pmu(event->cpu); + if (!pmu) + return -EINVAL; +- event->cpu = pmu->cpu; ++ ++ /* ++ * FIXME: RAPL PMU considers events are uncore and MSRs can be read from ++ * the first available CPU of the die. But this is not true for energy-cores ++ * event. Therefore as a workaround don't consider pmu->cpu here for PERF_RAPL_PP0. ++ */ ++ if (event->event_caps & PERF_EV_CAP_READ_ACTIVE_PKG) ++ event->cpu = pmu->cpu; ++ + event->pmu_private = pmu; + event->hw.event_base = rapl_msrs[bit].msr; + event->hw.config = cfg; +@@ -537,7 +546,7 @@ static struct perf_msr intel_rapl_spr_msrs[] = { + * - want to use same event codes across both architectures + */ + static struct perf_msr amd_rapl_msrs[] = { +- [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, 0, false, 0 }, ++ [PERF_RAPL_PP0] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, 0, false, 0 }, + [PERF_RAPL_PP1] = { 0, &rapl_events_gpu_group, 0, false, 0 }, +@@ -764,7 +773,8 @@ static struct rapl_model model_spr = { + }; + + static struct rapl_model model_amd_hygon = { +- .events = BIT(PERF_RAPL_PKG), ++ .events = BIT(PERF_RAPL_PP0) | ++ BIT(PERF_RAPL_PKG), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, + .rapl_msrs = amd_rapl_msrs, + }; +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index f769d6d08b43..06f2ede1544f 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -880,6 +880,15 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) + } + } + #endif ++ /* ++ * Work around Erratum 1386. The XSAVES instruction malfunctions in ++ * certain circumstances on Zen1/2 uarch, and not all parts have had ++ * updated microcode at the time of writing (March 2023). ++ * ++ * Affected parts all have no supervisor XSAVE states, meaning that ++ * the XSAVEC instruction (which works fine) is equivalent. 
++ */ ++ clear_cpu_cap(c, X86_FEATURE_XSAVES); } -+static bool __init acpi_is_processor_usable(u32 lapic_flags) -+{ -+ if (lapic_flags & ACPI_MADT_ENABLED) -+ return true; -+ -+ if (acpi_support_online_capable && (lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) -+ return true; -+ -+ return false; -+} -+ - static int __init - acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) - { -@@ -212,6 +223,10 @@ acpi_parse_x2apic(union acpi_subtable_headers *header, const unsigned long end) - if (apic_id == 0xffffffff) - return 0; - -+ /* don't register processors that cannot be onlined */ -+ if (!acpi_is_processor_usable(processor->lapic_flags)) -+ return 0; -+ - /* - * We need to register disabled CPU as well to permit - * counting disabled CPUs. This allows us to size -@@ -250,9 +265,7 @@ acpi_parse_lapic(union acpi_subtable_headers * header, const unsigned long end) - return 0; - - /* don't register processors that can not be onlined */ -- if (acpi_support_online_capable && -- !(processor->lapic_flags & ACPI_MADT_ENABLED) && -- !(processor->lapic_flags & ACPI_MADT_ONLINE_CAPABLE)) -+ if (!acpi_is_processor_usable(processor->lapic_flags)) - return 0; - - /* + static void init_amd_zn(struct cpuinfo_x86 *c) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c1e31e9a85d7..92d73ccede70 100644 --- a/arch/x86/mm/tlb.c @@ -11325,68 +12024,58 @@ index c1e31e9a85d7..92d73ccede70 100644 __flush_tlb_global(); } else { /* -diff --git a/drivers/acpi/acpica/Makefile b/drivers/acpi/acpica/Makefile -index 9e0d95d76fff..30f3fc13c29d 100644 ---- a/drivers/acpi/acpica/Makefile -+++ b/drivers/acpi/acpica/Makefile -@@ -3,7 +3,7 @@ - # Makefile for ACPICA Core interpreter - # +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index b808be77635e..6e696c6b7018 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -343,9 +343,10 @@ static int emit_call(u8 **pprog, void *func, void *ip) --ccflags-y := -Os -D_LINUX -DBUILDING_ACPICA -+ccflags-y := -D_LINUX -DBUILDING_ACPICA - ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT + static int emit_rsb_call(u8 **pprog, void *func, void *ip) + { ++ void *adjusted_ip; + OPTIMIZER_HIDE_VAR(func); +- x86_call_depth_emit_accounting(pprog, func); +- return emit_patch(pprog, func, ip, 0xE8); ++ adjusted_ip = (u8 *)ip + x86_call_depth_emit_accounting(pprog, func); ++ return emit_patch(pprog, func, adjusted_ip, 0xE8); + } - # use acpi.o to put all files here into acpi.o modparam namespace + static int emit_jump(u8 **pprog, void *func, void *ip) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c -index 2ad4efdd9e40..afd2f08ffe30 100644 +index 18bc94718711..7b9ee86b4609 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c -@@ -64,6 +64,7 @@ static struct usb_driver btusb_driver; - #define BTUSB_INTEL_BROKEN_SHUTDOWN_LED BIT(24) - #define BTUSB_INTEL_BROKEN_INITIAL_NCMD BIT(25) - #define BTUSB_INTEL_NO_WBS_SUPPORT BIT(26) -+#define BTUSB_ACTIONS_SEMI BIT(27) +@@ -912,7 +912,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) + } - static const struct usb_device_id btusb_table[] = { - /* Generic Bluetooth USB device */ -@@ -677,6 +678,9 @@ static const struct usb_device_id blacklist_table[] = { - { USB_DEVICE(0x0cb5, 0xc547), .driver_info = BTUSB_REALTEK | - BTUSB_WIDEBAND_SPEECH }, - -+ /* Actions Semiconductor ATS2851 based devices */ -+ { USB_DEVICE(0x10d7, 0xb012), .driver_info = BTUSB_ACTIONS_SEMI }, -+ - /* Silicon Wave based devices */ - { USB_DEVICE(0x0c10, 0x0000), 
.driver_info = BTUSB_SWAVE }, - -@@ -4098,6 +4102,11 @@ static int btusb_probe(struct usb_interface *intf, - set_bit(BTUSB_USE_ALT3_FOR_WBS, &data->flags); - } - -+ if (id->driver_info & BTUSB_ACTIONS_SEMI) { -+ /* Support is advertised, but not implemented */ -+ set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks); -+ } -+ - if (!reset) - set_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks); + gpiod_set_value_cansleep(reset_gpio, 0); +- msleep(200); ++ usleep_range(USEC_PER_SEC / 2, USEC_PER_SEC); + gpiod_set_value_cansleep(reset_gpio, 1); + return; diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c -index 741d8f3e8fb3..348dd5705fbb 100644 +index 741d8f3e8fb3..c467eeae9973 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c -@@ -512,6 +512,65 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) +@@ -512,6 +512,63 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) return 0; } -+static bool tpm_is_rng_defective(struct tpm_chip *chip) ++/* ++ * Some AMD fTPM versions may cause stutter ++ * https://www.amd.com/en/support/kb/faq/pa-410 ++ * ++ * Fixes are available in two series of fTPM firmware: ++ * 6.x.y.z series: 6.0.18.6 + ++ * 3.x.y.z series: 3.57.y.5 + ++ */ ++static bool tpm_amd_is_rng_defective(struct tpm_chip *chip) +{ -+ int ret; -+ u64 version; + u32 val1, val2; ++ u64 version; ++ int ret; + -+ /* No known-broken TPM1 chips. */ + if (!(chip->flags & TPM_CHIP_FLAG_TPM2)) + return false; + @@ -11394,7 +12083,6 @@ index 741d8f3e8fb3..348dd5705fbb 100644 + if (ret) + return false; + -+ /* Some AMD fTPM versions may cause stutter */ + ret = tpm2_get_tpm_pt(chip, TPM2_PT_MANUFACTURER, &val1, NULL); + if (ret) + goto release; @@ -11406,8 +12094,6 @@ index 741d8f3e8fb3..348dd5705fbb 100644 + if (ret) + goto release; + ret = tpm2_get_tpm_pt(chip, TPM2_PT_FIRMWARE_VERSION_2, &val2, NULL); -+ if (ret) -+ goto release; + +release: + tpm_relinquish_locality(chip); @@ -11416,13 +12102,6 @@ index 741d8f3e8fb3..348dd5705fbb 100644 + return false; + + version = ((u64)val1 << 32) | val2; -+ /* -+ * Fixes for stutter as described in -+ * https://www.amd.com/en/support/kb/faq/pa-410 -+ * are available in two series of fTPM firmware: -+ * 6.x.y.z series: 6.0.18.6 + -+ * 3.x.y.z series: 3.57.x.5 + -+ */ + if ((version >> 48) == 6) { + if (version >= 0x0006000000180006ULL) + return false; @@ -11432,6 +12111,7 @@ index 741d8f3e8fb3..348dd5705fbb 100644 + } else { + return false; + } ++ + dev_warn(&chip->dev, + "AMD fTPM version 0x%llx causes system stutter; hwrng disabled\n", + version); @@ -11442,13 +12122,13 @@ index 741d8f3e8fb3..348dd5705fbb 100644 static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) { struct tpm_chip *chip = container_of(rng, struct tpm_chip, hwrng); -@@ -521,7 +580,8 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) +@@ -521,7 +578,8 @@ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) static int tpm_add_hwrng(struct tpm_chip *chip) { - if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip)) + if (!IS_ENABLED(CONFIG_HW_RANDOM_TPM) || tpm_is_firmware_upgrade(chip) || -+ tpm_is_rng_defective(chip)) ++ tpm_amd_is_rng_defective(chip)) return 0; snprintf(chip->hwrng_name, sizeof(chip->hwrng_name), @@ -11536,19 +12216,6 @@ index 24ee4e1cc452..830014a26609 100644 /* 128 bytes is an arbitrary cap. 
This could be as large as TPM_BUFSIZE - 18 * bytes, but 128 is still a relatively large number of random bytes and -diff --git a/drivers/hwmon/nct6775-core.c b/drivers/hwmon/nct6775-core.c -index da9ec6983e13..c54233f0369b 100644 ---- a/drivers/hwmon/nct6775-core.c -+++ b/drivers/hwmon/nct6775-core.c -@@ -1150,7 +1150,7 @@ static int nct6775_write_fan_div(struct nct6775_data *data, int nr) - if (err) - return err; - reg &= 0x70 >> oddshift; -- reg |= data->fan_div[nr] & (0x7 << oddshift); -+ reg |= (data->fan_div[nr] & 0x7) << oddshift; - return nct6775_write_value(data, fandiv_reg, reg); - } - diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index dc6816d36d06..bda249068182 100644 --- a/drivers/leds/trigger/Kconfig @@ -11578,10 +12245,10 @@ index 25c4db97cdd4..d53bab5d93f1 100644 +obj-$(CONFIG_LEDS_TRIGGER_BLKDEV) += ledtrig-blkdev.o diff --git a/drivers/leds/trigger/ledtrig-blkdev.c b/drivers/leds/trigger/ledtrig-blkdev.c new file mode 100644 -index 000000000000..8614e308fadc +index 000000000000..067eedb003b5 --- /dev/null +++ b/drivers/leds/trigger/ledtrig-blkdev.c -@@ -0,0 +1,1220 @@ +@@ -0,0 +1,1221 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* @@ -12438,7 +13105,7 @@ index 000000000000..8614e308fadc +{ + const struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + -+ return sprintf(buf, "%u\n", READ_ONCE(btl->blink_msec)); ++ return sysfs_emit(buf, "%u\n", READ_ONCE(btl->blink_msec)); +} + +/** @@ -12489,8 +13156,8 @@ index 000000000000..8614e308fadc +{ + struct blkdev_trig_led *btl = led_trigger_get_drvdata(dev); + -+ return sprintf(buf, "%u\n", -+ jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); ++ return sysfs_emit(buf, "%u\n", ++ jiffies_to_msecs(READ_ONCE(btl->check_jiffies))); +} + +/** @@ -12538,7 +13205,8 @@ index 000000000000..8614e308fadc +static int blkdev_trig_mode_show(const struct blkdev_trig_led *led, char *buf, + enum stat_group bit) +{ -+ return sprintf(buf, READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n"); ++ return sysfs_emit(buf, ++ READ_ONCE(led->mode) & (1 << bit) ? "Y\n" : "N\n"); +} + +/** @@ -12815,133 +13483,6 @@ index 64659b110973..8b5ca9f8f4bb 100644 return ret; } -diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c -index c0950edb26b0..697acf5c3c68 100644 ---- a/fs/nfsd/filecache.c -+++ b/fs/nfsd/filecache.c -@@ -331,37 +331,27 @@ nfsd_file_alloc(struct nfsd_file_lookup_key *key, unsigned int may) - return nf; - } - -+/** -+ * nfsd_file_check_write_error - check for writeback errors on a file -+ * @nf: nfsd_file to check for writeback errors -+ * -+ * Check whether a nfsd_file has an unseen error. Reset the write -+ * verifier if so. 
-+ */ - static void --nfsd_file_fsync(struct nfsd_file *nf) --{ -- struct file *file = nf->nf_file; -- int ret; -- -- if (!file || !(file->f_mode & FMODE_WRITE)) -- return; -- ret = vfs_fsync(file, 1); -- trace_nfsd_file_fsync(nf, ret); -- if (ret) -- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); --} -- --static int - nfsd_file_check_write_error(struct nfsd_file *nf) - { - struct file *file = nf->nf_file; - -- if (!file || !(file->f_mode & FMODE_WRITE)) -- return 0; -- return filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err)); -+ if ((file->f_mode & FMODE_WRITE) && -+ filemap_check_wb_err(file->f_mapping, READ_ONCE(file->f_wb_err))) -+ nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); - } - - static void - nfsd_file_hash_remove(struct nfsd_file *nf) - { - trace_nfsd_file_unhash(nf); -- -- if (nfsd_file_check_write_error(nf)) -- nfsd_reset_write_verifier(net_generic(nf->nf_net, nfsd_net_id)); - rhashtable_remove_fast(&nfsd_file_rhash_tbl, &nf->nf_rhash, - nfsd_file_rhash_params); - } -@@ -387,23 +377,12 @@ nfsd_file_free(struct nfsd_file *nf) - this_cpu_add(nfsd_file_total_age, age); - - nfsd_file_unhash(nf); -- -- /* -- * We call fsync here in order to catch writeback errors. It's not -- * strictly required by the protocol, but an nfsd_file could get -- * evicted from the cache before a COMMIT comes in. If another -- * task were to open that file in the interim and scrape the error, -- * then the client may never see it. By calling fsync here, we ensure -- * that writeback happens before the entry is freed, and that any -- * errors reported result in the write verifier changing. -- */ -- nfsd_file_fsync(nf); -- - if (nf->nf_mark) - nfsd_file_mark_put(nf->nf_mark); - if (nf->nf_file) { - get_file(nf->nf_file); - filp_close(nf->nf_file, NULL); -+ nfsd_file_check_write_error(nf); - fput(nf->nf_file); - } - -@@ -1159,6 +1138,7 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct svc_fh *fhp, - out: - if (status == nfs_ok) { - this_cpu_inc(nfsd_file_acquisitions); -+ nfsd_file_check_write_error(nf); - *pnf = nf; - } else { - if (refcount_dec_and_test(&nf->nf_ref)) -diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h -index 8f9c82d9e075..4183819ea082 100644 ---- a/fs/nfsd/trace.h -+++ b/fs/nfsd/trace.h -@@ -1202,37 +1202,6 @@ TRACE_EVENT(nfsd_file_close, - ) - ); - --TRACE_EVENT(nfsd_file_fsync, -- TP_PROTO( -- const struct nfsd_file *nf, -- int ret -- ), -- TP_ARGS(nf, ret), -- TP_STRUCT__entry( -- __field(void *, nf_inode) -- __field(int, nf_ref) -- __field(int, ret) -- __field(unsigned long, nf_flags) -- __field(unsigned char, nf_may) -- __field(struct file *, nf_file) -- ), -- TP_fast_assign( -- __entry->nf_inode = nf->nf_inode; -- __entry->nf_ref = refcount_read(&nf->nf_ref); -- __entry->ret = ret; -- __entry->nf_flags = nf->nf_flags; -- __entry->nf_may = nf->nf_may; -- __entry->nf_file = nf->nf_file; -- ), -- TP_printk("inode=%p ref=%d flags=%s may=%s nf_file=%p ret=%d", -- __entry->nf_inode, -- __entry->nf_ref, -- show_nf_flags(__entry->nf_flags), -- show_nfsd_may_flags(__entry->nf_may), -- __entry->nf_file, __entry->ret -- ) --); -- - #include "cache.h" - - TRACE_DEFINE_ENUM(RC_DROPIT); diff --git a/fs/proc/base.c b/fs/proc/base.c index 9e479d7d202b..ac9ebe972be0 100644 --- a/fs/proc/base.c @@ -12992,6 +13533,40 @@ index 5f1ae07d724b..97cda629c9e9 100644 #endif /* CONFIG_HUGETLB_PAGE */ +diff --git a/kernel/kheaders.c b/kernel/kheaders.c +index 8f69772af77b..42163c9e94e5 100644 +--- a/kernel/kheaders.c ++++ b/kernel/kheaders.c +@@ -26,15 
+26,15 @@ asm ( + " .popsection \n" + ); + +-extern char kernel_headers_data; +-extern char kernel_headers_data_end; ++extern char kernel_headers_data[]; ++extern char kernel_headers_data_end[]; + + static ssize_t + ikheaders_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) + { +- memcpy(buf, &kernel_headers_data + off, len); ++ memcpy(buf, &kernel_headers_data[off], len); + return len; + } + +@@ -48,8 +48,8 @@ static struct bin_attribute kheaders_attr __ro_after_init = { + + static int __init ikheaders_init(void) + { +- kheaders_attr.size = (&kernel_headers_data_end - +- &kernel_headers_data); ++ kheaders_attr.size = (kernel_headers_data_end - ++ kernel_headers_data); + return sysfs_create_bin_file(kernel_kobj, &kheaders_attr); + } + diff --git a/kernel/kthread.c b/kernel/kthread.c index f97fd01a2932..7e6751b29101 100644 --- a/kernel/kthread.c @@ -13244,7 +13819,7 @@ index bcf75a8b032d..21466d0ab22f 100644 }; diff --git a/mm/ksm.c b/mm/ksm.c -index a92c9594a2d3..c267b92b837b 100644 +index a92c9594a2d3..ee60890cf9b1 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -214,6 +214,7 @@ struct ksm_rmap_item { @@ -13399,7 +13974,33 @@ index a92c9594a2d3..c267b92b837b 100644 } return err; } -@@ -2044,6 +2094,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, +@@ -988,9 +1038,15 @@ static int unmerge_and_remove_all_rmap_items(void) + + mm = mm_slot->slot.mm; + mmap_read_lock(mm); ++ ++ /* ++ * Exit right away if mm is exiting to avoid lockdep issue in ++ * the maple tree ++ */ ++ if (ksm_test_exit(mm)) ++ goto mm_exiting; ++ + for_each_vma(vmi, vma) { +- if (ksm_test_exit(mm)) +- break; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + continue; + err = unmerge_ksm_pages(vma, +@@ -999,6 +1055,7 @@ static int unmerge_and_remove_all_rmap_items(void) + goto error; + } + ++mm_exiting: + remove_trailing_rmap_items(&mm_slot->rmap_list); + mmap_read_unlock(mm); + +@@ -2044,6 +2101,42 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, rmap_item->mm->ksm_merging_pages++; } @@ -13442,7 +14043,7 @@ index a92c9594a2d3..c267b92b837b 100644 /* * cmp_and_merge_page - first see if page can be merged into the stable tree; * if not, compare checksum to previous and if it's the same, see if page can -@@ -2055,7 +2141,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, +@@ -2055,7 +2148,6 @@ static void stable_tree_append(struct ksm_rmap_item *rmap_item, */ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { @@ -13450,7 +14051,7 @@ index a92c9594a2d3..c267b92b837b 100644 struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; -@@ -2092,6 +2177,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2092,6 +2184,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } remove_rmap_item_from_tree(rmap_item); @@ -13458,7 +14059,7 @@ index a92c9594a2d3..c267b92b837b 100644 if (kpage) { if (PTR_ERR(kpage) == -EBUSY) -@@ -2128,29 +2214,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2128,29 +2221,16 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite * Same checksum as an empty page. We attempt to merge it with the * appropriate zero page if the user enabled this via sysfs. 
*/ @@ -13495,7 +14096,7 @@ index a92c9594a2d3..c267b92b837b 100644 tree_rmap_item = unstable_tree_search_insert(rmap_item, page, &tree_page); if (tree_rmap_item) { -@@ -2214,23 +2287,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite +@@ -2214,23 +2294,39 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite } } @@ -13541,7 +14142,7 @@ index a92c9594a2d3..c267b92b837b 100644 rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ -@@ -2337,6 +2426,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) +@@ -2337,6 +2433,22 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) } if (is_zone_device_page(*page)) goto next_page; @@ -13564,7 +14165,7 @@ index a92c9594a2d3..c267b92b837b 100644 if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); -@@ -3138,6 +3243,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, +@@ -3138,6 +3250,13 @@ static ssize_t pages_volatile_show(struct kobject *kobj, } KSM_ATTR_RO(pages_volatile); @@ -13578,7 +14179,7 @@ index a92c9594a2d3..c267b92b837b 100644 static ssize_t stable_node_dups_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { -@@ -3193,6 +3305,7 @@ static struct attribute *ksm_attrs[] = { +@@ -3193,6 +3312,7 @@ static struct attribute *ksm_attrs[] = { &pages_sharing_attr.attr, &pages_unshared_attr.attr, &pages_volatile_attr.attr, @@ -13586,6 +14187,60 @@ index a92c9594a2d3..c267b92b837b 100644 &full_scans_attr.attr, #ifdef CONFIG_NUMA &merge_across_nodes_attr.attr, +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 3bb3484563ed..3aec9a6a9cb7 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3119,6 +3119,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + { + unsigned long flags; + int i, allocated = 0; ++ struct list_head *prev_tail = list->prev; ++ struct page *pos, *n; + + spin_lock_irqsave(&zone->lock, flags); + for (i = 0; i < count; ++i) { +@@ -3127,9 +3129,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + if (unlikely(page == NULL)) + break; + +- if (unlikely(check_pcp_refill(page, order))) +- continue; +- + /* + * Split buddy pages returned by expand() are received here in + * physical page order. The page is added to the tail of +@@ -3141,7 +3140,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + * pages are ordered properly. + */ + list_add_tail(&page->pcp_list, list); +- allocated++; + if (is_migrate_cma(get_pcppage_migratetype(page))) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, + -(1 << order)); +@@ -3155,6 +3153,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, + */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); + spin_unlock_irqrestore(&zone->lock, flags); ++ ++ /* ++ * Pages are appended to the pcp list without checking to reduce the ++ * time holding the zone lock. Checking the appended pages happens right ++ * after the critical section while still holding the pcp lock. 
++ */ ++ pos = list_first_entry(prev_tail, struct page, pcp_list); ++ list_for_each_entry_safe_from(pos, n, list, pcp_list) { ++ if (unlikely(check_pcp_refill(pos, order))) { ++ list_del(&pos->pcp_list); ++ continue; ++ } ++ ++ allocated++; ++ } ++ + return allocated; + } + diff --git a/mm/z3fold.c b/mm/z3fold.c index a4de0c317ac7..0cef845d397b 100644 --- a/mm/z3fold.c @@ -13813,12 +14468,12 @@ index b11b7e5115dc..3033cd6ed3b4 100644 #ifdef __NR_userfaultfd test_unmerge_uffd_wp(); -- -2.39.2 +2.40.0.rc2 -From b773e9f32d0254f398a29134cec883652e3c4201 Mon Sep 17 00:00:00 2001 +From 50de9c32a97f479390ff525d679f224e1ceb8e3b Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 13 Feb 2023 11:27:09 +0100 -Subject: [PATCH 07/15] fs-patches +Date: Fri, 3 Mar 2023 16:59:32 +0100 +Subject: [PATCH 07/16] fs-patches Signed-off-by: Peter Jung --- @@ -13826,11 +14481,11 @@ Signed-off-by: Peter Jung block/blk-merge.c | 3 +- fs/btrfs/Makefile | 6 +- fs/btrfs/backref.c | 33 +- - fs/btrfs/bio.c | 557 +++++++++++++++++++++--- + fs/btrfs/bio.c | 557 ++++++++++++++++++++--- fs/btrfs/bio.h | 67 +-- - fs/btrfs/block-group.c | 273 ++++++++++-- + fs/btrfs/block-group.c | 276 ++++++++++-- fs/btrfs/block-group.h | 24 +- - fs/btrfs/btrfs_inode.h | 22 +- + fs/btrfs/btrfs_inode.h | 23 +- fs/btrfs/compression.c | 276 ++---------- fs/btrfs/compression.h | 3 - fs/btrfs/ctree.c | 62 ++- @@ -13838,48 +14493,53 @@ Signed-off-by: Peter Jung fs/btrfs/defrag.c | 4 +- fs/btrfs/delayed-ref.c | 24 +- fs/btrfs/delayed-ref.h | 2 +- - fs/btrfs/discard.c | 41 +- - fs/btrfs/disk-io.c | 225 +--------- + fs/btrfs/disk-io.c | 222 +--------- fs/btrfs/disk-io.h | 14 +- fs/btrfs/extent-io-tree.c | 10 +- fs/btrfs/extent-io-tree.h | 1 - fs/btrfs/extent-tree.c | 181 +++----- fs/btrfs/extent-tree.h | 81 ++++ - fs/btrfs/extent_io.c | 582 +++---------------------- + fs/btrfs/extent_io.c | 582 +++--------------------- fs/btrfs/extent_io.h | 36 +- - fs/btrfs/file-item.c | 72 ++-- + fs/btrfs/file-item.c | 72 ++- fs/btrfs/file-item.h | 8 +- - fs/btrfs/file.c | 2 +- + fs/btrfs/file.c | 13 +- fs/btrfs/free-space-tree.c | 2 +- - fs/btrfs/fs.c | 4 + - fs/btrfs/fs.h | 11 +- - fs/btrfs/inode.c | 641 ++++------------------------ + fs/btrfs/fs.h | 5 +- + fs/btrfs/inode.c | 715 ++++++------------------------ fs/btrfs/ioctl.c | 2 +- - fs/btrfs/lru_cache.c | 166 ++++++++ + fs/btrfs/lru_cache.c | 166 +++++++ fs/btrfs/lru_cache.h | 80 ++++ fs/btrfs/lzo.c | 2 +- fs/btrfs/messages.c | 30 -- fs/btrfs/messages.h | 34 -- - fs/btrfs/ordered-data.c | 25 +- - fs/btrfs/ordered-data.h | 3 +- + fs/btrfs/ordered-data.c | 71 ++- + fs/btrfs/ordered-data.h | 10 +- fs/btrfs/qgroup.c | 2 +- - fs/btrfs/raid56.c | 334 ++++++--------- + fs/btrfs/raid56.c | 334 +++++--------- fs/btrfs/raid56.h | 4 +- fs/btrfs/relocation.c | 2 +- - fs/btrfs/scrub.c | 51 ++- - fs/btrfs/send.c | 684 ++++++++++++++++-------------- + fs/btrfs/scrub.c | 2 +- + fs/btrfs/send.c | 684 ++++++++++++++-------------- fs/btrfs/super.c | 3 +- - fs/btrfs/sysfs.c | 41 +- - fs/btrfs/sysfs.h | 3 +- + fs/btrfs/sysfs.c | 12 +- fs/btrfs/tests/extent-map-tests.c | 2 +- - fs/btrfs/transaction.c | 34 ++ + fs/btrfs/transaction.c | 29 ++ fs/btrfs/transaction.h | 31 ++ fs/btrfs/tree-log.c | 87 ++-- fs/btrfs/tree-log.h | 9 +- fs/btrfs/volumes.c | 116 ++--- fs/btrfs/volumes.h | 18 - - fs/btrfs/zoned.c | 146 +++---- + fs/btrfs/zoned.c | 146 +++--- fs/btrfs/zoned.h | 20 +- + fs/ext4/extents.c | 2 +- + fs/ext4/file.c | 34 +- + fs/ext4/inode.c | 429 ++++++------------ + fs/ext4/ioctl.c | 3 - + fs/ext4/namei.c | 11 +- + 
fs/ext4/page-io.c | 10 +- + fs/ext4/super.c | 26 +- + fs/ext4/xattr.c | 137 ++++-- fs/gfs2/bmap.c | 38 +- fs/iomap/buffered-io.c | 91 ++-- fs/iomap/direct-io.c | 10 +- @@ -13887,25 +14547,27 @@ Signed-off-by: Peter Jung fs/xfs/libxfs/xfs_bmap.c | 32 +- fs/xfs/libxfs/xfs_bmap.h | 5 +- fs/xfs/libxfs/xfs_btree.c | 18 +- - fs/xfs/libxfs/xfs_refcount.c | 96 ++--- + fs/xfs/libxfs/xfs_refcount.c | 96 ++-- fs/xfs/libxfs/xfs_refcount.h | 4 +- - fs/xfs/libxfs/xfs_rmap.c | 50 ++- + fs/xfs/libxfs/xfs_rmap.c | 50 +-- fs/xfs/libxfs/xfs_rmap.h | 6 +- fs/xfs/xfs_bmap_item.c | 137 +++--- fs/xfs/xfs_error.c | 2 +- fs/xfs/xfs_error.h | 12 +- fs/xfs/xfs_extfree_item.c | 99 +++-- + fs/xfs/xfs_fsmap.c | 1 + fs/xfs/xfs_globals.c | 3 +- fs/xfs/xfs_iomap.c | 4 +- fs/xfs/xfs_refcount_item.c | 110 +++-- - fs/xfs/xfs_rmap_item.c | 142 +++---- + fs/xfs/xfs_rmap_item.c | 142 +++--- fs/xfs/xfs_sysfs.c | 12 +- fs/xfs/xfs_sysfs.h | 10 +- fs/xfs/xfs_trace.h | 15 +- include/linux/bio.h | 4 + include/linux/iomap.h | 30 +- include/trace/events/btrfs.h | 127 +++++- - 83 files changed, 2936 insertions(+), 3366 deletions(-) + include/trace/events/ext4.h | 7 - + 90 files changed, 3213 insertions(+), 3751 deletions(-) create mode 100644 fs/btrfs/lru_cache.c create mode 100644 fs/btrfs/lru_cache.h @@ -13923,7 +14585,7 @@ index 8de008c0c5ad..e2561416391c 100644 OR together the tags which represent errors which should cause panics: diff --git a/block/blk-merge.c b/block/blk-merge.c -index b7c193d67185..64bf7d9dd8e8 100644 +index 808b58129d3e..1ac782fdc55c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -276,7 +276,7 @@ static bool bvec_split_segs(const struct queue_limits *lim, @@ -14866,7 +15528,7 @@ index b12f84b3b341..873ff85817f0 100644 u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c -index 708d843daa72..5b10401d803b 100644 +index 708d843daa72..80c73137e322 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1,5 +1,6 @@ @@ -15038,7 +15700,17 @@ index 708d843daa72..5b10401d803b 100644 if (btrfs_test_opt(fs_info, SPACE_CACHE)) { ret = load_free_space_cache(block_group); if (ret == 1) { -@@ -1816,7 +1965,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +@@ -1687,7 +1836,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) + + btrfs_info(fs_info, + "reclaiming chunk %llu with %llu%% used %llu%% unusable", +- bg->start, div_u64(bg->used * 100, bg->length), ++ bg->start, ++ div64_u64(bg->used * 100, bg->length), + div64_u64(zone_unusable * 100, bg->length)); + trace_btrfs_reclaim_block_group(bg); + ret = btrfs_relocate_chunk(fs_info, bg->start); +@@ -1816,7 +1966,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * * @fs_info: the filesystem * @chunk_start: logical address of block group @@ -15046,7 +15718,7 @@ index 708d843daa72..5b10401d803b 100644 * @physical: physical address to map to logical addresses * @logical: return array of logical addresses which map to @physical * @naddrs: length of @logical -@@ -1827,8 +1975,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +@@ -1827,8 +1976,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) * block copies. 
*/ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, @@ -15056,7 +15728,7 @@ index 708d843daa72..5b10401d803b 100644 { struct extent_map *em; struct map_lookup *map; -@@ -1868,9 +2015,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, +@@ -1868,9 +2016,6 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, data_stripe_length)) continue; @@ -15066,7 +15738,7 @@ index 708d843daa72..5b10401d803b 100644 stripe_nr = physical - map->stripes[i].physical; stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset); -@@ -1927,7 +2071,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) +@@ -1927,7 +2072,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); @@ -15075,7 +15747,7 @@ index 708d843daa72..5b10401d803b 100644 bytenr, &logical, &nr, &stripe_len); if (ret) return ret; -@@ -3330,7 +3474,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, +@@ -3330,7 +3475,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { @@ -15084,7 +15756,7 @@ index 708d843daa72..5b10401d803b 100644 cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { -@@ -3379,6 +3523,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, +@@ -3379,6 +3524,7 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info->disk_used -= num_bytes * factor; reclaim = should_reclaim_block_group(cache, num_bytes); @@ -15092,7 +15764,7 @@ index 708d843daa72..5b10401d803b 100644 spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); -@@ -3433,32 +3578,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, +@@ -3433,32 +3579,42 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * reservation and return -EAGAIN, otherwise this function always succeeds. 
*/ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, @@ -15151,7 +15823,7 @@ index 708d843daa72..5b10401d803b 100644 spin_unlock(&cache->lock); spin_unlock(&space_info->lock); return ret; -@@ -4218,3 +4373,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount +@@ -4218,3 +4374,73 @@ void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount bg->swap_extents -= amount; spin_unlock(&bg->lock); } @@ -15287,7 +15959,7 @@ index a02ea76fd6cf..6e4a0b429ac3 100644 + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h -index 195c09e20609..49a92aa65de1 100644 +index 195c09e20609..87020aa58121 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -93,12 +93,6 @@ struct btrfs_inode { @@ -15328,6 +16000,14 @@ index 195c09e20609..49a92aa65de1 100644 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *orig_start, u64 *orig_block_len, u64 *ram_bytes, bool nowait, bool strict); +@@ -532,6 +516,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, + size_t done_before); + struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, ++ struct btrfs_ordered_extent **ordered_extent, + size_t done_before); + + extern const struct dentry_operations btrfs_dentry_operations; diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5122ca79f7ea..f42f31f22d13 100644 --- a/fs/btrfs/compression.c @@ -16095,129 +16775,8 @@ index d6304b690ec4..2eb34abf700f 100644 struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head); -diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c -index ff2e524d9937..317aeff6c1da 100644 ---- a/fs/btrfs/discard.c -+++ b/fs/btrfs/discard.c -@@ -78,6 +78,7 @@ static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, - static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) - { -+ lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; - -@@ -89,6 +90,8 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, - BTRFS_DISCARD_DELAY); - block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; - } -+ if (list_empty(&block_group->discard_list)) -+ btrfs_get_block_group(block_group); - - list_move_tail(&block_group->discard_list, - get_discard_list(discard_ctl, block_group)); -@@ -108,8 +111,12 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, - static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) - { -+ bool queued; -+ - spin_lock(&discard_ctl->lock); - -+ queued = !list_empty(&block_group->discard_list); -+ - if (!btrfs_run_discard_work(discard_ctl)) { - spin_unlock(&discard_ctl->lock); - return; -@@ -121,6 +128,8 @@ static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, - block_group->discard_eligible_time = (ktime_get_ns() + - BTRFS_DISCARD_UNUSED_DELAY); - block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; -+ if (!queued) -+ btrfs_get_block_group(block_group); - list_add_tail(&block_group->discard_list, - &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); - -@@ -131,6 +140,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, - struct btrfs_block_group *block_group) - { - bool running = false; -+ bool queued = false; - - 
spin_lock(&discard_ctl->lock); - -@@ -140,7 +150,16 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, - } - - block_group->discard_eligible_time = 0; -+ queued = !list_empty(&block_group->discard_list); - list_del_init(&block_group->discard_list); -+ /* -+ * If the block group is currently running in the discard workfn, we -+ * don't want to deref it, since it's still being used by the workfn. -+ * The workfn will notice this case and deref the block group when it is -+ * finished. -+ */ -+ if (queued && !running) -+ btrfs_put_block_group(block_group); - - spin_unlock(&discard_ctl->lock); - -@@ -214,10 +233,12 @@ static struct btrfs_block_group *peek_discard_list( - if (block_group && now >= block_group->discard_eligible_time) { - if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && - block_group->used != 0) { -- if (btrfs_is_block_group_data_only(block_group)) -+ if (btrfs_is_block_group_data_only(block_group)) { - __add_to_discard_list(discard_ctl, block_group); -- else -+ } else { - list_del_init(&block_group->discard_list); -+ btrfs_put_block_group(block_group); -+ } - goto again; - } - if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { -@@ -511,6 +532,15 @@ static void btrfs_discard_workfn(struct work_struct *work) - spin_lock(&discard_ctl->lock); - discard_ctl->prev_discard = trimmed; - discard_ctl->prev_discard_time = now; -+ /* -+ * If the block group was removed from the discard list while it was -+ * running in this workfn, then we didn't deref it, since this function -+ * still owned that reference. But we set the discard_ctl->block_group -+ * back to NULL, so we can use that condition to know that now we need -+ * to deref the block_group. -+ */ -+ if (discard_ctl->block_group == NULL) -+ btrfs_put_block_group(block_group); - discard_ctl->block_group = NULL; - __btrfs_discard_schedule_work(discard_ctl, now, false); - spin_unlock(&discard_ctl->lock); -@@ -651,8 +681,12 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) - list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, - bg_list) { - list_del_init(&block_group->bg_list); -- btrfs_put_block_group(block_group); - btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); -+ /* -+ * This put is for the get done by btrfs_mark_bg_unused. -+ * Queueing discard incremented it for discard's reference. 
-+ */ -+ btrfs_put_block_group(block_group); - } - spin_unlock(&fs_info->unused_bgs_lock); - } -@@ -683,6 +717,7 @@ static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) - if (block_group->used == 0) - btrfs_mark_bg_unused(block_group); - spin_lock(&discard_ctl->lock); -+ btrfs_put_block_group(block_group); - } - } - spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c -index 3aa04224315e..b53f0e30ce2b 100644 +index fde40112a259..b53f0e30ce2b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -78,23 +78,6 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) @@ -16463,17 +17022,7 @@ index 3aa04224315e..b53f0e30ce2b 100644 static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { -@@ -1910,6 +1727,9 @@ static int cleaner_kthread(void *arg) - goto sleep; - } - -+ if (test_and_clear_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags)) -+ btrfs_sysfs_feature_update(fs_info); -+ - btrfs_run_delayed_iputs(fs_info); - - again = btrfs_clean_one_deleted_snapshot(fs_info); -@@ -5159,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, +@@ -5162,11 +4979,12 @@ static int btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, start += fs_info->nodesize; if (!eb) continue; @@ -18091,7 +18640,7 @@ index 031225668434..cd7f2ae515c0 100644 struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c -index af046d22300e..5cc5a1faaef5 100644 +index af046d22300e..ec5c5355906b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1017,7 +1017,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, @@ -18103,6 +18652,38 @@ index af046d22300e..5cc5a1faaef5 100644 btrfs_put_ordered_extent(ordered); return -EAGAIN; } +@@ -1465,6 +1465,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) + ssize_t err; + unsigned int ilock_flags = 0; + struct iomap_dio *dio; ++ struct btrfs_ordered_extent *ordered_extent = NULL; + + if (iocb->ki_flags & IOCB_NOWAIT) + ilock_flags |= BTRFS_ILOCK_TRY; +@@ -1526,7 +1527,7 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) + * got -EFAULT, faulting in the pages before the retry. + */ + from->nofault = true; +- dio = btrfs_dio_write(iocb, from, written); ++ dio = btrfs_dio_write(iocb, from, &ordered_extent, written); + from->nofault = false; + + /* +@@ -1569,6 +1570,14 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) + goto relock; + } + } ++ /* ++ * We can't loop back to btrfs_dio_write, so we can drop the cached ++ * ordered extent. Typically btrfs_dio_iomap_end will run and put the ++ * ordered_extent, but this is needed to clean up in case of an error ++ * path breaking out of iomap_iter before the final iomap_end call. 
++ */ ++ if (ordered_extent) ++ btrfs_put_ordered_extent(ordered_extent); + + /* + * If 'err' is -ENOTBLK or we have not written all data, then it means diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index c667e878ef1a..4d155a48ec59 100644 --- a/fs/btrfs/free-space-tree.c @@ -18116,43 +18697,8 @@ index c667e878ef1a..4d155a48ec59 100644 btrfs_tree_unlock(free_space_root->node); btrfs_free_tree_block(trans, btrfs_root_id(free_space_root), free_space_root->node, 0, 1); -diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c -index 5553e1f8afe8..31c1648bc0b4 100644 ---- a/fs/btrfs/fs.c -+++ b/fs/btrfs/fs.c -@@ -24,6 +24,7 @@ void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, - name, flag); - } - spin_unlock(&fs_info->super_lock); -+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); - } - } - -@@ -46,6 +47,7 @@ void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag, - name, flag); - } - spin_unlock(&fs_info->super_lock); -+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); - } - } - -@@ -68,6 +70,7 @@ void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, - name, flag); - } - spin_unlock(&fs_info->super_lock); -+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); - } - } - -@@ -90,5 +93,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, - name, flag); - } - spin_unlock(&fs_info->super_lock); -+ set_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags); - } - } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h -index 37b86acfcbcf..4c477eae6891 100644 +index 3d8156fc8523..4c477eae6891 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -3,6 +3,7 @@ @@ -18163,20 +18709,7 @@ index 37b86acfcbcf..4c477eae6891 100644 #include #include #include -@@ -125,6 +126,12 @@ enum { - */ - BTRFS_FS_NO_OVERCOMMIT, - -+ /* -+ * Indicate if we have some features changed, this is mostly for -+ * cleaner thread to update the sysfs interface. 
-+ */ -+ BTRFS_FS_FEATURE_CHANGED, -+ - #if BITS_PER_LONG == 32 - /* Indicate if we have error/warn message printed on 32bit systems */ - BTRFS_FS_32BIT_ERROR, -@@ -742,8 +749,10 @@ struct btrfs_fs_info { +@@ -748,8 +749,10 @@ struct btrfs_fs_info { */ u64 zone_size; @@ -18189,10 +18722,14 @@ index 37b86acfcbcf..4c477eae6891 100644 spinlock_t treelog_bg_lock; u64 treelog_bg; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c -index 98a800b8bd43..44e9acc77a74 100644 +index 98a800b8bd43..6aaa892474be 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c -@@ -84,27 +84,12 @@ struct btrfs_dio_data { +@@ -81,30 +81,16 @@ struct btrfs_dio_data { + struct extent_changeset *data_reserved; + bool data_space_reserved; + bool nocow_done; ++ struct btrfs_ordered_extent *ordered; }; struct btrfs_dio_private { @@ -18222,7 +18759,7 @@ index 98a800b8bd43..44e9acc77a74 100644 }; static struct bio_set btrfs_dio_bioset; -@@ -228,7 +213,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, +@@ -228,7 +214,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; @@ -18231,7 +18768,7 @@ index 98a800b8bd43..44e9acc77a74 100644 struct page *page; if (locked_page) { -@@ -2535,19 +2520,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, +@@ -2535,19 +2521,6 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } } @@ -18251,7 +18788,7 @@ index 98a800b8bd43..44e9acc77a74 100644 /* * Split an extent_map at [start, start + len] * -@@ -2663,19 +2635,19 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, +@@ -2663,19 +2636,19 @@ static int split_zoned_em(struct btrfs_inode *inode, u64 start, u64 len, return ret; } @@ -18276,7 +18813,7 @@ index 98a800b8bd43..44e9acc77a74 100644 if (WARN_ON_ONCE(!ordered)) return BLK_STS_IOERR; -@@ -2715,7 +2687,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, +@@ -2715,7 +2688,7 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, ret = btrfs_split_ordered_extent(ordered, pre, post); if (ret) goto out; @@ -18285,7 +18822,7 @@ index 98a800b8bd43..44e9acc77a74 100644 out: btrfs_put_ordered_extent(ordered); -@@ -2723,75 +2695,6 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, +@@ -2723,75 +2696,6 @@ static blk_status_t extract_ordered_extent(struct btrfs_inode *inode, return errno_to_blk_status(ret); } @@ -18361,7 +18898,7 @@ index 98a800b8bd43..44e9acc77a74 100644 /* * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. 
-@@ -2969,7 +2872,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +@@ -2969,7 +2873,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); @@ -18370,7 +18907,7 @@ index 98a800b8bd43..44e9acc77a74 100644 btrfs_put_ordered_extent(ordered); goto again; } -@@ -3259,15 +3162,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +@@ -3259,15 +3163,13 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } @@ -18388,7 +18925,7 @@ index 98a800b8bd43..44e9acc77a74 100644 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; -@@ -3474,109 +3375,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of +@@ -3474,109 +3376,55 @@ static u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, u64 of } /* @@ -18533,7 +19070,7 @@ index 98a800b8bd43..44e9acc77a74 100644 } /* -@@ -4987,7 +4834,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, +@@ -4987,7 +4835,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, unlock_extent(io_tree, block_start, block_end, &cached_state); unlock_page(page); put_page(page); @@ -18542,7 +19079,7 @@ index 98a800b8bd43..44e9acc77a74 100644 btrfs_put_ordered_extent(ordered); goto again; } -@@ -5466,8 +5313,6 @@ void btrfs_evict_inode(struct inode *inode) +@@ -5466,8 +5314,6 @@ void btrfs_evict_inode(struct inode *inode) if (is_bad_inode(inode)) goto no_delete; @@ -18551,7 +19088,71 @@ index 98a800b8bd43..44e9acc77a74 100644 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) goto no_delete; -@@ -7392,7 +7237,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, +@@ -7131,6 +6977,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, + } + + static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, ++ struct btrfs_dio_data *dio_data, + const u64 start, + const u64 len, + const u64 orig_start, +@@ -7141,7 +6988,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + const int type) + { + struct extent_map *em = NULL; +- int ret; ++ struct btrfs_ordered_extent *ordered; + + if (type != BTRFS_ORDERED_NOCOW) { + em = create_io_em(inode, start, len, orig_start, block_start, +@@ -7151,18 +6998,21 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + if (IS_ERR(em)) + goto out; + } +- ret = btrfs_add_ordered_extent(inode, start, len, len, block_start, +- block_len, 0, +- (1 << type) | +- (1 << BTRFS_ORDERED_DIRECT), +- BTRFS_COMPRESS_NONE); +- if (ret) { ++ ordered = btrfs_alloc_ordered_extent(inode, start, len, len, ++ block_start, block_len, 0, ++ (1 << type) | ++ (1 << BTRFS_ORDERED_DIRECT), ++ BTRFS_COMPRESS_NONE); ++ if (IS_ERR(ordered)) { + if (em) { + free_extent_map(em); + btrfs_drop_extent_map_range(inode, start, + start + len - 1, false); + } +- em = ERR_PTR(ret); ++ em = ERR_PTR(PTR_ERR(ordered)); ++ } else { ++ ASSERT(!dio_data->ordered); ++ dio_data->ordered = ordered; + } + out: + +@@ -7170,6 +7020,7 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, + } + + static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, ++ struct btrfs_dio_data *dio_data, + u64 start, u64 len) + { + struct btrfs_root *root = inode->root; +@@ -7185,7 +7036,8 @@ static struct 
extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, + if (ret) + return ERR_PTR(ret); + +- em = btrfs_create_dio_extent(inode, start, ins.offset, start, ++ em = btrfs_create_dio_extent(inode, dio_data, ++ start, ins.offset, start, + ins.objectid, ins.offset, ins.offset, + ins.offset, BTRFS_ORDERED_REGULAR); + btrfs_dec_block_group_reservations(fs_info, ins.objectid); +@@ -7392,7 +7244,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) @@ -18560,7 +19161,51 @@ index 98a800b8bd43..44e9acc77a74 100644 else ret = nowait ? -EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); -@@ -7833,10 +7678,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, +@@ -7530,7 +7382,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + } + space_reserved = true; + +- em2 = btrfs_create_dio_extent(BTRFS_I(inode), start, len, ++ em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len, + orig_start, block_start, + len, orig_block_len, + ram_bytes, type); +@@ -7572,7 +7424,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, + goto out; + space_reserved = true; + +- em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); ++ em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len); + if (IS_ERR(em)) { + ret = PTR_ERR(em); + goto out; +@@ -7676,6 +7528,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + } + } + ++ if (dio_data->ordered) { ++ ASSERT(write); ++ em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, ++ dio_data->ordered->file_offset, ++ dio_data->ordered->bytes_left); ++ if (IS_ERR(em)) { ++ ret = PTR_ERR(em); ++ goto err; ++ } ++ goto map_iomap; ++ } + memset(dio_data, 0, sizeof(*dio_data)); + + /* +@@ -7817,6 +7680,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + else + free_extent_state(cached_state); + ++map_iomap: + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does +@@ -7833,10 +7697,6 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; @@ -18571,7 +19216,37 @@ index 98a800b8bd43..44e9acc77a74 100644 free_extent_map(em); return 0; -@@ -7888,267 +7729,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, +@@ -7874,13 +7734,25 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + if (submitted < length) { + pos += submitted; + length -= submitted; +- if (write) +- btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, +- pos, length, false); +- else ++ if (write) { ++ if (submitted == 0) { ++ btrfs_mark_ordered_io_finished(BTRFS_I(inode), ++ NULL, pos, ++ length, false); ++ btrfs_put_ordered_extent(dio_data->ordered); ++ dio_data->ordered = NULL; ++ } ++ } else { + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ++ } + ret = -ENOTBLK; ++ } else { ++ /* On the last bio, release our cached ordered_extent. 
*/ ++ if (write) { ++ btrfs_put_ordered_extent(dio_data->ordered); ++ dio_data->ordered = NULL; ++ } + } + + if (write) +@@ -7888,267 +7760,47 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, return ret; } @@ -18599,7 +19274,8 @@ index 98a800b8bd43..44e9acc77a74 100644 -} - -void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) --{ ++static void btrfs_dio_end_io(struct btrfs_bio *bbio) + { - struct btrfs_dio_private *dip = btrfs_bio(bio)->private; - - BUG_ON(bio_op(bio) == REQ_OP_WRITE); @@ -18644,8 +19320,7 @@ index 98a800b8bd43..44e9acc77a74 100644 -blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode, - struct bio *bio, - u64 dio_file_offset) -+static void btrfs_dio_end_io(struct btrfs_bio *bbio) - { +-{ - return btrfs_csum_one_bio(inode, bio, dio_file_offset, false); -} - @@ -18667,7 +19342,7 @@ index 98a800b8bd43..44e9acc77a74 100644 - - if (bio_op(bio) == REQ_OP_READ) - err = btrfs_check_read_dio_bio(dip, bbio, !err); -- + - if (err) - dip->bio.bi_status = err; - @@ -18676,7 +19351,7 @@ index 98a800b8bd43..44e9acc77a74 100644 - bio_put(bio); - btrfs_dio_private_put(dip); -} - +- -static void btrfs_submit_dio_bio(struct bio *bio, struct btrfs_inode *inode, - u64 file_offset, int async_submit) -{ @@ -18797,12 +19472,10 @@ index 98a800b8bd43..44e9acc77a74 100644 - status = errno_to_blk_status(ret); - goto out_err_em; - } - +- - clone_len = min(submit_len, geom.len); - ASSERT(clone_len <= UINT_MAX); -+ btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); -+ bbio->file_offset = file_offset; - +- - /* - * This will never fail as it's passing GPF_NOFS and - * the allocation is backed by btrfs_bioset. @@ -18843,9 +19516,11 @@ index 98a800b8bd43..44e9acc77a74 100644 - if (!raid56) - async_submit = 1; - } -- + - btrfs_submit_dio_bio(bio, BTRFS_I(inode), file_offset, async_submit); -- ++ btrfs_bio_init(bbio, BTRFS_I(iter->inode), btrfs_dio_end_io, bio->bi_private); ++ bbio->file_offset = file_offset; + - dio_data->submitted += clone_len; - clone_offset += clone_len; - start_sector += clone_len >> 9; @@ -18867,7 +19542,7 @@ index 98a800b8bd43..44e9acc77a74 100644 } static const struct iomap_ops btrfs_dio_iomap_ops = { -@@ -8157,7 +7778,7 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { +@@ -8157,25 +7809,30 @@ static const struct iomap_ops btrfs_dio_iomap_ops = { }; static const struct iomap_dio_ops btrfs_dio_ops = { @@ -18876,7 +19551,34 @@ index 98a800b8bd43..44e9acc77a74 100644 .bio_set = &btrfs_dio_bioset, }; -@@ -8552,7 +8173,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) + ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) + { +- struct btrfs_dio_data data; ++ struct btrfs_dio_data data = { 0 }; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL, &data, done_before); + } + + struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, ++ struct btrfs_ordered_extent **ordered_extent, + size_t done_before) + { +- struct btrfs_dio_data data; ++ struct btrfs_dio_data dio_data = { .ordered = *ordered_extent }; ++ struct iomap_dio *dio; + +- return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, +- IOMAP_DIO_PARTIAL, &data, done_before); ++ dio = __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, ++ IOMAP_DIO_PARTIAL, &dio_data, done_before); ++ if (!IS_ERR_OR_NULL(dio)) ++ *ordered_extent = dio_data.ordered; ++ return dio; + } + + static 
int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +@@ -8552,7 +8209,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) unlock_extent(io_tree, page_start, page_end, &cached_state); unlock_page(page); up_read(&BTRFS_I(inode)->i_mmap_lock); @@ -18885,7 +19587,7 @@ index 98a800b8bd43..44e9acc77a74 100644 btrfs_put_ordered_extent(ordered); goto again; } -@@ -8850,7 +8471,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) +@@ -8850,7 +8507,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_log_commit = 0; spin_lock_init(&ei->lock); @@ -18893,7 +19595,7 @@ index 98a800b8bd43..44e9acc77a74 100644 ei->outstanding_extents = 0; if (sb->s_magic != BTRFS_TEST_MAGIC) btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv, -@@ -8870,7 +8490,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) +@@ -8870,7 +8526,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->io_tree.inode = ei; extent_io_tree_init(fs_info, &ei->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); @@ -18901,7 +19603,7 @@ index 98a800b8bd43..44e9acc77a74 100644 atomic_set(&ei->sync_writers, 0); mutex_init(&ei->log_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); -@@ -8994,7 +8613,7 @@ int __init btrfs_init_cachep(void) +@@ -8994,7 +8649,7 @@ int __init btrfs_init_cachep(void) goto fail; if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, @@ -18910,7 +19612,7 @@ index 98a800b8bd43..44e9acc77a74 100644 BIOSET_NEED_BVECS)) goto fail; -@@ -10289,65 +9908,13 @@ struct btrfs_encoded_read_private { +@@ -10289,65 +9944,13 @@ struct btrfs_encoded_read_private { wait_queue_head_t wait; atomic_t pending; blk_status_t status; @@ -18977,7 +19679,7 @@ index 98a800b8bd43..44e9acc77a74 100644 /* * The memory barrier implied by the atomic_dec_return() here * pairs with the memory barrier implied by the -@@ -10356,11 +9923,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) +@@ -10356,11 +9959,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) * write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ @@ -18990,7 +19692,7 @@ index 98a800b8bd43..44e9acc77a74 100644 bio_put(&bbio->bio); } -@@ -10368,47 +9934,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, +@@ -10368,47 +9970,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, u64 file_offset, u64 disk_bytenr, u64 disk_io_size, struct page **pages) { @@ -19041,7 +19743,7 @@ index 98a800b8bd43..44e9acc77a74 100644 btrfs_encoded_read_endio, &priv); bio->bi_iter.bi_sector = -@@ -10417,14 +9962,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, +@@ -10417,14 +9998,8 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, if (!bytes || bio_add_page(bio, pages[i], bytes, 0) < bytes) { @@ -19058,7 +19760,7 @@ index 98a800b8bd43..44e9acc77a74 100644 bio = NULL; continue; } -@@ -10435,7 +9974,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, +@@ -10435,7 +10010,6 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, } } @@ -19066,7 +19768,7 @@ index 98a800b8bd43..44e9acc77a74 100644 if (atomic_dec_return(&priv.pending)) io_wait_event(priv.wait, !atomic_read(&priv.pending)); /* See btrfs_encoded_read_endio() for ordering. 
*/ -@@ -10995,9 +10533,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, +@@ -10995,9 +10569,8 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis, return 0; max_pages = sis->max - bsi->nr_pages; @@ -19456,10 +20158,89 @@ index 190af1f698d9..8c516ee58ff9 100644 __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ (errno), fmt, ##args) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c -index 57d8c72737e1..6c24b69e2d0a 100644 +index 57d8c72737e1..1848d0d1a9c4 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c -@@ -616,7 +616,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) +@@ -160,14 +160,16 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + * @compress_type: Compression algorithm used for data. + * + * Most of these parameters correspond to &struct btrfs_file_extent_item. The +- * tree is given a single reference on the ordered extent that was inserted. ++ * tree is given a single reference on the ordered extent that was inserted, and ++ * the returned pointer is given a second reference. + * +- * Return: 0 or -ENOMEM. ++ * Return: the new ordered extent or ERR_PTR(-ENOMEM). + */ +-int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, +- u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, +- u64 disk_num_bytes, u64 offset, unsigned flags, +- int compress_type) ++struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ++ struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type) + { + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; +@@ -181,7 +183,7 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + /* For nocow write, we can release the qgroup rsv right now */ + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + if (ret < 0) +- return ret; ++ return ERR_PTR(ret); + ret = 0; + } else { + /* +@@ -190,11 +192,11 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + */ + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + if (ret < 0) +- return ret; ++ return ERR_PTR(ret); + } + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); + if (!entry) +- return -ENOMEM; ++ return ERR_PTR(-ENOMEM); + + entry->file_offset = file_offset; + entry->num_bytes = num_bytes; +@@ -256,6 +258,32 @@ int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + btrfs_mod_outstanding_extents(inode, 1); + spin_unlock(&inode->lock); + ++ /* One ref for the returned entry to match semantics of lookup. */ ++ refcount_inc(&entry->refs); ++ ++ return entry; ++} ++ ++/* ++ * Add a new btrfs_ordered_extent for the range, but drop the reference instead ++ * of returning it to the caller. 
++ */ ++int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type) ++{ ++ struct btrfs_ordered_extent *ordered; ++ ++ ordered = btrfs_alloc_ordered_extent(inode, file_offset, num_bytes, ++ ram_bytes, disk_bytenr, ++ disk_num_bytes, offset, flags, ++ compress_type); ++ ++ if (IS_ERR(ordered)) ++ return PTR_ERR(ordered); ++ btrfs_put_ordered_extent(ordered); ++ + return 0; + } + +@@ -616,7 +644,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); @@ -19468,7 +20249,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 complete(&ordered->completion); } -@@ -716,13 +716,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, +@@ -716,13 +744,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, } /* @@ -19486,7 +20267,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; -@@ -744,12 +743,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) +@@ -744,12 +771,10 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); @@ -19503,7 +20284,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 } /* -@@ -800,7 +797,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +@@ -800,7 +825,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } @@ -19512,7 +20293,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't -@@ -1061,7 +1058,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, +@@ -1061,7 +1086,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent(&inode->io_tree, start, end, cachedp); @@ -19522,7 +20303,7 @@ index 57d8c72737e1..6c24b69e2d0a 100644 } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h -index 89f82b78f590..eb40cb39f842 100644 +index 89f82b78f590..18007f9c00ad 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -157,7 +157,6 @@ struct btrfs_ordered_extent { @@ -19533,7 +20314,21 @@ index 89f82b78f590..eb40cb39f842 100644 }; static inline void -@@ -187,7 +186,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, +@@ -179,15 +178,20 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, + bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, + struct btrfs_ordered_extent **cached, + u64 file_offset, u64 io_size); ++struct btrfs_ordered_extent *btrfs_alloc_ordered_extent( ++ struct btrfs_inode *inode, u64 file_offset, ++ u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, ++ int compress_type); + int btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset, + u64 num_bytes, u64 ram_bytes, u64 disk_bytenr, +- u64 disk_num_bytes, u64 offset, unsigned flags, ++ u64 disk_num_bytes, u64 offset, unsigned long flags, + int compress_type); + void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct 
btrfs_inode *inode, u64 file_offset); @@ -20182,7 +20977,7 @@ index 31ec4a7658ce..ef13a9d4e370 100644 struct btrfs_fs_info *fs_info = inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c -index 52b346795f66..69c93ae333f6 100644 +index a5d026041be4..69c93ae333f6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -229,7 +229,7 @@ struct full_stripe_lock { @@ -20194,77 +20989,6 @@ index 52b346795f66..69c93ae333f6 100644 struct scrub_page_private { u64 logical; }; -@@ -2053,20 +2053,33 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) - * a) don't have an extent buffer and - * b) the page is already kmapped - */ -- if (sblock->logical != btrfs_stack_header_bytenr(h)) -+ if (sblock->logical != btrfs_stack_header_bytenr(h)) { - sblock->header_error = 1; -- -- if (sector->generation != btrfs_stack_header_generation(h)) { -- sblock->header_error = 1; -- sblock->generation_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad bytenr, has %llu want %llu", -+ sblock->logical, sblock->mirror_num, -+ btrfs_stack_header_bytenr(h), -+ sblock->logical); -+ goto out; - } - -- if (!scrub_check_fsid(h->fsid, sector)) -+ if (!scrub_check_fsid(h->fsid, sector)) { - sblock->header_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad fsid, has %pU want %pU", -+ sblock->logical, sblock->mirror_num, -+ h->fsid, sblock->dev->fs_devices->fsid); -+ goto out; -+ } - -- if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, -- BTRFS_UUID_SIZE)) -+ if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { - sblock->header_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", -+ sblock->logical, sblock->mirror_num, -+ h->chunk_tree_uuid, fs_info->chunk_tree_uuid); -+ goto out; -+ } - - shash->tfm = fs_info->csum_shash; - crypto_shash_init(shash); -@@ -2079,9 +2092,27 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) - } - - crypto_shash_final(shash, calculated_csum); -- if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) -+ if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size)) { - sblock->checksum_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, -+ sblock->logical, sblock->mirror_num, -+ CSUM_FMT_VALUE(fs_info->csum_size, on_disk_csum), -+ CSUM_FMT_VALUE(fs_info->csum_size, calculated_csum)); -+ goto out; -+ } -+ -+ if (sector->generation != btrfs_stack_header_generation(h)) { -+ sblock->header_error = 1; -+ sblock->generation_error = 1; -+ btrfs_warn_rl(fs_info, -+ "tree block %llu mirror %u has bad generation, has %llu want %llu", -+ sblock->logical, sblock->mirror_num, -+ btrfs_stack_header_generation(h), -+ sector->generation); -+ } - -+out: - return sblock->header_error || sblock->checksum_error; - } - diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d50182b6deec..e5c963bb873d 100644 --- a/fs/btrfs/send.c @@ -21491,7 +22215,7 @@ index 433ce221dc5c..581845bc206a 100644 if (!mixed && found->flags & BTRFS_BLOCK_GROUP_METADATA) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c -index 45615ce36498..8c5efa5813b3 100644 +index 108aa3876186..8c5efa5813b3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -702,7 +702,7 @@ static void release_raid_kobj(struct kobject *kobj) @@ -21548,65 +22272,6 @@ index 45615ce36498..8c5efa5813b3 100644 .sysfs_ops = 
&kobj_sysfs_ops, .release = qgroup_release, .default_groups = qgroup_groups, -@@ -2272,36 +2272,23 @@ void btrfs_sysfs_del_one_qgroup(struct btrfs_fs_info *fs_info, - * Change per-fs features in /sys/fs/btrfs/UUID/features to match current - * values in superblock. Call after any changes to incompat/compat_ro flags - */ --void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, -- u64 bit, enum btrfs_feature_set set) -+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info) - { -- struct btrfs_fs_devices *fs_devs; - struct kobject *fsid_kobj; -- u64 __maybe_unused features; -- int __maybe_unused ret; -+ int ret; - - if (!fs_info) - return; - -- /* -- * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not -- * safe when called from some contexts (eg. balance) -- */ -- features = get_features(fs_info, set); -- ASSERT(bit & supported_feature_masks[set]); -- -- fs_devs = fs_info->fs_devices; -- fsid_kobj = &fs_devs->fsid_kobj; -- -+ fsid_kobj = &fs_info->fs_devices->fsid_kobj; - if (!fsid_kobj->state_initialized) - return; - -- /* -- * FIXME: this is too heavy to update just one value, ideally we'd like -- * to use sysfs_update_group but some refactoring is needed first. -- */ -- sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); -- ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); -+ ret = sysfs_update_group(fsid_kobj, &btrfs_feature_attr_group); -+ if (ret < 0) -+ btrfs_warn(fs_info, -+ "failed to update /sys/fs/btrfs/%pU/features: %d", -+ fs_info->fs_devices->fsid, ret); - } - - int __init btrfs_init_sysfs(void) -diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h -index bacef43f7267..86c7eef12873 100644 ---- a/fs/btrfs/sysfs.h -+++ b/fs/btrfs/sysfs.h -@@ -19,8 +19,7 @@ void btrfs_sysfs_remove_device(struct btrfs_device *device); - int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); - void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); - void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); --void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, -- u64 bit, enum btrfs_feature_set set); -+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info); - void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); - - int __init btrfs_init_sysfs(void); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index c5b3a631bf4f..f2f2e11dac4c 100644 --- a/fs/btrfs/tests/extent-map-tests.c @@ -21621,22 +22286,10 @@ index c5b3a631bf4f..f2f2e11dac4c 100644 if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c -index b8c52e89688c..18329ebcb1cb 100644 +index 8f8d0fce6e4a..18329ebcb1cb 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c -@@ -2464,6 +2464,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) - wake_up(&fs_info->transaction_wait); - btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_UNBLOCKED); - -+ /* If we have features changed, wake up the cleaner to update sysfs. 
*/ -+ if (test_bit(BTRFS_FS_FEATURE_CHANGED, &fs_info->flags) && -+ fs_info->cleaner_kthread) -+ wake_up_process(fs_info->cleaner_kthread); -+ - ret = btrfs_write_and_wait_transaction(trans); - if (ret) { - btrfs_handle_fs_error(fs_info, ret, -@@ -2604,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) +@@ -2609,6 +2609,35 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info) return (ret < 0) ? 0 : 1; } @@ -22522,6 +23175,1091 @@ index f43990985d80..c0570d35fea2 100644 static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group) { return true; +diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c +index 9de1c9d1a13d..3559ea6b0781 100644 +--- a/fs/ext4/extents.c ++++ b/fs/ext4/extents.c +@@ -3251,7 +3251,7 @@ static int ext4_split_extent_at(handle_t *handle, + ext4_ext_mark_unwritten(ex2); + + err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); +- if (err != -ENOSPC && err != -EDQUOT) ++ if (err != -ENOSPC && err != -EDQUOT && err != -ENOMEM) + goto out; + + if (EXT4_EXT_MAY_ZEROOUT & split_flag) { +diff --git a/fs/ext4/file.c b/fs/ext4/file.c +index 7ac0a81bd371..6e9f198ecacf 100644 +--- a/fs/ext4/file.c ++++ b/fs/ext4/file.c +@@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len) + return false; + } + +-/* Is IO overwriting allocated and initialized blocks? */ +-static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) ++/* Is IO overwriting allocated or initialized blocks? */ ++static bool ext4_overwrite_io(struct inode *inode, ++ loff_t pos, loff_t len, bool *unwritten) + { + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; +@@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) + blklen = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); ++ if (err != blklen) ++ return false; + /* + * 'err==len' means that all of the blocks have been preallocated, +- * regardless of whether they have been initialized or not. To exclude +- * unwritten extents, we need to check m_flags. ++ * regardless of whether they have been initialized or not. We need to ++ * check m_flags to distinguish the unwritten extents. + */ +- return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); ++ *unwritten = !(map.m_flags & EXT4_MAP_MAPPED); ++ return true; + } + + static ssize_t ext4_generic_write_checks(struct kiocb *iocb, +@@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = { + * - For extending writes case we don't take the shared lock, since it requires + * updating inode i_disksize and/or orphan handling with exclusive lock. + * +- * - shared locking will only be true mostly with overwrites. Otherwise we will +- * switch to exclusive i_rwsem lock. ++ * - shared locking will only be true mostly with overwrites, including ++ * initialized blocks and unwritten blocks. For overwrite unwritten blocks ++ * we protect splitting extents by i_data_sem in ext4_inode_info, so we can ++ * also release exclusive i_rwsem lock. ++ * ++ * - Otherwise we will switch to exclusive i_rwsem lock. + */ + static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, +- bool *ilock_shared, bool *extend) ++ bool *ilock_shared, bool *extend, ++ bool *unwritten) + { + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); +@@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from, + * in file_modified(). 
+ */ + if (*ilock_shared && (!IS_NOSEC(inode) || *extend || +- !ext4_overwrite_io(inode, offset, count))) { ++ !ext4_overwrite_io(inode, offset, count, unwritten))) { + if (iocb->ki_flags & IOCB_NOWAIT) { + ret = -EAGAIN; + goto out; +@@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) + loff_t offset = iocb->ki_pos; + size_t count = iov_iter_count(from); + const struct iomap_ops *iomap_ops = &ext4_iomap_ops; +- bool extend = false, unaligned_io = false; ++ bool extend = false, unaligned_io = false, unwritten = false; + bool ilock_shared = true; + + /* +@@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) + return ext4_buffered_write_iter(iocb, from); + } + +- ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend); ++ ret = ext4_dio_write_checks(iocb, from, ++ &ilock_shared, &extend, &unwritten); + if (ret <= 0) + return ret; + +@@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) + ext4_journal_stop(handle); + } + +- if (ilock_shared) ++ if (ilock_shared && !unwritten) + iomap_ops = &ext4_iomap_overwrite_ops; + ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops, + (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0, +diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c +index 9d9f414f99fe..24128f6cd1b0 100644 +--- a/fs/ext4/inode.c ++++ b/fs/ext4/inode.c +@@ -136,7 +136,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, + new_size); + } + +-static int __ext4_journalled_writepage(struct page *page, unsigned int len); + static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); + +@@ -1005,30 +1004,6 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, + return ret; + } + +-/* +- * To preserve ordering, it is essential that the hole instantiation and +- * the data write be encapsulated in a single transaction. We cannot +- * close off a transaction and start a new one between the ext4_get_block() +- * and the commit_write(). So doing the jbd2_journal_start at the start of +- * prepare_write() is the right place. +- * +- * Also, this function can nest inside ext4_writepage(). In that case, we +- * *know* that ext4_writepage() has generated enough buffer credits to do the +- * whole page. So we won't block on the journal in that case, which is good, +- * because the caller may be PF_MEMALLOC. +- * +- * By accident, ext4 can be reentered when a transaction is open via +- * quota file writes. If we were to commit the transaction while thus +- * reentered, there can be a deadlock - we would be holding a quota +- * lock, and the commit would never complete if another thread had a +- * transaction open and was blocking on the quota lock - a ranking +- * violation. +- * +- * So what we do is to rely on the fact that jbd2_journal_stop/journal_start +- * will _not_ run commit under these circumstances because handle->h_ref +- * is elevated. We'll still have enough credits for the tiny quotafile +- * write. +- */ + int do_journal_get_write_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh) + { +@@ -1149,6 +1124,13 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, + } + #endif + ++/* ++ * To preserve ordering, it is essential that the hole instantiation and ++ * the data write be encapsulated in a single transaction. We cannot ++ * close off a transaction and start a new one between the ext4_get_block() ++ * and the ext4_write_end(). 
So doing the jbd2_journal_start at the start of ++ * ext4_write_begin() is the right place. ++ */ + static int ext4_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + struct page **pagep, void **fsdata) +@@ -1649,12 +1631,6 @@ static void ext4_print_free_blocks(struct inode *inode) + return; + } + +-static int ext4_bh_delay_or_unwritten(handle_t *handle, struct inode *inode, +- struct buffer_head *bh) +-{ +- return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); +-} +- + /* + * ext4_insert_delayed_block - adds a delayed block to the extents status + * tree, incrementing the reserved cluster/block +@@ -1887,216 +1863,10 @@ int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, + return 0; + } + +-static int __ext4_journalled_writepage(struct page *page, +- unsigned int len) ++static void mpage_page_done(struct mpage_da_data *mpd, struct page *page) + { +- struct address_space *mapping = page->mapping; +- struct inode *inode = mapping->host; +- handle_t *handle = NULL; +- int ret = 0, err = 0; +- int inline_data = ext4_has_inline_data(inode); +- struct buffer_head *inode_bh = NULL; +- loff_t size; +- +- ClearPageChecked(page); +- +- if (inline_data) { +- BUG_ON(page->index != 0); +- BUG_ON(len > ext4_get_max_inline_size(inode)); +- inode_bh = ext4_journalled_write_inline_data(inode, len, page); +- if (inode_bh == NULL) +- goto out; +- } +- /* +- * We need to release the page lock before we start the +- * journal, so grab a reference so the page won't disappear +- * out from under us. +- */ +- get_page(page); +- unlock_page(page); +- +- handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, +- ext4_writepage_trans_blocks(inode)); +- if (IS_ERR(handle)) { +- ret = PTR_ERR(handle); +- put_page(page); +- goto out_no_pagelock; +- } +- BUG_ON(!ext4_handle_valid(handle)); +- +- lock_page(page); +- put_page(page); +- size = i_size_read(inode); +- if (page->mapping != mapping || page_offset(page) > size) { +- /* The page got truncated from under us */ +- ext4_journal_stop(handle); +- ret = 0; +- goto out; +- } +- +- if (inline_data) { +- ret = ext4_mark_inode_dirty(handle, inode); +- } else { +- struct buffer_head *page_bufs = page_buffers(page); +- +- if (page->index == size >> PAGE_SHIFT) +- len = size & ~PAGE_MASK; +- else +- len = PAGE_SIZE; +- +- ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, +- NULL, do_journal_get_write_access); +- +- err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, +- NULL, write_end_fn); +- } +- if (ret == 0) +- ret = err; +- err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); +- if (ret == 0) +- ret = err; +- EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; +- err = ext4_journal_stop(handle); +- if (!ret) +- ret = err; +- +- ext4_set_inode_state(inode, EXT4_STATE_JDATA); +-out: ++ mpd->first_page++; + unlock_page(page); +-out_no_pagelock: +- brelse(inode_bh); +- return ret; +-} +- +-/* +- * Note that we don't need to start a transaction unless we're journaling data +- * because we should have holes filled from ext4_page_mkwrite(). We even don't +- * need to file the inode to the transaction's list in ordered mode because if +- * we are writing back data added by write(), the inode is already there and if +- * we are writing back data modified via mmap(), no one guarantees in which +- * transaction the data will hit the disk. 
In case we are journaling data, we +- * cannot start transaction directly because transaction start ranks above page +- * lock so we have to do some magic. +- * +- * This function can get called via... +- * - ext4_writepages after taking page lock (have journal handle) +- * - journal_submit_inode_data_buffers (no journal handle) +- * - shrink_page_list via the kswapd/direct reclaim (no journal handle) +- * - grab_page_cache when doing write_begin (have journal handle) +- * +- * We don't do any block allocation in this function. If we have page with +- * multiple blocks we need to write those buffer_heads that are mapped. This +- * is important for mmaped based write. So if we do with blocksize 1K +- * truncate(f, 1024); +- * a = mmap(f, 0, 4096); +- * a[0] = 'a'; +- * truncate(f, 4096); +- * we have in the page first buffer_head mapped via page_mkwrite call back +- * but other buffer_heads would be unmapped but dirty (dirty done via the +- * do_wp_page). So writepage should write the first block. If we modify +- * the mmap area beyond 1024 we will again get a page_fault and the +- * page_mkwrite callback will do the block allocation and mark the +- * buffer_heads mapped. +- * +- * We redirty the page if we have any buffer_heads that is either delay or +- * unwritten in the page. +- * +- * We can get recursively called as show below. +- * +- * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> +- * ext4_writepage() +- * +- * But since we don't do any block allocation we should not deadlock. +- * Page also have the dirty flag cleared so we don't get recurive page_lock. +- */ +-static int ext4_writepage(struct page *page, +- struct writeback_control *wbc) +-{ +- struct folio *folio = page_folio(page); +- int ret = 0; +- loff_t size; +- unsigned int len; +- struct buffer_head *page_bufs = NULL; +- struct inode *inode = page->mapping->host; +- struct ext4_io_submit io_submit; +- +- if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { +- folio_invalidate(folio, 0, folio_size(folio)); +- folio_unlock(folio); +- return -EIO; +- } +- +- trace_ext4_writepage(page); +- size = i_size_read(inode); +- if (page->index == size >> PAGE_SHIFT && +- !ext4_verity_in_progress(inode)) +- len = size & ~PAGE_MASK; +- else +- len = PAGE_SIZE; +- +- /* Should never happen but for bugs in other kernel subsystems */ +- if (!page_has_buffers(page)) { +- ext4_warning_inode(inode, +- "page %lu does not have buffers attached", page->index); +- ClearPageDirty(page); +- unlock_page(page); +- return 0; +- } +- +- page_bufs = page_buffers(page); +- /* +- * We cannot do block allocation or other extent handling in this +- * function. If there are buffers needing that, we have to redirty +- * the page. But we may reach here when we do a journal commit via +- * journal_submit_inode_data_buffers() and in that case we must write +- * allocated buffers to achieve data=ordered mode guarantees. +- * +- * Also, if there is only one buffer per page (the fs block +- * size == the page size), if one buffer needs block +- * allocation or needs to modify the extent tree to clear the +- * unwritten flag, we know that the page can't be written at +- * all, so we might as well refuse the write immediately. +- * Unfortunately if the block size != page size, we can't as +- * easily detect this case using ext4_walk_page_buffers(), but +- * for the extremely common case, this is an optimization that +- * skips a useless round trip through ext4_bio_write_page(). 
+- */ +- if (ext4_walk_page_buffers(NULL, inode, page_bufs, 0, len, NULL, +- ext4_bh_delay_or_unwritten)) { +- redirty_page_for_writepage(wbc, page); +- if ((current->flags & PF_MEMALLOC) || +- (inode->i_sb->s_blocksize == PAGE_SIZE)) { +- /* +- * For memory cleaning there's no point in writing only +- * some buffers. So just bail out. Warn if we came here +- * from direct reclaim. +- */ +- WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) +- == PF_MEMALLOC); +- unlock_page(page); +- return 0; +- } +- } +- +- if (PageChecked(page) && ext4_should_journal_data(inode)) +- /* +- * It's mmapped pagecache. Add buffers and journal it. There +- * doesn't seem much point in redirtying the page here. +- */ +- return __ext4_journalled_writepage(page, len); +- +- ext4_io_submit_init(&io_submit, wbc); +- io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS); +- if (!io_submit.io_end) { +- redirty_page_for_writepage(wbc, page); +- unlock_page(page); +- return -ENOMEM; +- } +- ret = ext4_bio_write_page(&io_submit, page, len); +- ext4_io_submit(&io_submit); +- /* Drop io_end reference we got from init */ +- ext4_put_io_end_defer(io_submit.io_end); +- return ret; + } + + static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) +@@ -2129,7 +1899,6 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) + err = ext4_bio_write_page(&mpd->io_submit, page, len); + if (!err) + mpd->wbc->nr_to_write--; +- mpd->first_page++; + + return err; + } +@@ -2243,6 +2012,7 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, + err = mpage_submit_page(mpd, head->b_page); + if (err < 0) + return err; ++ mpage_page_done(mpd, head->b_page); + } + if (lblk >= blocks) { + mpd->scanned_until_end = 1; +@@ -2374,6 +2144,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) + err = mpage_submit_page(mpd, page); + if (err < 0) + goto out; ++ mpage_page_done(mpd, page); + } + folio_batch_release(&fbatch); + } +@@ -2572,6 +2343,50 @@ static bool ext4_page_nomap_can_writeout(struct page *page) + return false; + } + ++static int ext4_journal_page_buffers(handle_t *handle, struct page *page, ++ int len) ++{ ++ struct buffer_head *page_bufs = page_buffers(page); ++ struct inode *inode = page->mapping->host; ++ int ret, err; ++ ++ ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, ++ NULL, do_journal_get_write_access); ++ err = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len, ++ NULL, write_end_fn); ++ if (ret == 0) ++ ret = err; ++ err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len); ++ if (ret == 0) ++ ret = err; ++ EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid; ++ ++ ext4_set_inode_state(inode, EXT4_STATE_JDATA); ++ ++ return ret; ++} ++ ++static int mpage_journal_page_buffers(handle_t *handle, ++ struct mpage_da_data *mpd, ++ struct page *page) ++{ ++ struct inode *inode = mpd->inode; ++ loff_t size = i_size_read(inode); ++ int len; ++ ++ ClearPageChecked(page); ++ clear_page_dirty_for_io(page); ++ mpd->wbc->nr_to_write--; ++ ++ if (page->index == size >> PAGE_SHIFT && ++ !ext4_verity_in_progress(inode)) ++ len = size & ~PAGE_MASK; ++ else ++ len = PAGE_SIZE; ++ ++ return ext4_journal_page_buffers(handle, page, len); ++} ++ + /* + * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages + * needing mapping, submit mapped pages +@@ -2597,7 +2412,6 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + struct address_space *mapping = mpd->inode->i_mapping; + struct pagevec 
pvec; + unsigned int nr_pages; +- long left = mpd->wbc->nr_to_write; + pgoff_t index = mpd->first_page; + pgoff_t end = mpd->last_page; + xa_mark_t tag; +@@ -2605,12 +2419,20 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + int blkbits = mpd->inode->i_blkbits; + ext4_lblk_t lblk; + struct buffer_head *head; ++ handle_t *handle = NULL; ++ int bpp = ext4_journal_blocks_per_page(mpd->inode); + + if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; + ++ if (ext4_should_journal_data(mpd->inode)) { ++ handle = ext4_journal_start(mpd->inode, EXT4_HT_WRITE_PAGE, ++ bpp); ++ if (IS_ERR(handle)) ++ return PTR_ERR(handle); ++ } + pagevec_init(&pvec); + mpd->map.m_len = 0; + mpd->next_page = index; +@@ -2631,13 +2453,22 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + * newly appeared dirty pages, but have not synced all + * of the old dirty pages. + */ +- if (mpd->wbc->sync_mode == WB_SYNC_NONE && left <= 0) ++ if (mpd->wbc->sync_mode == WB_SYNC_NONE && ++ mpd->wbc->nr_to_write <= ++ mpd->map.m_len >> (PAGE_SHIFT - blkbits)) + goto out; + + /* If we can't merge this page, we are done. */ + if (mpd->map.m_len > 0 && mpd->next_page != page->index) + goto out; + ++ if (handle) { ++ err = ext4_journal_ensure_credits(handle, bpp, ++ 0); ++ if (err < 0) ++ goto out; ++ } ++ + lock_page(page); + /* + * If the page is no longer dirty, or its mapping no +@@ -2677,18 +2508,30 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + mpd->first_page = page->index; + mpd->next_page = page->index + 1; + /* +- * Writeout for transaction commit where we cannot +- * modify metadata is simple. Just submit the page. ++ * Writeout when we cannot modify metadata is simple. ++ * Just submit the page. For data=journal mode we ++ * first handle writeout of the page for checkpoint and ++ * only after that handle delayed page dirtying. This ++ * is crutial so that forcing a transaction commit and ++ * then calling filemap_write_and_wait() guarantees ++ * current state of data is in its final location. Such ++ * sequence is used for example by insert/collapse ++ * range operations before discarding the page cache. + */ + if (!mpd->can_map) { + if (ext4_page_nomap_can_writeout(page)) { + err = mpage_submit_page(mpd, page); + if (err < 0) + goto out; +- } else { +- unlock_page(page); +- mpd->first_page++; + } ++ /* Pending dirtying of journalled data? 
*/ ++ if (PageChecked(page)) { ++ err = mpage_journal_page_buffers(handle, ++ mpd, page); ++ if (err < 0) ++ goto out; ++ } ++ mpage_page_done(mpd, page); + } else { + /* Add all dirty buffers to mpd */ + lblk = ((ext4_lblk_t)page->index) << +@@ -2700,24 +2543,21 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) + goto out; + err = 0; + } +- left--; + } + pagevec_release(&pvec); + cond_resched(); + } + mpd->scanned_until_end = 1; ++ if (handle) ++ ext4_journal_stop(handle); + return 0; + out: + pagevec_release(&pvec); ++ if (handle) ++ ext4_journal_stop(handle); + return err; + } + +-static int ext4_writepage_cb(struct page *page, struct writeback_control *wbc, +- void *data) +-{ +- return ext4_writepage(page, wbc); +-} +- + static int ext4_do_writepages(struct mpage_da_data *mpd) + { + struct writeback_control *wbc = mpd->wbc; +@@ -2743,13 +2583,6 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) + goto out_writepages; + +- if (ext4_should_journal_data(inode)) { +- blk_start_plug(&plug); +- ret = write_cache_pages(mapping, wbc, ext4_writepage_cb, NULL); +- blk_finish_plug(&plug); +- goto out_writepages; +- } +- + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that +@@ -2784,6 +2617,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) + ext4_journal_stop(handle); + } + ++ /* ++ * data=journal mode does not do delalloc so we just need to writeout / ++ * journal already mapped buffers ++ */ ++ if (ext4_should_journal_data(inode)) ++ mpd->can_map = 0; ++ + if (ext4_should_dioread_nolock(inode)) { + /* + * We may need to convert up to one extent per block in +@@ -3160,9 +3000,8 @@ static int ext4_da_write_end(struct file *file, + * i_disksize since writeback will push i_disksize upto i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block (ext4_da_should_update_i_disksize() +- * check), we need to update i_disksize here as neither +- * ext4_writepage() nor certain ext4_writepages() paths not +- * allocating blocks update i_disksize. ++ * check), we need to update i_disksize here as certain ++ * ext4_writepages() paths not allocating blocks update i_disksize. + * + * Note that we defer inode dirtying to generic_write_end() / + * ext4_da_write_inline_data_end(). +@@ -3687,24 +3526,26 @@ const struct iomap_ops ext4_iomap_report_ops = { + }; + + /* +- * Whenever the folio is being dirtied, corresponding buffers should already +- * be attached to the transaction (we take care of this in ext4_page_mkwrite() +- * and ext4_write_begin()). However we cannot move buffers to dirty transaction +- * lists here because ->dirty_folio is called under VFS locks and the folio +- * is not necessarily locked. +- * +- * We cannot just dirty the folio and leave attached buffers clean, because the +- * buffers' dirty state is "definitive". We cannot just set the buffers dirty +- * or jbddirty because all the journalling code will explode. +- * +- * So what we do is to mark the folio "pending dirty" and next time writepage +- * is called, propagate that into the buffers appropriately. ++ * For data=journal mode, folio should be marked dirty only when it was ++ * writeably mapped. When that happens, it was already attached to the ++ * transaction and marked as jbddirty (we take care of this in ++ * ext4_page_mkwrite()). 
On transaction commit, we writeprotect page mappings ++ * so we should have nothing to do here, except for the case when someone ++ * had the page pinned and dirtied the page through this pin (e.g. by doing ++ * direct IO to it). In that case we'd need to attach buffers here to the ++ * transaction but we cannot due to lock ordering. We cannot just dirty the ++ * folio and leave attached buffers clean, because the buffers' dirty state is ++ * "definitive". We cannot just set the buffers dirty or jbddirty because all ++ * the journalling code will explode. So what we do is to mark the folio ++ * "pending dirty" and next time ext4_writepages() is called, attach buffers ++ * to the transaction appropriately. + */ + static bool ext4_journalled_dirty_folio(struct address_space *mapping, + struct folio *folio) + { + WARN_ON_ONCE(!folio_buffers(folio)); +- folio_set_checked(folio); ++ if (folio_maybe_dma_pinned(folio)) ++ folio_set_checked(folio); + return filemap_dirty_folio(mapping, folio); + } + +@@ -4872,13 +4713,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + goto bad_inode; + raw_inode = ext4_raw_inode(&iloc); + +- if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) { +- ext4_error_inode(inode, function, line, 0, +- "iget: root inode unallocated"); +- ret = -EFSCORRUPTED; +- goto bad_inode; +- } +- + if ((flags & EXT4_IGET_HANDLE) && + (raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) { + ret = -ESTALE; +@@ -4951,11 +4785,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, + * NeilBrown 1999oct15 + */ + if (inode->i_nlink == 0) { +- if ((inode->i_mode == 0 || ++ if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL || + !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) && + ino != EXT4_BOOT_LOADER_INO) { +- /* this inode is deleted */ +- ret = -ESTALE; ++ /* this inode is deleted or unallocated */ ++ if (flags & EXT4_IGET_SPECIAL) { ++ ext4_error_inode(inode, function, line, 0, ++ "iget: special inode unallocated"); ++ ret = -EFSCORRUPTED; ++ } else ++ ret = -ESTALE; + goto bad_inode; + } + /* The only unlinked inodes we let through here have +@@ -5382,7 +5221,7 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode) + * If the folio is fully truncated, we don't need to wait for any commit + * (and we even should not as __ext4_journalled_invalidate_folio() may + * strip all buffers from the folio but keep the folio dirty which can then +- * confuse e.g. concurrent ext4_writepage() seeing dirty folio without ++ * confuse e.g. concurrent ext4_writepages() seeing dirty folio without + * buffers). Also we don't need to wait for any commit if all buffers in + * the folio remain valid. This is most beneficial for the common case of + * blocksize == PAGESIZE. 
+@@ -5788,7 +5627,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; + int idxblocks; +- int ret = 0; ++ int ret; + + /* + * How many index blocks need to touch to map @lblocks logical blocks +@@ -6320,18 +6159,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf) + err = __block_write_begin(page, 0, len, ext4_get_block); + if (!err) { + ret = VM_FAULT_SIGBUS; +- if (ext4_walk_page_buffers(handle, inode, +- page_buffers(page), 0, len, NULL, +- do_journal_get_write_access)) +- goto out_error; +- if (ext4_walk_page_buffers(handle, inode, +- page_buffers(page), 0, len, NULL, +- write_end_fn)) +- goto out_error; +- if (ext4_jbd2_inode_add_write(handle, inode, +- page_offset(page), len)) ++ if (ext4_journal_page_buffers(handle, page, len)) + goto out_error; +- ext4_set_inode_state(inode, EXT4_STATE_JDATA); + } else { + unlock_page(page); + } +diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c +index 8067ccda34e4..2e8c34036313 100644 +--- a/fs/ext4/ioctl.c ++++ b/fs/ext4/ioctl.c +@@ -155,9 +155,6 @@ static int ext4_update_backup_sb(struct super_block *sb, + set_buffer_uptodate(bh); + unlock_buffer(bh); + +- if (err) +- goto out_bh; +- + if (handle) { + err = ext4_handle_dirty_metadata(handle, NULL, bh); + if (err) +diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c +index dd28453d6ea3..270fbcba75b6 100644 +--- a/fs/ext4/namei.c ++++ b/fs/ext4/namei.c +@@ -3872,9 +3872,16 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir)) + goto end_rename; + } ++ /* ++ * We need to protect against old.inode directory getting ++ * converted from inline directory format into a normal one. ++ */ ++ inode_lock_nested(old.inode, I_MUTEX_NONDIR2); + retval = ext4_rename_dir_prepare(handle, &old); +- if (retval) ++ if (retval) { ++ inode_unlock(old.inode); + goto end_rename; ++ } + } + /* + * If we're renaming a file within an inline_data dir and adding or +@@ -4006,6 +4013,8 @@ static int ext4_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + } else { + ext4_journal_stop(handle); + } ++ if (old.dir_bh) ++ inode_unlock(old.inode); + release_bh: + brelse(old.dir_bh); + brelse(old.bh); +diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c +index beaec6d81074..3bc7c7c5b99d 100644 +--- a/fs/ext4/page-io.c ++++ b/fs/ext4/page-io.c +@@ -500,7 +500,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, + + /* Nothing to submit? Just unlock the page... */ + if (!nr_to_submit) +- goto unlock; ++ return 0; + + bh = head = page_buffers(page); + +@@ -548,7 +548,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, + } + bh = bh->b_this_page; + } while (bh != head); +- goto unlock; ++ ++ return ret; + } + } + +@@ -564,7 +565,6 @@ int ext4_bio_write_page(struct ext4_io_submit *io, + io_submit_add_bh(io, inode, + bounce_page ? 
bounce_page : page, bh); + } while ((bh = bh->b_this_page) != head); +-unlock: +- unlock_page(page); +- return ret; ++ ++ return 0; + } +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index c81fa0fa9901..2192b4111442 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -4751,7 +4751,6 @@ static int ext4_group_desc_init(struct super_block *sb, + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned int db_count; + ext4_fsblk_t block; +- int ret; + int i; + + db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / +@@ -4791,8 +4790,7 @@ static int ext4_group_desc_init(struct super_block *sb, + ext4_msg(sb, KERN_ERR, + "can't read group descriptor %d", i); + sbi->s_gdb_count = i; +- ret = PTR_ERR(bh); +- goto out; ++ return PTR_ERR(bh); + } + rcu_read_lock(); + rcu_dereference(sbi->s_group_desc)[i] = bh; +@@ -4801,13 +4799,10 @@ static int ext4_group_desc_init(struct super_block *sb, + sbi->s_gdb_count = db_count; + if (!ext4_check_descriptors(sb, logical_sb_block, first_not_zeroed)) { + ext4_msg(sb, KERN_ERR, "group descriptors corrupted!"); +- ret = -EFSCORRUPTED; +- goto out; ++ return -EFSCORRUPTED; + } ++ + return 0; +-out: +- ext4_group_desc_free(sbi); +- return ret; + } + + static int ext4_load_and_init_journal(struct super_block *sb, +@@ -5234,14 +5229,14 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) + if (ext4_geometry_check(sb, es)) + goto failed_mount; + +- err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); +- if (err) +- goto failed_mount; +- + timer_setup(&sbi->s_err_report, print_daily_error_info, 0); + spin_lock_init(&sbi->s_error_lock); + INIT_WORK(&sbi->s_error_work, flush_stashed_error_work); + ++ err = ext4_group_desc_init(sb, es, logical_sb_block, &first_not_zeroed); ++ if (err) ++ goto failed_mount3; ++ + /* Register extent status tree shrinker */ + if (ext4_es_register_shrinker(sbi)) + goto failed_mount3; +@@ -5967,8 +5962,11 @@ static int ext4_load_journal(struct super_block *sb, + if (!really_read_only && journal_devnum && + journal_devnum != le32_to_cpu(es->s_journal_dev)) { + es->s_journal_dev = cpu_to_le32(journal_devnum); +- +- /* Make sure we flush the recovery flag to disk. 
*/ ++ ext4_commit_super(sb); ++ } ++ if (!really_read_only && journal_inum && ++ journal_inum != le32_to_cpu(es->s_journal_inum)) { ++ es->s_journal_inum = cpu_to_le32(journal_inum); + ext4_commit_super(sb); + } + +diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c +index 0c6b011a91b3..62f2ec599218 100644 +--- a/fs/ext4/xattr.c ++++ b/fs/ext4/xattr.c +@@ -184,27 +184,73 @@ ext4_xattr_handler(int name_index) + } + + static int +-ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, +- void *value_start) ++check_xattrs(struct inode *inode, struct buffer_head *bh, ++ struct ext4_xattr_entry *entry, void *end, void *value_start, ++ const char *function, unsigned int line) + { + struct ext4_xattr_entry *e = entry; ++ int err = -EFSCORRUPTED; ++ char *err_str; ++ ++ if (bh) { ++ if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || ++ BHDR(bh)->h_blocks != cpu_to_le32(1)) { ++ err_str = "invalid header"; ++ goto errout; ++ } ++ if (buffer_verified(bh)) ++ return 0; ++ if (!ext4_xattr_block_csum_verify(inode, bh)) { ++ err = -EFSBADCRC; ++ err_str = "invalid checksum"; ++ goto errout; ++ } ++ } else { ++ struct ext4_xattr_ibody_header *header = value_start; ++ ++ header -= 1; ++ if (end - (void *)header < sizeof(*header) + sizeof(u32)) { ++ err_str = "in-inode xattr block too small"; ++ goto errout; ++ } ++ if (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC)) { ++ err_str = "bad magic number in in-inode xattr"; ++ goto errout; ++ } ++ } + + /* Find the end of the names list */ + while (!IS_LAST_ENTRY(e)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); +- if ((void *)next >= end) +- return -EFSCORRUPTED; +- if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) +- return -EFSCORRUPTED; ++ if ((void *)next >= end) { ++ err_str = "e_name out of bounds"; ++ goto errout; ++ } ++ if (strnlen(e->e_name, e->e_name_len) != e->e_name_len) { ++ err_str = "bad e_name length"; ++ goto errout; ++ } + e = next; + } + + /* Check the values */ + while (!IS_LAST_ENTRY(entry)) { + u32 size = le32_to_cpu(entry->e_value_size); ++ unsigned long ea_ino = le32_to_cpu(entry->e_value_inum); + +- if (size > EXT4_XATTR_SIZE_MAX) +- return -EFSCORRUPTED; ++ if (!ext4_has_feature_ea_inode(inode->i_sb) && ea_ino) { ++ err_str = "ea_inode specified without ea_inode feature enabled"; ++ goto errout; ++ } ++ if (ea_ino && ((ea_ino == EXT4_ROOT_INO) || ++ !ext4_valid_inum(inode->i_sb, ea_ino))) { ++ err_str = "invalid ea_ino"; ++ goto errout; ++ } ++ if (size > EXT4_XATTR_SIZE_MAX) { ++ err_str = "e_value size too large"; ++ goto errout; ++ } + + if (size != 0 && entry->e_value_inum == 0) { + u16 offs = le16_to_cpu(entry->e_value_offs); +@@ -216,66 +262,54 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, + * the padded and unpadded sizes, since the size may + * overflow to 0 when adding padding. 
+ */ +- if (offs > end - value_start) +- return -EFSCORRUPTED; ++ if (offs > end - value_start) { ++ err_str = "e_value out of bounds"; ++ goto errout; ++ } + value = value_start + offs; + if (value < (void *)e + sizeof(u32) || + size > end - value || +- EXT4_XATTR_SIZE(size) > end - value) +- return -EFSCORRUPTED; ++ EXT4_XATTR_SIZE(size) > end - value) { ++ err_str = "overlapping e_value "; ++ goto errout; ++ } + } + entry = EXT4_XATTR_NEXT(entry); + } +- ++ if (bh) ++ set_buffer_verified(bh); + return 0; ++ ++errout: ++ if (bh) ++ __ext4_error_inode(inode, function, line, 0, -err, ++ "corrupted xattr block %llu: %s", ++ (unsigned long long) bh->b_blocknr, ++ err_str); ++ else ++ __ext4_error_inode(inode, function, line, 0, -err, ++ "corrupted in-inode xattr: %s", err_str); ++ return err; + } + + static inline int + __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, + const char *function, unsigned int line) + { +- int error = -EFSCORRUPTED; +- +- if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || +- BHDR(bh)->h_blocks != cpu_to_le32(1)) +- goto errout; +- if (buffer_verified(bh)) +- return 0; +- +- error = -EFSBADCRC; +- if (!ext4_xattr_block_csum_verify(inode, bh)) +- goto errout; +- error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, +- bh->b_data); +-errout: +- if (error) +- __ext4_error_inode(inode, function, line, 0, -error, +- "corrupted xattr block %llu", +- (unsigned long long) bh->b_blocknr); +- else +- set_buffer_verified(bh); +- return error; ++ return check_xattrs(inode, bh, BFIRST(bh), bh->b_data + bh->b_size, ++ bh->b_data, function, line); + } + + #define ext4_xattr_check_block(inode, bh) \ + __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) + + +-static int ++static inline int + __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line) + { +- int error = -EFSCORRUPTED; +- +- if (end - (void *)header < sizeof(*header) + sizeof(u32) || +- (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) +- goto errout; +- error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); +-errout: +- if (error) +- __ext4_error_inode(inode, function, line, 0, -error, +- "corrupted in-inode xattr"); +- return error; ++ return check_xattrs(inode, NULL, IFIRST(header), end, IFIRST(header), ++ function, line); + } + + #define xattr_check_inode(inode, header, end) \ +@@ -388,6 +422,17 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino, + struct inode *inode; + int err; + ++ /* ++ * We have to check for this corruption early as otherwise ++ * iget_locked() could wait indefinitely for the state of our ++ * parent inode. 
++ */ ++ if (parent->i_ino == ea_ino) { ++ ext4_error(parent->i_sb, ++ "Parent and EA inode have the same ino %lu", ea_ino); ++ return -EFSCORRUPTED; ++ } ++ + inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e7537fd305dd..e191ecfb1fde 100644 --- a/fs/gfs2/bmap.c @@ -23847,6 +25585,18 @@ index d5130d1fcfae..011b50469301 100644 if (error == -EFSCORRUPTED) XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, extp, sizeof(*extp)); +diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c +index 88a88506ffff..92ca2017eded 100644 +--- a/fs/xfs/xfs_fsmap.c ++++ b/fs/xfs/xfs_fsmap.c +@@ -761,6 +761,7 @@ xfs_getfsmap_datadev_bnobt( + { + struct xfs_alloc_rec_incore akeys[2]; + ++ memset(akeys, 0, sizeof(akeys)); + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + return __xfs_getfsmap_datadev(tp, keys, info, + xfs_getfsmap_datadev_bnobt_query, &akeys[0]); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 4d0a98f920ca..9edc1f2bc939 100644 --- a/fs/xfs/xfs_globals.c @@ -24891,13 +26641,31 @@ index 6548b5b5aa60..75d7d22c3a27 100644 ); TRACE_EVENT(btrfs_find_cluster, +diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h +index 77b426ae0064..ebccf6a6aa1b 100644 +--- a/include/trace/events/ext4.h ++++ b/include/trace/events/ext4.h +@@ -584,13 +584,6 @@ DECLARE_EVENT_CLASS(ext4__page_op, + (unsigned long) __entry->index) + ); + +-DEFINE_EVENT(ext4__page_op, ext4_writepage, +- +- TP_PROTO(struct page *page), +- +- TP_ARGS(page) +-); +- + DEFINE_EVENT(ext4__page_op, ext4_readpage, + + TP_PROTO(struct page *page), -- -2.39.2 +2.40.0.rc2 -From dd48d0cbb7162c029af11d861336a07195a7f331 Mon Sep 17 00:00:00 2001 +From 31bc464783789781c2a6885b36f63fcb3751a5bb Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 17 Feb 2023 15:35:46 +0100 -Subject: [PATCH 08/15] Implement amd-pstate-epp and amd-pstate-guided driver +Date: Fri, 10 Mar 2023 18:05:48 +0100 +Subject: [PATCH 08/16] Implement amd-pstate-epp and amd-pstate-guided driver Signed-off-by: Peter Jung --- @@ -24907,14 +26675,13 @@ Signed-off-by: Peter Jung drivers/cpufreq/amd-pstate.c | 794 +++++++++++++++++- drivers/cpufreq/brcmstb-avs-cpufreq.c | 5 +- drivers/cpufreq/cpufreq.c | 8 +- - drivers/cpufreq/davinci-cpufreq.c | 4 +- drivers/cpufreq/mediatek-cpufreq-hw.c | 4 +- drivers/cpufreq/omap-cpufreq.c | 4 +- drivers/cpufreq/qcom-cpufreq-hw.c | 4 +- include/acpi/cppc_acpi.h | 23 + include/linux/amd-pstate.h | 34 + include/linux/cpufreq.h | 2 +- - 13 files changed, 1139 insertions(+), 59 deletions(-) + 12 files changed, 1136 insertions(+), 58 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 9595abf34974..f39b8f05392c 100644 @@ -25335,7 +27102,7 @@ index 0f17b1c32718..0efdbeed6ada 100644 /* Update only if there are pending write commands */ if (pcc_ss_data->pending_pcc_write_cmd) diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index c17bd845f5fc..d4e60da7a544 100644 +index c17bd845f5fc..f4f96baae500 100644 --- a/drivers/cpufreq/amd-pstate.c +++ b/drivers/cpufreq/amd-pstate.c @@ -59,8 +59,173 @@ @@ -26189,7 +27956,7 @@ index c17bd845f5fc..d4e60da7a544 100644 - if (!cppc_load) { - pr_debug("driver load is disabled, boot with amd_pstate=passive to enable this\n"); + if (cppc_state == AMD_PSTATE_DISABLE) { -+ pr_debug("driver load is disabled, boot with specific mode to enable this\n"); ++ pr_info("driver load is disabled, boot with 
specific mode to enable this\n"); return -ENODEV; } @@ -26322,21 +28089,6 @@ index 7e56a42750ea..85a0bea2dbf1 100644 } EXPORT_SYMBOL_GPL(cpufreq_unregister_driver); -diff --git a/drivers/cpufreq/davinci-cpufreq.c b/drivers/cpufreq/davinci-cpufreq.c -index 9e97f60f8199..2d23015e2abd 100644 ---- a/drivers/cpufreq/davinci-cpufreq.c -+++ b/drivers/cpufreq/davinci-cpufreq.c -@@ -138,7 +138,9 @@ static int __exit davinci_cpufreq_remove(struct platform_device *pdev) - if (cpufreq.asyncclk) - clk_put(cpufreq.asyncclk); - -- return cpufreq_unregister_driver(&davinci_driver); -+ cpufreq_unregister_driver(&davinci_driver); -+ -+ return 0; - } - - static struct platform_driver davinci_cpufreq_driver = { diff --git a/drivers/cpufreq/mediatek-cpufreq-hw.c b/drivers/cpufreq/mediatek-cpufreq-hw.c index f80339779084..f21a9e3df53d 100644 --- a/drivers/cpufreq/mediatek-cpufreq-hw.c @@ -26509,12 +28261,12 @@ index 6a94a6eaad27..65623233ab2f 100644 bool cpufreq_driver_test_flags(u16 flags); const char *cpufreq_get_current_driver(void); -- -2.39.2 +2.40.0.rc2 -From 952e829ec925dcded44f080eefbef8078de089c8 Mon Sep 17 00:00:00 2001 +From 501028b1bc1da95eeb61b26a0ee82ef93873d5d7 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Sun, 22 Jan 2023 13:41:50 +0100 -Subject: [PATCH 09/15] ksm +Subject: [PATCH 09/16] ksm Signed-off-by: Peter Jung --- @@ -26774,10 +28526,10 @@ index 860b2dcf3ac4..810e1fcaff94 100644 COND_SYSCALL(mbind); COND_SYSCALL(get_mempolicy); diff --git a/mm/ksm.c b/mm/ksm.c -index c267b92b837b..4474b7ac0cd6 100644 +index ee60890cf9b1..bc920121bce9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c -@@ -2575,54 +2575,78 @@ static int ksm_scan_thread(void *nothing) +@@ -2582,54 +2582,78 @@ static int ksm_scan_thread(void *nothing) return 0; } @@ -27010,37 +28762,37 @@ index b6ea204d4e23..0064dcafb812 100644 +subsys_initcall(pmadv_sysfs_init); +#endif /* CONFIG_KSM */ -- -2.39.2 +2.40.0.rc2 -From 4146b9df71595a233386acaed0dc699b27eb7e8a Mon Sep 17 00:00:00 2001 +From abf71738a315ea5ad029cd3976ec7b2d9456c432 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 13 Feb 2023 09:25:47 +0100 -Subject: [PATCH 10/15] maple-lru +Date: Fri, 10 Mar 2023 18:06:12 +0100 +Subject: [PATCH 10/16] maple-lru Signed-off-by: Peter Jung --- - Documentation/mm/multigen_lru.rst | 86 ++- + Documentation/mm/multigen_lru.rst | 128 +++- include/linux/fs.h | 2 + include/linux/maple_tree.h | 6 - include/linux/memcontrol.h | 10 + include/linux/mm_inline.h | 19 +- - include/linux/mmzone.h | 122 +++- - lib/maple_tree.c | 113 ++- + include/linux/mmzone.h | 124 +++- + lib/maple_tree.c | 149 ++-- mm/fadvise.c | 5 +- mm/memcontrol.c | 12 + mm/memory.c | 7 +- mm/page_alloc.c | 1 + mm/rmap.c | 42 +- - mm/vmscan.c | 1059 ++++++++++++++++++----------- + mm/vmscan.c | 1083 ++++++++++++++++++----------- mm/workingset.c | 4 +- tools/testing/radix-tree/maple.c | 18 +- - 15 files changed, 1002 insertions(+), 504 deletions(-) + 15 files changed, 1066 insertions(+), 544 deletions(-) diff --git a/Documentation/mm/multigen_lru.rst b/Documentation/mm/multigen_lru.rst -index d7062c6a8946..5f1f6ecbb79b 100644 +index d7062c6a8946..52ed5092022f 100644 --- a/Documentation/mm/multigen_lru.rst +++ b/Documentation/mm/multigen_lru.rst -@@ -89,15 +89,15 @@ variables are monotonically increasing. +@@ -89,21 +89,22 @@ variables are monotonically increasing. Generation numbers are truncated into ``order_base_2(MAX_NR_GENS+1)`` bits in order to fit into the gen counter in ``folio->flags``. 
Each @@ -27059,7 +28811,15 @@ index d7062c6a8946..5f1f6ecbb79b 100644 contrast to moving across generations, which requires the LRU lock, moving across tiers only involves atomic operations on ``folio->flags`` and therefore has a negligible cost. A feedback loop -@@ -127,7 +127,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. + modeled after the PID controller monitors refaults over all the tiers + from anon and file types and decides which tiers from which types to +-evict or protect. ++evict or protect. The desired effect is to balance refault percentages ++between anon and file types proportional to the swappiness level. + + There are two conceptually independent procedures: the aging and the + eviction. They form a closed-loop system, i.e., the page reclaim. +@@ -127,7 +128,7 @@ page mapped by this PTE to ``(max_seq%MAX_NR_GENS)+1``. Eviction -------- The eviction consumes old generations. Given an ``lruvec``, it @@ -27068,7 +28828,7 @@ index d7062c6a8946..5f1f6ecbb79b 100644 ``min_seq%MAX_NR_GENS`` becomes empty. To select a type and a tier to evict from, it first compares ``min_seq[]`` to select the older type. If both types are equally old, it selects the one whose first tier has -@@ -141,9 +141,85 @@ loop has detected outlying refaults from the tier this page is in. To +@@ -141,15 +142,124 @@ loop has detected outlying refaults from the tier this page is in. To this end, the feedback loop uses the first tier as the baseline, for the reason stated earlier. @@ -27087,6 +28847,27 @@ index d7062c6a8946..5f1f6ecbb79b 100644 + and memory sizes. +2. It is more reliable because it is directly wired to the OOM killer. + ++``mm_struct`` list ++------------------ ++An ``mm_struct`` list is maintained for each memcg, and an ++``mm_struct`` follows its owner task to the new memcg when this task ++is migrated. ++ ++A page table walker iterates ``lruvec_memcg()->mm_list`` and calls ++``walk_page_range()`` with each ``mm_struct`` on this list to scan ++PTEs. When multiple page table walkers iterate the same list, each of ++them gets a unique ``mm_struct``, and therefore they can run in ++parallel. ++ ++Page table walkers ignore any misplaced pages, e.g., if an ++``mm_struct`` was migrated, pages left in the previous memcg will be ++ignored when the current memcg is under reclaim. Similarly, page table ++walkers will ignore pages from nodes other than the one under reclaim. ++ ++This infrastructure also tracks the usage of ``mm_struct`` between ++context switches so that page table walkers can skip processes that ++have been sleeping since the last iteration. ++ +Rmap/PT walk feedback +--------------------- +Searching the rmap for PTEs mapping each page on an LRU list (to test @@ -27101,7 +28882,7 @@ index d7062c6a8946..5f1f6ecbb79b 100644 +adds the PMD entry pointing to the PTE table to the Bloom filter. This +forms a feedback loop between the eviction and the aging. + -+Bloom Filters ++Bloom filters +------------- +Bloom filters are a space and memory efficient data structure for set +membership test, i.e., test if an element is not in the set or may be @@ -27117,6 +28898,18 @@ index d7062c6a8946..5f1f6ecbb79b 100644 +which may yield hot pages anyway. Parameters of the filter itself can +control the false positive rate in the limit. 
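(Illustrative aside, not part of the patch: the sketch below only demonstrates the set-membership behaviour the paragraph above relies on — a Bloom filter can answer "definitely not added" or "possibly added", and its false-positive rate is governed by the bitmap size and the number of hash functions. Every name and constant here is hypothetical; the kernel's actual MGLRU filters live in mm/vmscan.c and are sized, hashed and synchronised differently.)

/* Illustrative sketch only -- not the kernel's MGLRU code. */
#include <stdbool.h>
#include <stdint.h>

#define BLOOM_BITS (1u << 15)               /* bitmap size, a power of two */

static uint8_t bloom_map[BLOOM_BITS / 8];   /* 4 KiB bitmap */

/* cheap multiplicative hash of a key (e.g. a PMD entry's address) */
static uint32_t bloom_hash(uint64_t key, uint64_t seed)
{
	return (uint32_t)(((key ^ seed) * 0x9E3779B97F4A7C15ull) >> 40) & (BLOOM_BITS - 1);
}

static void bloom_set(uint64_t key)
{
	uint32_t h1 = bloom_hash(key, 0x1111), h2 = bloom_hash(key, 0x2222);

	bloom_map[h1 / 8] |= 1u << (h1 % 8);
	bloom_map[h2 / 8] |= 1u << (h2 % 8);
}

/* false: definitely never added; true: possibly added (may be a false positive) */
static bool bloom_test(uint64_t key)
{
	uint32_t h1 = bloom_hash(key, 0x1111), h2 = bloom_hash(key, 0x2222);

	return (bloom_map[h1 / 8] & (1u << (h1 % 8))) &&
	       (bloom_map[h2 / 8] & (1u << (h2 % 8)));
}

Recording a PMD entry with bloom_set() and later checking it with bloom_test() mirrors how a positive lookup lets a page table walk prioritise a densely populated PTE table, at the cost of an occasional false positive that merely triggers a wasted scan.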
+ ++PID controller ++-------------- ++A feedback loop modeled after the Proportional-Integral-Derivative ++(PID) controller monitors refaults over anon and file types and ++decides which type to evict when both types are available from the ++same generation. ++ ++The PID controller uses generations rather than the wall clock as the ++time domain because a CPU can scan pages at different rates under ++varying memory pressure. It calculates a moving average for each new ++generation to avoid being permanently locked in a suboptimal state. ++ +Memcg LRU +--------- +An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, @@ -27155,6 +28948,15 @@ index d7062c6a8946..5f1f6ecbb79b 100644 * Generations * Rmap walks +-* Page table walks +-* Bloom filters +-* PID controller ++* Page table walks via ``mm_struct`` list ++* Bloom filters for rmap/PT walk feedback ++* PID controller for refault feedback + + The aging and the eviction form a producer-consumer model; + specifically, the latter drives the former by the sliding window over diff --git a/include/linux/fs.h b/include/linux/fs.h index c1769a2c5d70..d353c262d669 100644 --- a/include/linux/fs.h @@ -27285,7 +29087,7 @@ index ff3f3f23f649..de1e622dd366 100644 + #endif diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index cd28a100d9e4..977be526c939 100644 +index cd28a100d9e4..70bd7f55bdd2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -7,6 +7,7 @@ @@ -27490,20 +29292,32 @@ index cd28a100d9e4..977be526c939 100644 /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif -@@ -1243,6 +1345,8 @@ typedef struct pglist_data { +@@ -1242,7 +1344,9 @@ typedef struct pglist_data { + #ifdef CONFIG_LRU_GEN /* kswap mm walk data */ - struct lru_gen_mm_walk mm_walk; +- struct lru_gen_mm_walk mm_walk; ++ struct lru_gen_mm_walk mm_walk; + /* lru_gen_folio list */ + struct lru_gen_memcg memcg_lru; #endif CACHELINE_PADDING(_pad2_); diff --git a/lib/maple_tree.c b/lib/maple_tree.c -index 5a976393c9ae..b95652b79b55 100644 +index 5a976393c9ae..a73f83d0eb0e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c -@@ -149,13 +149,12 @@ struct maple_subtree_state { +@@ -146,16 +146,22 @@ struct maple_subtree_state { + struct maple_big_node *bn; + }; + ++#ifdef CONFIG_KASAN_STACK ++/* Prevent mas_wr_bnode() from exceeding the stack frame limit */ ++#define noinline_for_kasan noinline_for_stack ++#else ++#define noinline_for_kasan inline ++#endif ++ /* Functions */ static inline struct maple_node *mt_alloc_one(gfp_t gfp) { @@ -27519,7 +29333,7 @@ index 5a976393c9ae..b95652b79b55 100644 } static inline void mt_free_bulk(size_t size, void __rcu **nodes) -@@ -183,7 +182,6 @@ static void ma_free_rcu(struct maple_node *node) +@@ -183,7 +189,6 @@ static void ma_free_rcu(struct maple_node *node) call_rcu(&node->rcu, mt_free_rcu); } @@ -27527,7 +29341,7 @@ index 5a976393c9ae..b95652b79b55 100644 static void mas_set_height(struct ma_state *mas) { unsigned int new_flags = mas->tree->ma_flags; -@@ -468,7 +466,7 @@ static inline +@@ -468,7 +473,7 @@ static inline void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, unsigned char slot) { @@ -27536,7 +29350,7 @@ index 5a976393c9ae..b95652b79b55 100644 unsigned long shift; unsigned long type; enum maple_type p_type = mte_node_type(parent); -@@ -502,10 +500,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode *parent, +@@ -502,10 +507,9 @@ void mte_set_parent(struct maple_enode *enode, const struct maple_enode 
*parent, */ static inline unsigned int mte_parent_slot(const struct maple_enode *enode) { @@ -27549,7 +29363,7 @@ index 5a976393c9ae..b95652b79b55 100644 return 0; /* -@@ -1128,9 +1125,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) +@@ -1128,9 +1132,10 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) { struct maple_alloc *ret, *node = mas->alloc; unsigned long total = mas_allocated(mas); @@ -27561,7 +29375,7 @@ index 5a976393c9ae..b95652b79b55 100644 return NULL; if (total == 1) { -@@ -1140,27 +1138,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) +@@ -1140,27 +1145,25 @@ static inline struct maple_node *mas_pop_node(struct ma_state *mas) goto single_node; } @@ -27597,7 +29411,7 @@ index 5a976393c9ae..b95652b79b55 100644 return (struct maple_node *)ret; } -@@ -1179,21 +1175,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) +@@ -1179,21 +1182,20 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) unsigned long count; unsigned int requested = mas_alloc_req(mas); @@ -27624,7 +29438,7 @@ index 5a976393c9ae..b95652b79b55 100644 reuse->total += head->total; } -@@ -1212,7 +1207,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +@@ -1212,7 +1214,6 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) { struct maple_alloc *node; unsigned long allocated = mas_allocated(mas); @@ -27632,7 +29446,7 @@ index 5a976393c9ae..b95652b79b55 100644 unsigned int requested = mas_alloc_req(mas); unsigned int count; void **slots = NULL; -@@ -1228,24 +1222,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +@@ -1228,24 +1229,29 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) WARN_ON(!allocated); } @@ -27667,7 +29481,7 @@ index 5a976393c9ae..b95652b79b55 100644 slots = (void **)&node->slot[offset]; max_req -= offset; -@@ -1259,15 +1258,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +@@ -1259,15 +1265,13 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) goto nomem_bulk; node->node_count += count; @@ -27687,7 +29501,7 @@ index 5a976393c9ae..b95652b79b55 100644 return; nomem_bulk: -@@ -1276,10 +1273,8 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) +@@ -1276,10 +1280,8 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) nomem_one: mas_set_alloc_req(mas, requested); if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) @@ -27699,7 +29513,7 @@ index 5a976393c9ae..b95652b79b55 100644 } /* -@@ -1887,10 +1882,9 @@ static inline int mab_calc_split(struct ma_state *mas, +@@ -1887,10 +1889,9 @@ static inline int mab_calc_split(struct ma_state *mas, /* Avoid ending a node on a NULL entry */ split = mab_no_null_split(bn, split, slot_count); @@ -27712,7 +29526,16 @@ index 5a976393c9ae..b95652b79b55 100644 return split; } -@@ -2947,7 +2941,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) +@@ -2113,7 +2114,7 @@ static inline void mas_bulk_rebalance(struct ma_state *mas, unsigned char end, + * + * Return: The actual end of the data stored in @b_node + */ +-static inline void mas_store_b_node(struct ma_wr_state *wr_mas, ++static noinline_for_kasan void mas_store_b_node(struct ma_wr_state *wr_mas, + struct maple_big_node *b_node, unsigned char offset_end) + { + unsigned char slot; +@@ -2947,7 +2948,7 @@ static inline void *mtree_range_walk(struct ma_state *mas) mas->min = prev_min; mas->max = prev_max; 
mas->node = last; @@ -27721,7 +29544,7 @@ index 5a976393c9ae..b95652b79b55 100644 dead_node: mas_reset(mas); -@@ -3467,7 +3461,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, +@@ -3467,7 +3468,6 @@ static inline bool mas_push_data(struct ma_state *mas, int height, */ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) { @@ -27729,7 +29552,16 @@ index 5a976393c9ae..b95652b79b55 100644 struct maple_subtree_state mast; int height = 0; unsigned char mid_split, split = 0; -@@ -3893,7 +3886,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) +@@ -3586,7 +3586,7 @@ static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, + * @b_node: The maple big node + * @end: The end of the data. + */ +-static inline int mas_commit_b_node(struct ma_wr_state *wr_mas, ++static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, + struct maple_big_node *b_node, unsigned char end) + { + struct maple_node *node; +@@ -3893,7 +3893,7 @@ static inline void *mtree_lookup_walk(struct ma_state *mas) goto dead_node; } while (!ma_is_leaf(type)); @@ -27738,7 +29570,7 @@ index 5a976393c9ae..b95652b79b55 100644 dead_node: mas_reset(mas); -@@ -4711,15 +4704,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, +@@ -4711,15 +4711,11 @@ static inline void *mas_next_nentry(struct ma_state *mas, static inline void mas_rewalk(struct ma_state *mas, unsigned long index) { @@ -27754,7 +29586,48 @@ index 5a976393c9ae..b95652b79b55 100644 } /* -@@ -5590,8 +5579,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, +@@ -5093,35 +5089,21 @@ static inline bool mas_rewind_node(struct ma_state *mas) + */ + static inline bool mas_skip_node(struct ma_state *mas) + { +- unsigned char slot, slot_count; +- unsigned long *pivots; +- enum maple_type mt; ++ if (mas_is_err(mas)) ++ return false; + +- mt = mte_node_type(mas->node); +- slot_count = mt_slots[mt] - 1; + do { + if (mte_is_root(mas->node)) { +- slot = mas->offset; +- if (slot > slot_count) { ++ if (mas->offset >= mas_data_end(mas)) { + mas_set_err(mas, -EBUSY); + return false; + } + } else { + mas_ascend(mas); +- slot = mas->offset; +- mt = mte_node_type(mas->node); +- slot_count = mt_slots[mt] - 1; + } +- } while (slot > slot_count); +- +- mas->offset = ++slot; +- pivots = ma_pivots(mas_mn(mas), mt); +- if (slot > 0) +- mas->min = pivots[slot - 1] + 1; +- +- if (slot <= slot_count) +- mas->max = pivots[slot]; ++ } while (mas->offset >= mas_data_end(mas)); + ++ mas->offset++; + return true; + } + +@@ -5590,8 +5572,8 @@ static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags, /* * mte_destroy_walk() - Free a tree or sub-tree. @@ -27765,7 +29638,7 @@ index 5a976393c9ae..b95652b79b55 100644 * * Must hold the write lock. */ -@@ -5620,7 +5609,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) +@@ -5620,7 +5602,6 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) mas_reset(wr_mas->mas); } } @@ -27773,7 +29646,15 @@ index 5a976393c9ae..b95652b79b55 100644 } /* Interface */ -@@ -5745,6 +5733,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) +@@ -5733,6 +5714,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) + mas_reset(mas); + return ret; + } ++EXPORT_SYMBOL_GPL(mas_preallocate); + + /* + * mas_destroy() - destroy a maple state. 
+@@ -5745,6 +5727,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) void mas_destroy(struct ma_state *mas) { struct maple_alloc *node; @@ -27781,7 +29662,7 @@ index 5a976393c9ae..b95652b79b55 100644 /* * When using mas_for_each() to insert an expected number of elements, -@@ -5767,14 +5756,20 @@ void mas_destroy(struct ma_state *mas) +@@ -5767,14 +5750,20 @@ void mas_destroy(struct ma_state *mas) } mas->mas_flags &= ~(MA_STATE_BULK|MA_STATE_PREALLOC); @@ -27806,7 +29687,7 @@ index 5a976393c9ae..b95652b79b55 100644 mas->alloc = NULL; } EXPORT_SYMBOL_GPL(mas_destroy); -@@ -6734,7 +6729,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, +@@ -6734,7 +6723,7 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, if (i < (MAPLE_RANGE64_SLOTS - 1)) last = node->pivot[i]; @@ -27815,7 +29696,7 @@ index 5a976393c9ae..b95652b79b55 100644 break; if (last == 0 && i > 0) break; -@@ -6841,7 +6836,7 @@ void mt_dump(const struct maple_tree *mt) +@@ -6841,7 +6830,7 @@ void mt_dump(const struct maple_tree *mt) if (!xa_is_node(entry)) mt_dump_entry(entry, 0, 0, 0); else if (entry) @@ -27848,7 +29729,7 @@ index bf04fec87f35..fb7c5f43fd2a 100644 case POSIX_FADV_DONTNEED: __filemap_fdatawrite_range(mapping, offset, endbyte, diff --git a/mm/memcontrol.c b/mm/memcontrol.c -index 73afff8062f9..7fe2f4f36cf4 100644 +index 2eee092f8f11..802d3868d097 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -477,6 +477,12 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid) @@ -27874,7 +29755,7 @@ index 73afff8062f9..7fe2f4f36cf4 100644 if (order > 0) return 0; -@@ -5382,6 +5391,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) +@@ -5386,6 +5395,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) if (unlikely(mem_cgroup_is_root(memcg))) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ); @@ -27882,7 +29763,7 @@ index 73afff8062f9..7fe2f4f36cf4 100644 return 0; offline_kmem: memcg_offline_kmem(memcg); -@@ -5413,6 +5423,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) +@@ -5417,6 +5427,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) memcg_offline_kmem(memcg); reparent_shrinker_deferred(memcg); wb_memcg_offline(memcg); @@ -27890,7 +29771,7 @@ index 73afff8062f9..7fe2f4f36cf4 100644 drain_all_stock(memcg); -@@ -5424,6 +5435,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) +@@ -5428,6 +5439,7 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); invalidate_reclaim_iterators(memcg); @@ -27924,10 +29805,10 @@ index f526b9152bef..4ad62eba3cb7 100644 static void lru_gen_exit_fault(void) diff --git a/mm/page_alloc.c b/mm/page_alloc.c -index 3bb3484563ed..d8c7750c5c92 100644 +index 3aec9a6a9cb7..6658cbf43f5d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c -@@ -7929,6 +7929,7 @@ static void __init free_area_init_node(int nid) +@@ -7943,6 +7943,7 @@ static void __init free_area_init_node(int nid) pgdat_set_deferred_range(pgdat); free_area_init_core(pgdat); @@ -27936,7 +29817,7 @@ index 3bb3484563ed..d8c7750c5c92 100644 static void __init free_area_init_memoryless_node(int nid) diff --git a/mm/rmap.c b/mm/rmap.c -index b616870a09be..7b9205cb7d87 100644 +index 3b45d049069e..c8701608bb0d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -823,25 +823,14 @@ static bool folio_referenced_one(struct folio *folio, @@ -28015,7 +29896,7 @@ index b616870a09be..7b9205cb7d87 
100644 *vm_flags = pra.vm_flags; diff --git a/mm/vmscan.c b/mm/vmscan.c -index 160acbbdf111..04a54656b6b7 100644 +index 160acbbdf111..1a8f3b1c0bad 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,8 @@ @@ -28285,6 +30166,15 @@ index 160acbbdf111..04a54656b6b7 100644 static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last) { int i; +@@ -3592,7 +3608,7 @@ static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq) + } + + /****************************************************************************** +- * refault feedback loop ++ * PID controller + ******************************************************************************/ + + /* @@ -3623,7 +3639,7 @@ struct ctrl_pos { static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, struct ctrl_pos *pos) @@ -29558,7 +31448,69 @@ index 160acbbdf111..04a54656b6b7 100644 while (!list_empty(head)) { bool success; -@@ -5545,7 +5814,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec, +@@ -5402,14 +5671,14 @@ static void lru_gen_change_state(bool enabled) + * sysfs interface + ******************************************************************************/ + +-static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++static ssize_t min_ttl_ms_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) + { +- return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); ++ return sysfs_emit(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl))); + } + + /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +-static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, +- const char *buf, size_t len) ++static ssize_t min_ttl_ms_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t len) + { + unsigned int msecs; + +@@ -5421,11 +5690,9 @@ static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, + return len; + } + +-static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR( +- min_ttl_ms, 0644, show_min_ttl, store_min_ttl +-); ++static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR_RW(min_ttl_ms); + +-static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) + { + unsigned int caps = 0; + +@@ -5442,7 +5709,7 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c + } + + /* see Documentation/admin-guide/mm/multigen_lru.rst for details */ +-static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, ++static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) + { + int i; +@@ -5469,9 +5736,7 @@ static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr, + return len; + } + +-static struct kobj_attribute lru_gen_enabled_attr = __ATTR( +- enabled, 0644, show_enabled, store_enabled +-); ++static struct kobj_attribute lru_gen_enabled_attr = __ATTR_RW(enabled); + + static struct attribute *lru_gen_attrs[] = { + &lru_gen_min_ttl_attr.attr, +@@ -5479,7 +5744,7 @@ static struct attribute *lru_gen_attrs[] = { + NULL + }; + +-static struct attribute_group lru_gen_attr_group = { ++static const struct attribute_group lru_gen_attr_group = { + .name = "lru_gen", + .attrs = lru_gen_attrs, + }; +@@ -5545,7 +5810,7 @@ static void lru_gen_seq_show_full(struct seq_file *m, struct 
lruvec *lruvec, int i; int type, tier; int hist = lru_hist_from_seq(seq); @@ -29567,7 +31519,7 @@ index 160acbbdf111..04a54656b6b7 100644 for (tier = 0; tier < MAX_NR_TIERS; tier++) { seq_printf(m, " %10d", tier); -@@ -5595,7 +5864,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) +@@ -5595,7 +5860,7 @@ static int lru_gen_seq_show(struct seq_file *m, void *v) unsigned long seq; bool full = !debugfs_real_fops(m->file)->write; struct lruvec *lruvec = v; @@ -29576,7 +31528,7 @@ index 160acbbdf111..04a54656b6b7 100644 int nid = lruvec_pgdat(lruvec)->node_id; struct mem_cgroup *memcg = lruvec_memcg(lruvec); DEFINE_MAX_SEQ(lruvec); -@@ -5692,7 +5961,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co +@@ -5692,7 +5957,7 @@ static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_co if (sc->nr_reclaimed >= nr_to_reclaim) return 0; @@ -29585,7 +31537,7 @@ index 160acbbdf111..04a54656b6b7 100644 return 0; cond_resched(); -@@ -5713,11 +5982,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, +@@ -5713,11 +5978,11 @@ static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq, if (!mem_cgroup_disabled()) { rcu_read_lock(); @@ -29600,7 +31552,7 @@ index 160acbbdf111..04a54656b6b7 100644 rcu_read_unlock(); if (!memcg) -@@ -5777,7 +6046,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, +@@ -5777,7 +6042,7 @@ static ssize_t lru_gen_seq_write(struct file *file, const char __user *src, set_task_reclaim_state(current, &sc.reclaim_state); flags = memalloc_noreclaim_save(); blk_start_plug(&plug); @@ -29609,7 +31561,7 @@ index 160acbbdf111..04a54656b6b7 100644 err = -ENOMEM; goto done; } -@@ -5849,7 +6118,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) +@@ -5849,7 +6114,7 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) { int i; int gen, type, zone; @@ -29618,7 +31570,7 @@ index 160acbbdf111..04a54656b6b7 100644 lrugen->max_seq = MIN_NR_GENS + 1; lrugen->enabled = lru_gen_enabled(); -@@ -5858,13 +6127,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) +@@ -5858,13 +6123,26 @@ void lru_gen_init_lruvec(struct lruvec *lruvec) lrugen->timestamps[i] = jiffies; for_each_gen_type_zone(gen, type, zone) @@ -29646,7 +31598,7 @@ index 160acbbdf111..04a54656b6b7 100644 void lru_gen_init_memcg(struct mem_cgroup *memcg) { INIT_LIST_HEAD(&memcg->mm_list.fifo); -@@ -5876,19 +6158,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) +@@ -5876,19 +6154,25 @@ void lru_gen_exit_memcg(struct mem_cgroup *memcg) int i; int nid; @@ -29673,7 +31625,7 @@ index 160acbbdf111..04a54656b6b7 100644 static int __init init_lru_gen(void) { -@@ -5915,6 +6203,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc +@@ -5915,6 +6199,10 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc { } @@ -29684,7 +31636,7 @@ index 160acbbdf111..04a54656b6b7 100644 #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) -@@ -5928,7 +6220,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +@@ -5928,7 +6216,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) bool proportional_reclaim; struct blk_plug plug; @@ -29693,7 +31645,7 @@ index 160acbbdf111..04a54656b6b7 100644 lru_gen_shrink_lruvec(lruvec, sc); return; } -@@ -6171,6 +6463,11 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) +@@ -6171,6 +6459,11 @@ static void 
shrink_node(pg_data_t *pgdat, struct scan_control *sc) struct lruvec *target_lruvec; bool reclaimable = false; @@ -29786,12 +31738,12 @@ index 81fa7ec2e66a..1f36bc1c5d36 100644 mn = mas_pop_node(&mas); MT_BUG_ON(mt, not_empty(mn)); -- -2.39.2 +2.40.0.rc2 -From bdbf1daa5eee87e0879e18f3a427259ff1840c98 Mon Sep 17 00:00:00 2001 +From d9e434e1093f450c71f9a327b2201f7bdcc75743 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Fri, 17 Feb 2023 13:41:20 +0100 -Subject: [PATCH 11/15] mm/kvm: lockless accessed bit harvest +Subject: [PATCH 11/16] mm/kvm: lockless accessed bit harvest TLDR ==== @@ -30687,7 +32639,7 @@ index d6c06e140277..521f71ad0467 100644 unsigned long address) { diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h -index 977be526c939..beece92ce62e 100644 +index 70bd7f55bdd2..0ddbf712708d 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -379,6 +379,7 @@ enum { @@ -30756,7 +32708,7 @@ index f45ff1b7626a..324799848fed 100644 unsigned long address) { diff --git a/mm/rmap.c b/mm/rmap.c -index 7b9205cb7d87..82e3a0be1ada 100644 +index c8701608bb0d..8ecbbadab752 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -822,12 +822,10 @@ static bool folio_referenced_one(struct folio *folio, @@ -30776,7 +32728,7 @@ index 7b9205cb7d87..82e3a0be1ada 100644 pvmw.pte)) referenced++; diff --git a/mm/vmscan.c b/mm/vmscan.c -index 04a54656b6b7..2fc436638dfe 100644 +index 1a8f3b1c0bad..ec0142165ce7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,6 +57,8 @@ @@ -31011,7 +32963,7 @@ index 04a54656b6b7..2fc436638dfe 100644 } /****************************************************************************** -@@ -5707,6 +5805,9 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c +@@ -5705,6 +5803,9 @@ static ssize_t enabled_show(struct kobject *kobj, struct kobj_attribute *attr, c if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) caps |= BIT(LRU_GEN_NONLEAF_YOUNG); @@ -31022,7 +32974,7 @@ index 04a54656b6b7..2fc436638dfe 100644 } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c -index 9c60384b5ae0..1b465df4a93d 100644 +index 07aae60288f9..a115a27b375e 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -875,6 +875,63 @@ static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn, @@ -31098,12 +33050,12 @@ index 9c60384b5ae0..1b465df4a93d 100644 .release = kvm_mmu_notifier_release, }; -- -2.39.2 +2.40.0.rc2 -From df63c6ca5ad19cda15524ce1f5fce0eed3dc9932 Mon Sep 17 00:00:00 2001 +From c63e61e48ac0d492af1918ba84350e07a5c95d17 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 13 Feb 2023 09:26:09 +0100 -Subject: [PATCH 12/15] objtool +Subject: [PATCH 12/16] objtool Signed-off-by: Peter Jung --- @@ -31112,13 +33064,13 @@ Signed-off-by: Peter Jung tools/objtool/Documentation/objtool.txt | 8 +++ tools/objtool/Makefile | 66 +++++++++++++++++-------- tools/objtool/builtin-check.c | 2 +- - tools/objtool/check.c | 9 ++++ + tools/objtool/check.c | 7 +++ tools/objtool/elf.c | 42 ++++++++-------- tools/objtool/include/objtool/builtin.h | 2 - tools/objtool/include/objtool/elf.h | 9 ++-- tools/objtool/include/objtool/special.h | 2 +- tools/objtool/special.c | 6 +-- - 11 files changed, 95 insertions(+), 54 deletions(-) + 11 files changed, 93 insertions(+), 54 deletions(-) diff --git a/tools/objtool/.gitignore b/tools/objtool/.gitignore index 14236db3677f..4faa4dd72f35 100644 @@ -31283,26 +33235,10 @@ index a4f39407bf59..7c175198d09f 100644 OPT_CALLBACK_OPTARG('h', "hacks", NULL, NULL, "jump_label,noinstr,skylake", "patch toolchain 
bugs/limitations", parse_hacks), OPT_BOOLEAN('i', "ibt", &opts.ibt, "validate and annotate IBT"), diff --git a/tools/objtool/check.c b/tools/objtool/check.c -index 4b7c8b33069e..0678ba04fe22 100644 +index ea1e7cdeb1b3..384b7df3fbb2 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c -@@ -688,6 +688,7 @@ static int create_static_call_sections(struct objtool_file *file) - if (strncmp(key_name, STATIC_CALL_TRAMP_PREFIX_STR, - STATIC_CALL_TRAMP_PREFIX_LEN)) { - WARN("static_call: trampoline name malformed: %s", key_name); -+ free(key_name); - return -1; - } - tmp = key_name + STATIC_CALL_TRAMP_PREFIX_LEN - STATIC_CALL_KEY_PREFIX_LEN; -@@ -697,6 +698,7 @@ static int create_static_call_sections(struct objtool_file *file) - if (!key_sym) { - if (!opts.module) { - WARN("static_call: can't find static_call_key symbol: %s", tmp); -+ free(key_name); - return -1; - } - -@@ -854,8 +856,15 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) +@@ -856,8 +856,15 @@ static int create_ibt_endbr_seal_sections(struct objtool_file *file) list_for_each_entry(insn, &file->endbr_list, call_node) { int *site = (int *)sec->data->d_buf + idx; @@ -31504,12 +33440,12 @@ index 9c8d827f69af..baa85c31526b 100644 unsigned int nr_entries; struct special_alt *alt; -- -2.39.2 +2.40.0.rc2 -From ebd62c969d7faaafed390dca325a64c1b7cbd982 Mon Sep 17 00:00:00 2001 +From 56bbff019101b84507c1e796512b1be6840c6eda Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 25 Feb 2023 14:41:53 +0100 -Subject: [PATCH 13/15] sched +Date: Fri, 3 Mar 2023 17:02:07 +0100 +Subject: [PATCH 13/16] sched Signed-off-by: Peter Jung --- @@ -33156,12 +35092,12 @@ index 771f8ddb7053..9e8bb6278604 100644 * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency -- -2.39.2 +2.40.0.rc2 -From 8b6571aa2c4ecca1ed8686c872fb37696788a043 Mon Sep 17 00:00:00 2001 +From e0cfd01287f19367a61351b05d43cf4471156ffd Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Mon, 6 Feb 2023 09:53:13 +0100 -Subject: [PATCH 14/15] zram +Subject: [PATCH 14/16] zram Signed-off-by: Peter Jung --- @@ -33649,12 +35585,12 @@ index c5254626f051..2afdbf76a1aa 100644 }; #endif -- -2.39.2 +2.40.0.rc2 -From 33909ccdec1819a9090548ad25426b2ba315de15 Mon Sep 17 00:00:00 2001 +From 02b507dfef3f09d3de2785ed80164e15c8ed7844 Mon Sep 17 00:00:00 2001 From: Peter Jung Date: Tue, 14 Feb 2023 22:02:09 +0100 -Subject: [PATCH 15/15] zstd import v1.5.4 +Subject: [PATCH 15/16] zstd import v1.5.4 Signed-off-by: Peter Jung --- @@ -45767,4 +47703,3518 @@ index f4ed952ed485..7d31518e9d5a 100644 EXPORT_SYMBOL(zstd_reset_dstream); -- -2.39.2 +2.40.0.rc2 + +From 16b77e5461b5cc96bf4476bde0fee2ecc25aca83 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 10 Mar 2023 19:28:54 +0100 +Subject: [PATCH 16/16] v4l2-core: add v4l2loopback + +Signed-off-by: Peter Jung +--- + drivers/media/v4l2-core/Kconfig | 5 + + drivers/media/v4l2-core/Makefile | 2 + + drivers/media/v4l2-core/v4l2loopback.c | 2906 +++++++++++++++++ + drivers/media/v4l2-core/v4l2loopback.h | 96 + + .../media/v4l2-core/v4l2loopback_formats.h | 445 +++ + 5 files changed, 3454 insertions(+) + create mode 100644 drivers/media/v4l2-core/v4l2loopback.c + create mode 100644 drivers/media/v4l2-core/v4l2loopback.h + create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h + +diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig +index 348559bc2468..32a46fcc751f 100644 +--- a/drivers/media/v4l2-core/Kconfig ++++ b/drivers/media/v4l2-core/Kconfig +@@ -40,6 
+40,11 @@ config VIDEO_TUNER + config V4L2_JPEG_HELPER + tristate + ++config V4L2_LOOPBACK ++ tristate "V4L2 loopback device" ++ help ++ V4L2 loopback device ++ + # Used by drivers that need v4l2-h264.ko + config V4L2_H264 + tristate +diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile +index 41d91bd10cf2..4de37a844f95 100644 +--- a/drivers/media/v4l2-core/Makefile ++++ b/drivers/media/v4l2-core/Makefile +@@ -32,6 +32,8 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o + obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o + obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o + ++obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o ++ + obj-$(CONFIG_VIDEOBUF_DMA_CONTIG) += videobuf-dma-contig.o + obj-$(CONFIG_VIDEOBUF_DMA_SG) += videobuf-dma-sg.o + obj-$(CONFIG_VIDEOBUF_GEN) += videobuf-core.o +diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c +new file mode 100644 +index 000000000000..2ab1f760cfb5 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.c +@@ -0,0 +1,2906 @@ ++/* -*- c-file-style: "linux" -*- */ ++/* ++ * v4l2loopback.c -- video4linux2 loopback driver ++ * ++ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com) ++ * Copyright (C) 2010-2019 IOhannes m zmoelnig (zmoelnig@iem.at) ++ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de) ++ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com) ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "v4l2loopback.h" ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 1) ++#define kstrtoul strict_strtoul ++#endif ++ ++#if defined(timer_setup) && defined(from_timer) ++#define HAVE_TIMER_SETUP ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) ++#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER ++#endif ++ ++#define V4L2LOOPBACK_VERSION_CODE \ ++ KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \ ++ V4L2LOOPBACK_VERSION_BUGFIX) ++ ++MODULE_DESCRIPTION("V4L2 loopback video device"); ++MODULE_AUTHOR("Vasily Levin, " ++ "IOhannes m zmoelnig ," ++ "Stefan Diewald," ++ "Anton Novikov" ++ "et al."); ++#ifdef SNAPSHOT_VERSION ++MODULE_VERSION(__stringify(SNAPSHOT_VERSION)); ++#else ++MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify( ++ V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX)); ++#endif ++MODULE_LICENSE("GPL"); ++ ++/* ++ * helpers ++ */ ++#define dprintk(fmt, args...) \ ++ do { \ ++ if (debug > 0) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++#define MARK() \ ++ do { \ ++ if (debug > 1) { \ ++ printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \ ++ __LINE__, __func__, task_pid_nr(current)); \ ++ } \ ++ } while (0) ++ ++#define dprintkrw(fmt, args...) \ ++ do { \ ++ if (debug > 2) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++/* TODO: Make sure that function is never interrupted. 
*/ ++static inline int mod_inc(int *number, int mod) ++{ ++ int result; ++ result = (*number + 1) % mod; ++ if (unlikely(result < 0)) ++ result += mod; ++ *number = result; ++ return result; ++} ++ ++static inline void v4l2l_get_timestamp(struct v4l2_buffer *b) ++{ ++ /* ktime_get_ts is considered deprecated, so use ktime_get_ts64 if possible */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 17, 0) ++ struct timespec ts; ++ ktime_get_ts(&ts); ++#else ++ struct timespec64 ts; ++ ktime_get_ts64(&ts); ++#endif ++ ++ b->timestamp.tv_sec = ts.tv_sec; ++ b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC); ++} ++ ++#if !defined(__poll_t) ++typedef unsigned __poll_t; ++#endif ++ ++/* module constants ++ * can be overridden during he build process using something like ++ * make KCPPFLAGS="-DMAX_DEVICES=100" ++ */ ++ ++/* maximum number of v4l2loopback devices that can be created */ ++#ifndef MAX_DEVICES ++#define MAX_DEVICES 8 ++#endif ++ ++/* whether the default is to announce capabilities exclusively or not */ ++#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0 ++#endif ++ ++/* when a producer is considered to have gone stale */ ++#ifndef MAX_TIMEOUT ++#define MAX_TIMEOUT (100 * 1000) /* in msecs */ ++#endif ++ ++/* max buffers that can be mapped, actually they ++ * are all mapped to max_buffers buffers */ ++#ifndef MAX_BUFFERS ++#define MAX_BUFFERS 32 ++#endif ++ ++/* module parameters */ ++static int debug = 0; ++module_param(debug, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)"); ++ ++#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2 ++static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS; ++module_param(max_buffers, int, S_IRUGO); ++MODULE_PARM_DESC(max_buffers, ++ "how many buffers should be allocated [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]"); ++ ++/* how many times a device can be opened ++ * the per-module default value can be overridden on a per-device basis using ++ * the /sys/devices interface ++ * ++ * note that max_openers should be at least 2 in order to get a working system: ++ * one opener for the producer and one opener for the consumer ++ * however, we leave that to the user ++ */ ++#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10 ++static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS; ++module_param(max_openers, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC( ++ max_openers, ++ "how many users can open the loopback device [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]"); ++ ++static int devices = -1; ++module_param(devices, int, 0); ++MODULE_PARM_DESC(devices, "how many devices should be created"); ++ ++static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 }; ++module_param_array(video_nr, int, NULL, 0444); ++MODULE_PARM_DESC(video_nr, ++ "video device numbers (-1=auto, 0=/dev/video0, etc.)"); ++ ++static char *card_label[MAX_DEVICES]; ++module_param_array(card_label, charp, NULL, 0000); ++MODULE_PARM_DESC(card_label, "card labels for each device"); ++ ++static bool exclusive_caps[MAX_DEVICES] = { ++ [0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++}; ++module_param_array(exclusive_caps, bool, NULL, 0444); ++/* FIXXME: wording */ ++MODULE_PARM_DESC( ++ exclusive_caps, ++ "whether to announce OUTPUT/CAPTURE capabilities exclusively or not [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]"); ++ ++/* format specifications */ ++#define V4L2LOOPBACK_SIZE_MIN_WIDTH 48 ++#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 32 
++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192 ++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192 ++ ++#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640 ++#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480 ++ ++static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++module_param(max_width, int, S_IRUGO); ++MODULE_PARM_DESC(max_width, ++ "maximum allowed frame width [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]"); ++static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++module_param(max_height, int, S_IRUGO); ++MODULE_PARM_DESC(max_height, ++ "maximum allowed frame height [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]"); ++ ++static DEFINE_IDR(v4l2loopback_index_idr); ++static DEFINE_MUTEX(v4l2loopback_ctl_mutex); ++ ++/* frame intervals */ ++#define V4L2LOOPBACK_FPS_MIN 0 ++#define V4L2LOOPBACK_FPS_MAX 1000 ++ ++/* control IDs */ ++#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000) ++#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0) ++#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1) ++#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2) ++#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3) ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl); ++static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = { ++ .s_ctrl = v4l2loopback_s_ctrl, ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_KEEP_FORMAT, ++ .name = "keep_format", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_SUSTAIN_FRAMERATE, ++ .name = "sustain_framerate", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT, ++ .name = "timeout", ++ .type = V4L2_CTRL_TYPE_INTEGER, ++ .min = 0, ++ .max = MAX_TIMEOUT, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT_IMAGE_IO, ++ .name = "timeout_image_io", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++ ++/* module structures */ ++struct v4l2loopback_private { ++ int device_nr; ++}; ++ ++/* TODO(vasaka) use typenames which are common to kernel, but first find out if ++ * it is needed */ ++/* struct keeping state and settings of loopback device */ ++ ++struct v4l2l_buffer { ++ struct v4l2_buffer buffer; ++ struct list_head list_head; ++ int use_count; ++}; ++ ++struct v4l2_loopback_device { ++ struct v4l2_device v4l2_dev; ++ struct v4l2_ctrl_handler ctrl_handler; ++ struct video_device *vdev; ++ /* pixel and stream format */ ++ struct v4l2_pix_format pix_format; ++ struct v4l2_captureparm capture_param; ++ unsigned long frame_jiffies; ++ ++ /* ctrls */ ++ int keep_format; /* CID_KEEP_FORMAT; stay ready_for_capture even when all ++ openers close() the device */ ++ int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain ++ (close to) nominal framerate */ ++ ++ /* buffers stuff */ ++ u8 *image; /* pointer to actual buffers data */ ++ unsigned long int imagesize; 
/* size of buffers data */ ++ int buffers_number; /* should not be big, 4 is a good choice */ ++ struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */ ++ int used_buffers; /* number of the actually used buffers */ ++ int max_openers; /* how many times can this device be opened */ ++ ++ int write_position; /* number of last written frame + 1 */ ++ struct list_head outbufs_list; /* buffers in output DQBUF order */ ++ int bufpos2index ++ [MAX_BUFFERS]; /* mapping of (read/write_position % used_buffers) ++ * to inner buffer index */ ++ long buffer_size; ++ ++ /* sustain_framerate stuff */ ++ struct timer_list sustain_timer; ++ unsigned int reread_count; ++ ++ /* timeout stuff */ ++ unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */ ++ int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will ++ * read/write to timeout_image */ ++ u8 *timeout_image; /* copy of it will be captured when timeout passes */ ++ struct v4l2l_buffer timeout_image_buffer; ++ struct timer_list timeout_timer; ++ int timeout_happened; ++ ++ /* sync stuff */ ++ atomic_t open_count; ++ ++ int ready_for_capture; /* set to the number of writers that opened the ++ * device and negotiated format. */ ++ int ready_for_output; /* set to true when no writer is currently attached ++ * this differs slightly from !ready_for_capture, ++ * e.g. when using fallback images */ ++ int active_readers; /* increase if any reader starts streaming */ ++ int announce_all_caps; /* set to false, if device caps (OUTPUT/CAPTURE) ++ * should only be announced if the resp. "ready" ++ * flag is set; default=TRUE */ ++ ++ int max_width; ++ int max_height; ++ ++ char card_label[32]; ++ ++ wait_queue_head_t read_event; ++ spinlock_t lock; ++}; ++ ++/* types of opener shows what opener wants to do with loopback */ ++enum opener_type { ++ // clang-format off ++ UNNEGOTIATED = 0, ++ READER = 1, ++ WRITER = 2, ++ // clang-format on ++}; ++ ++/* struct keeping state and type of opener */ ++struct v4l2_loopback_opener { ++ enum opener_type type; ++ int read_position; /* number of last processed frame + 1 or ++ * write_position - 1 if reader went out of sync */ ++ unsigned int reread_count; ++ struct v4l2_buffer *buffers; ++ int buffers_number; /* should not be big, 4 is a good choice */ ++ int timeout_image_io; ++ ++ struct v4l2_fh fh; ++}; ++ ++#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh) ++ ++/* this is heavily inspired by the bttv driver found in the linux kernel */ ++struct v4l2l_format { ++ char *name; ++ int fourcc; /* video4linux 2 */ ++ int depth; /* bit/pixel */ ++ int flags; ++}; ++/* set the v4l2l_format.flags to PLANAR for non-packed formats */ ++#define FORMAT_FLAGS_PLANAR 0x01 ++#define FORMAT_FLAGS_COMPRESSED 0x02 ++ ++#include "v4l2loopback_formats.h" ++ ++static const unsigned int FORMATS = ARRAY_SIZE(formats); ++ ++static char *fourcc2str(unsigned int fourcc, char buf[4]) ++{ ++ buf[0] = (fourcc >> 0) & 0xFF; ++ buf[1] = (fourcc >> 8) & 0xFF; ++ buf[2] = (fourcc >> 16) & 0xFF; ++ buf[3] = (fourcc >> 24) & 0xFF; ++ ++ return buf; ++} ++ ++static const struct v4l2l_format *format_by_fourcc(int fourcc) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < FORMATS; i++) { ++ if (formats[i].fourcc == fourcc) ++ return formats + i; ++ } ++ ++ dprintk("unsupported format '%c%c%c%c'\n", (fourcc >> 0) & 0xFF, ++ (fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF, ++ (fourcc >> 24) & 0xFF); ++ return NULL; ++} ++ ++static void pix_format_set_size(struct v4l2_pix_format *f, ++ const struct v4l2l_format 
*fmt, ++ unsigned int width, unsigned int height) ++{ ++ f->width = width; ++ f->height = height; ++ ++ if (fmt->flags & FORMAT_FLAGS_PLANAR) { ++ f->bytesperline = width; /* Y plane */ ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) { ++ /* doesn't make sense for compressed formats */ ++ f->bytesperline = 0; ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else { ++ f->bytesperline = (width * fmt->depth) >> 3; ++ f->sizeimage = height * f->bytesperline; ++ } ++} ++ ++static int set_timeperframe(struct v4l2_loopback_device *dev, ++ struct v4l2_fract *tpf) ++{ ++ if ((tpf->denominator < 1) || (tpf->numerator < 1)) { ++ return -EINVAL; ++ } ++ dev->capture_param.timeperframe = *tpf; ++ dev->frame_jiffies = max(1UL, msecs_to_jiffies(1000) * tpf->numerator / ++ tpf->denominator); ++ return 0; ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd); ++ ++/* device attributes */ ++/* available via sysfs: /sys/devices/virtual/video4linux/video* */ ++ ++static ssize_t attr_show_format(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ /* gets the current format as "FOURCC:WxH@f/s", e.g. "YUYV:320x240@1000/30" */ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ const struct v4l2_fract *tpf; ++ char buf4cc[5], buf_fps[32]; ++ ++ if (!dev || !dev->ready_for_capture) ++ return 0; ++ tpf = &dev->capture_param.timeperframe; ++ ++ fourcc2str(dev->pix_format.pixelformat, buf4cc); ++ buf4cc[4] = 0; ++ if (tpf->numerator == 1) ++ snprintf(buf_fps, sizeof(buf_fps), "%d", tpf->denominator); ++ else ++ snprintf(buf_fps, sizeof(buf_fps), "%d/%d", tpf->denominator, ++ tpf->numerator); ++ return sprintf(buf, "%4s:%dx%d@%s\n", buf4cc, dev->pix_format.width, ++ dev->pix_format.height, buf_fps); ++} ++ ++static ssize_t attr_store_format(struct device *cd, ++ struct device_attribute *attr, const char *buf, ++ size_t len) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ int fps_num = 0, fps_den = 1; ++ ++ if (!dev) ++ return -ENODEV; ++ ++ /* only fps changing is supported */ ++ if (sscanf(buf, "@%d/%d", &fps_num, &fps_den) > 0) { ++ struct v4l2_fract f = { .numerator = fps_den, ++ .denominator = fps_num }; ++ int err = 0; ++ if ((err = set_timeperframe(dev, &f)) < 0) ++ return err; ++ return len; ++ } ++ return -EINVAL; ++} ++ ++static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format, ++ attr_store_format); ++ ++static ssize_t attr_show_buffers(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, "%d\n", dev->used_buffers); ++} ++ ++static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL); ++ ++static ssize_t attr_show_maxopeners(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, "%d\n", dev->max_openers); ++} ++ ++static ssize_t attr_store_maxopeners(struct device *cd, ++ struct device_attribute *attr, ++ const char *buf, size_t len) ++{ ++ struct v4l2_loopback_device *dev = NULL; ++ unsigned long curr = 0; ++ ++ if (kstrtoul(buf, 0, &curr)) ++ return -EINVAL; ++ ++ dev = v4l2loopback_cd2dev(cd); ++ if (!dev) ++ return -ENODEV; ++ ++ if (dev->max_openers == curr) ++ return len; ++ ++ if (curr > __INT_MAX__ || dev->open_count.counter > curr) { ++ /* request to limit to less 
openers as are currently attached to us */ ++ return -EINVAL; ++ } ++ ++ dev->max_openers = (int)curr; ++ ++ return len; ++} ++ ++static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners, ++ attr_store_maxopeners); ++ ++static void v4l2loopback_remove_sysfs(struct video_device *vdev) ++{ ++#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x) ++ ++ if (vdev) { ++ V4L2_SYSFS_DESTROY(format); ++ V4L2_SYSFS_DESTROY(buffers); ++ V4L2_SYSFS_DESTROY(max_openers); ++ /* ... */ ++ } ++} ++ ++static void v4l2loopback_create_sysfs(struct video_device *vdev) ++{ ++ int res = 0; ++ ++#define V4L2_SYSFS_CREATE(x) \ ++ res = device_create_file(&vdev->dev, &dev_attr_##x); \ ++ if (res < 0) \ ++ break ++ if (!vdev) ++ return; ++ do { ++ V4L2_SYSFS_CREATE(format); ++ V4L2_SYSFS_CREATE(buffers); ++ V4L2_SYSFS_CREATE(max_openers); ++ /* ... */ ++ } while (0); ++ ++ if (res >= 0) ++ return; ++ dev_err(&vdev->dev, "%s error: %d\n", __func__, res); ++} ++ ++/* Event APIs */ ++ ++#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START) ++#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000 ++#define V4L2_EVENT_PRI_CLIENT_USAGE \ ++ (V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1) ++ ++struct v4l2_event_client_usage { ++ __u32 count; ++}; ++ ++/* global module data */ ++/* find a device based on it's device-number (e.g. '3' for /dev/video3) */ ++struct v4l2loopback_lookup_cb_data { ++ int device_nr; ++ struct v4l2_loopback_device *device; ++}; ++static int v4l2loopback_lookup_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *device = ptr; ++ struct v4l2loopback_lookup_cb_data *cbdata = data; ++ if (cbdata && device && device->vdev) { ++ if (device->vdev->num == cbdata->device_nr) { ++ cbdata->device = device; ++ cbdata->device_nr = id; ++ return 1; ++ } ++ } ++ return 0; ++} ++static int v4l2loopback_lookup(int device_nr, ++ struct v4l2_loopback_device **device) ++{ ++ struct v4l2loopback_lookup_cb_data data = { ++ .device_nr = device_nr, ++ .device = NULL, ++ }; ++ int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb, ++ &data); ++ if (1 == err) { ++ if (device) ++ *device = data.device; ++ return data.device_nr; ++ } ++ return -ENODEV; ++} ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd) ++{ ++ struct video_device *loopdev = to_video_device(cd); ++ struct v4l2loopback_private *ptr = ++ (struct v4l2loopback_private *)video_get_drvdata(loopdev); ++ int nr = ptr->device_nr; ++ ++ return idr_find(&v4l2loopback_index_idr, nr); ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f) ++{ ++ struct v4l2loopback_private *ptr = video_drvdata(f); ++ int nr = ptr->device_nr; ++ ++ return idr_find(&v4l2loopback_index_idr, nr); ++} ++ ++/* forward declarations */ ++static void client_usage_queue_event(struct video_device *vdev); ++static void init_buffers(struct v4l2_loopback_device *dev); ++static int allocate_buffers(struct v4l2_loopback_device *dev); ++static void free_buffers(struct v4l2_loopback_device *dev); ++static void try_free_buffers(struct v4l2_loopback_device *dev); ++static int allocate_timeout_image(struct v4l2_loopback_device *dev); ++static void check_timers(struct v4l2_loopback_device *dev); ++static const struct v4l2_file_operations v4l2_loopback_fops; ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops; ++ ++/* Queue helpers */ ++/* next functions sets buffer flags and adjusts counters accordingly */ ++static inline void set_done(struct v4l2l_buffer *buffer) ++{ 
++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; ++ buffer->buffer.flags |= V4L2_BUF_FLAG_DONE; ++} ++ ++static inline void set_queued(struct v4l2l_buffer *buffer) ++{ ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; ++ buffer->buffer.flags |= V4L2_BUF_FLAG_QUEUED; ++} ++ ++static inline void unset_flags(struct v4l2l_buffer *buffer) ++{ ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; ++} ++ ++/* V4L2 ioctl caps and params calls */ ++/* returns device capabilities ++ * called on VIDIOC_QUERYCAP ++ */ ++static int vidioc_querycap(struct file *file, void *priv, ++ struct v4l2_capability *cap) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ int device_nr = ++ ((struct v4l2loopback_private *)video_get_drvdata(dev->vdev)) ++ ->device_nr; ++ __u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; ++ ++ strlcpy(cap->driver, "v4l2 loopback", sizeof(cap->driver)); ++ snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label); ++ snprintf(cap->bus_info, sizeof(cap->bus_info), ++ "platform:v4l2loopback-%03d", device_nr); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 1, 0) ++ /* since 3.1.0, the v4l2-core system is supposed to set the version */ ++ cap->version = V4L2LOOPBACK_VERSION_CODE; ++#endif ++ ++ if (dev->announce_all_caps) { ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT; ++ } else { ++ if (dev->ready_for_capture) { ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE; ++ } ++ if (dev->ready_for_output) { ++ capabilities |= V4L2_CAP_VIDEO_OUTPUT; ++ } ++ } ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ dev->vdev->device_caps = ++#endif /* >=linux-4.7.0 */ ++ cap->device_caps = cap->capabilities = capabilities; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 3, 0) ++ cap->capabilities |= V4L2_CAP_DEVICE_CAPS; ++#endif ++ ++ memset(cap->reserved, 0, sizeof(cap->reserved)); ++ return 0; ++} ++ ++static int vidioc_enum_framesizes(struct file *file, void *fh, ++ struct v4l2_frmsizeenum *argp) ++{ ++ struct v4l2_loopback_device *dev; ++ ++ /* there can be only one... */ ++ if (argp->index) ++ return -EINVAL; ++ ++ dev = v4l2loopback_getdevice(file); ++ if (dev->ready_for_capture) { ++ /* format has already been negotiated ++ * cannot change during runtime ++ */ ++ if (argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; ++ ++ argp->discrete.width = dev->pix_format.width; ++ argp->discrete.height = dev->pix_format.height; ++ } else { ++ /* if the format has not been negotiated yet, we accept anything ++ */ ++ if (NULL == format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS; ++ ++ argp->stepwise.min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH; ++ argp->stepwise.min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT; ++ ++ argp->stepwise.max_width = dev->max_width; ++ argp->stepwise.max_height = dev->max_height; ++ ++ argp->stepwise.step_width = 1; ++ argp->stepwise.step_height = 1; ++ } ++ return 0; ++} ++ ++/* returns frameinterval (fps) for the set resolution ++ * called on VIDIOC_ENUM_FRAMEINTERVALS ++ */ ++static int vidioc_enum_frameintervals(struct file *file, void *fh, ++ struct v4l2_frmivalenum *argp) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ ++ /* there can be only one... 
*/ ++ if (argp->index) ++ return -EINVAL; ++ ++ if (dev->ready_for_capture) { ++ if (argp->width != dev->pix_format.width || ++ argp->height != dev->pix_format.height || ++ argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_DISCRETE; ++ argp->discrete = dev->capture_param.timeperframe; ++ } else { ++ if (argp->width < V4L2LOOPBACK_SIZE_MIN_WIDTH || ++ argp->width > max_width || ++ argp->height < V4L2LOOPBACK_SIZE_MIN_HEIGHT || ++ argp->height > max_height || ++ NULL == format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS; ++ argp->stepwise.min.numerator = 1; ++ argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX; ++ argp->stepwise.max.numerator = 1; ++ argp->stepwise.max.denominator = V4L2LOOPBACK_FPS_MIN; ++ argp->stepwise.step.numerator = 1; ++ argp->stepwise.step.denominator = 1; ++ } ++ ++ return 0; ++} ++ ++/* ------------------ CAPTURE ----------------------- */ ++ ++/* returns device formats ++ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_enum_fmt_cap(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (f->index) ++ return -EINVAL; ++ if (dev->ready_for_capture) { ++ const __u32 format = dev->pix_format.pixelformat; ++ ++ snprintf(f->description, sizeof(f->description), "[%c%c%c%c]", ++ (format >> 0) & 0xFF, (format >> 8) & 0xFF, ++ (format >> 16) & 0xFF, (format >> 24) & 0xFF); ++ ++ f->pixelformat = dev->pix_format.pixelformat; ++ } else { ++ return -EINVAL; ++ } ++ f->flags = 0; ++ MARK(); ++ return 0; ++} ++ ++/* returns current video format ++ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_g_fmt_cap(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (!dev->ready_for_capture) ++ return -EINVAL; ++ ++ fmt->fmt.pix = dev->pix_format; ++ MARK(); ++ return 0; ++} ++ ++/* checks if it is OK to change to format fmt; ++ * actual check is done by inner_try_fmt_cap ++ * just checking that pixelformat is OK and set other parameters, app should ++ * obey this decision ++ * called on VIDIOC_TRY_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_try_fmt_cap(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ char buf[5]; ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (0 == dev->ready_for_capture) { ++ dprintk("setting fmt_cap not possible yet\n"); ++ return -EBUSY; ++ } ++ ++ if (fmt->fmt.pix.pixelformat != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ fmt->fmt.pix = dev->pix_format; ++ ++ buf[4] = 0; ++ dprintk("capFOURCC=%s\n", fourcc2str(dev->pix_format.pixelformat, buf)); ++ return 0; ++} ++ ++/* sets new output format, if possible ++ * actually format is set by input and we even do not check it, just return ++ * current one, but it is possible to set subregions of input TODO(vasaka) ++ * called on VIDIOC_S_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_s_fmt_cap(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return vidioc_try_fmt_cap(file, priv, fmt); ++} ++ ++/* ------------------ OUTPUT ----------------------- */ ++ ++/* returns device formats; ++ * LATER: allow all formats ++ * called on 
VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_enum_fmt_out(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ struct v4l2_loopback_device *dev; ++ const struct v4l2l_format *fmt; ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (dev->ready_for_capture) { ++ const __u32 format = dev->pix_format.pixelformat; ++ ++ /* format has been fixed by the writer, so only one single format is supported */ ++ if (f->index) ++ return -EINVAL; ++ ++ fmt = format_by_fourcc(format); ++ if (NULL == fmt) ++ return -EINVAL; ++ ++ /* f->flags = ??; */ ++ snprintf(f->description, sizeof(f->description), "%s", ++ fmt->name); ++ ++ f->pixelformat = dev->pix_format.pixelformat; ++ } else { ++ /* fill in a dummy format */ ++ /* coverity[unsigned_compare] */ ++ if (f->index < 0 || f->index >= FORMATS) ++ return -EINVAL; ++ ++ fmt = &formats[f->index]; ++ ++ f->pixelformat = fmt->fourcc; ++ snprintf(f->description, sizeof(f->description), "%s", ++ fmt->name); ++ } ++ f->flags = 0; ++ ++ return 0; ++} ++ ++/* returns current video format format fmt */ ++/* NOTE: this is called from the producer ++ * so if format has not been negotiated yet, ++ * it should return ALL of available formats, ++ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_g_fmt_out(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ /* ++ * LATER: this should return the currently valid format ++ * gstreamer doesn't like it, if this returns -EINVAL, as it ++ * then concludes that there is _no_ valid format ++ * CHECK whether this assumption is wrong, ++ * or whether we have to always provide a valid format ++ */ ++ ++ fmt->fmt.pix = dev->pix_format; ++ return 0; ++} ++ ++/* checks if it is OK to change to format fmt; ++ * if format is negotiated do not change it ++ * called on VIDIOC_TRY_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_try_fmt_out(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ /* TODO(vasaka) loopback does not care about formats writer want to set, ++ * maybe it is a good idea to restrict format somehow */ ++ if (dev->ready_for_capture) { ++ fmt->fmt.pix = dev->pix_format; ++ } else { ++ __u32 w = fmt->fmt.pix.width; ++ __u32 h = fmt->fmt.pix.height; ++ __u32 pixfmt = fmt->fmt.pix.pixelformat; ++ const struct v4l2l_format *format = format_by_fourcc(pixfmt); ++ ++ if (w > dev->max_width) ++ w = dev->max_width; ++ if (h > dev->max_height) ++ h = dev->max_height; ++ ++ dprintk("trying image %dx%d\n", w, h); ++ ++ if (w < 1) ++ w = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; ++ ++ if (h < 1) ++ h = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; ++ ++ if (NULL == format) ++ format = &formats[0]; ++ ++ pix_format_set_size(&fmt->fmt.pix, format, w, h); ++ ++ fmt->fmt.pix.pixelformat = format->fourcc; ++ ++ if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) || ++ (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3)) ++ fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; ++ ++ if (V4L2_FIELD_ANY == fmt->fmt.pix.field) ++ fmt->fmt.pix.field = V4L2_FIELD_NONE; ++ ++ /* FIXXME: try_fmt should never modify the device-state */ ++ dev->pix_format = fmt->fmt.pix; ++ } ++ return 0; ++} ++ ++/* sets new output format, if possible; ++ * allocate data here because we do not know if it will be streaming or ++ * 
read/write IO ++ * called on VIDIOC_S_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_s_fmt_out(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ char buf[5]; ++ int ret; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ret = vidioc_try_fmt_out(file, priv, fmt); ++ ++ dprintk("s_fmt_out(%d) %d...%d\n", ret, dev->ready_for_capture, ++ dev->pix_format.sizeimage); ++ ++ buf[4] = 0; ++ dprintk("outFOURCC=%s\n", fourcc2str(dev->pix_format.pixelformat, buf)); ++ ++ if (ret < 0) ++ return ret; ++ ++ if (!dev->ready_for_capture) { ++ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); ++ fmt->fmt.pix.sizeimage = dev->buffer_size; ++ ret = allocate_buffers(dev); ++ } ++ return ret; ++} ++ ++// #define V4L2L_OVERLAY ++#ifdef V4L2L_OVERLAY ++/* ------------------ OVERLAY ----------------------- */ ++/* currently unsupported */ ++/* GSTreamer's v4l2sink is buggy, as it requires the overlay to work ++ * while it should only require it, if overlay is requested ++ * once the gstreamer element is fixed, remove the overlay dummies ++ */ ++#warning OVERLAY dummies ++static int vidioc_g_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++ ++static int vidioc_s_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++#endif /* V4L2L_OVERLAY */ ++ ++/* ------------------ PARAMs ----------------------- */ ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_G_PARM ++ */ ++static int vidioc_g_parm(struct file *file, void *priv, ++ struct v4l2_streamparm *parm) ++{ ++ /* do not care about type of opener, hope these enums would always be ++ * compatible */ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_S_PARM ++ */ ++static int vidioc_s_parm(struct file *file, void *priv, ++ struct v4l2_streamparm *parm) ++{ ++ struct v4l2_loopback_device *dev; ++ int err = 0; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ dprintk("vidioc_s_parm called frate=%d/%d\n", ++ parm->parm.capture.timeperframe.numerator, ++ parm->parm.capture.timeperframe.denominator); ++ ++ switch (parm->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if ((err = set_timeperframe( ++ dev, &parm->parm.capture.timeperframe)) < 0) ++ return err; ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if ((err = set_timeperframe( ++ dev, &parm->parm.capture.timeperframe)) < 0) ++ return err; ++ break; ++ default: ++ return -1; ++ } ++ ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++/* sets a tv standard, actually we do not need to handle this any special way ++ * added to support effecttv ++ * called on VIDIOC_S_STD ++ */ ++static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std) ++{ ++ v4l2_std_id req_std = 0, supported_std = 0; ++ const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0; ++ ++ if (_std) { ++ req_std = *_std; ++ *_std = all_std; ++ } ++ ++ /* we support everything in V4L2_STD_ALL, but not more... 
*/ ++ supported_std = (all_std & req_std); ++ if (no_std == supported_std) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* gets a fake video standard ++ * called on VIDIOC_G_STD ++ */ ++static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++/* gets a fake video standard ++ * called on VIDIOC_QUERYSTD ++ */ ++static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id, ++ s64 val) ++{ ++ switch (id) { ++ case CID_KEEP_FORMAT: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ dev->keep_format = val; ++ try_free_buffers( ++ dev); /* will only free buffers if !keep_format */ ++ break; ++ case CID_SUSTAIN_FRAMERATE: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ spin_lock_bh(&dev->lock); ++ dev->sustain_framerate = val; ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ break; ++ case CID_TIMEOUT: ++ if (val < 0 || val > MAX_TIMEOUT) ++ return -EINVAL; ++ spin_lock_bh(&dev->lock); ++ dev->timeout_jiffies = msecs_to_jiffies(val); ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ allocate_timeout_image(dev); ++ break; ++ case CID_TIMEOUT_IMAGE_IO: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ dev->timeout_image_io = val; ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl) ++{ ++ struct v4l2_loopback_device *dev = container_of( ++ ctrl->handler, struct v4l2_loopback_device, ctrl_handler); ++ return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val); ++} ++ ++/* returns set of device outputs, in our case there is only one ++ * called on VIDIOC_ENUMOUTPUT ++ */ ++static int vidioc_enum_output(struct file *file, void *fh, ++ struct v4l2_output *outp) ++{ ++ __u32 index = outp->index; ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ MARK(); ++ ++ if (!dev->announce_all_caps && !dev->ready_for_output) ++ return -ENOTTY; ++ ++ if (0 != index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(outp, 0, sizeof(*outp)); ++ ++ outp->index = index; ++ strlcpy(outp->name, "loopback in", sizeof(outp->name)); ++ outp->type = V4L2_OUTPUT_TYPE_ANALOG; ++ outp->audioset = 0; ++ outp->modulator = 0; ++#ifdef V4L2LOOPBACK_WITH_STD ++ outp->std = V4L2_STD_ALL; ++#ifdef V4L2_OUT_CAP_STD ++ outp->capabilities |= V4L2_OUT_CAP_STD; ++#endif /* V4L2_OUT_CAP_STD */ ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ return 0; ++} ++ ++/* which output is currently active, ++ * called on VIDIOC_G_OUTPUT ++ */ ++static int vidioc_g_output(struct file *file, void *fh, unsigned int *i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_output) ++ return -ENOTTY; ++ if (i) ++ *i = 0; ++ return 0; ++} ++ ++/* set output, can make sense if we have more than one video src, ++ * called on VIDIOC_S_OUTPUT ++ */ ++static int vidioc_s_output(struct file *file, void *fh, unsigned int i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_output) ++ return -ENOTTY; ++ ++ if (i) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* returns set of device inputs, in our case there is only one, ++ * but later I may add more ++ * called on VIDIOC_ENUMINPUT ++ */ ++static int vidioc_enum_input(struct file *file, void 
*fh, ++ struct v4l2_input *inp) ++{ ++ __u32 index = inp->index; ++ MARK(); ++ ++ if (0 != index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(inp, 0, sizeof(*inp)); ++ ++ inp->index = index; ++ strlcpy(inp->name, "loopback", sizeof(inp->name)); ++ inp->type = V4L2_INPUT_TYPE_CAMERA; ++ inp->audioset = 0; ++ inp->tuner = 0; ++ inp->status = 0; ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ inp->std = V4L2_STD_ALL; ++#ifdef V4L2_IN_CAP_STD ++ inp->capabilities |= V4L2_IN_CAP_STD; ++#endif ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ return 0; ++} ++ ++/* which input is currently active, ++ * called on VIDIOC_G_INPUT ++ */ ++static int vidioc_g_input(struct file *file, void *fh, unsigned int *i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_capture) ++ return -ENOTTY; ++ if (i) ++ *i = 0; ++ return 0; ++} ++ ++/* set input, can make sense if we have more than one video src, ++ * called on VIDIOC_S_INPUT ++ */ ++static int vidioc_s_input(struct file *file, void *fh, unsigned int i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_capture) ++ return -ENOTTY; ++ if (i == 0) ++ return 0; ++ return -EINVAL; ++} ++ ++/* --------------- V4L2 ioctl buffer related calls ----------------- */ ++ ++/* negotiate buffer type ++ * only mmap streaming supported ++ * called on VIDIOC_REQBUFS ++ */ ++static int vidioc_reqbufs(struct file *file, void *fh, ++ struct v4l2_requestbuffers *b) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ int i; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ dprintk("reqbufs: %d\t%d=%d\n", b->memory, b->count, ++ dev->buffers_number); ++ if (opener->timeout_image_io) { ++ if (b->memory != V4L2_MEMORY_MMAP) ++ return -EINVAL; ++ b->count = 1; ++ return 0; ++ } ++ ++ init_buffers(dev); ++ switch (b->memory) { ++ case V4L2_MEMORY_MMAP: ++ /* do nothing here, buffers are always allocated */ ++ if (b->count < 1 || dev->buffers_number < 1) ++ return 0; ++ ++ if (b->count > dev->buffers_number) ++ b->count = dev->buffers_number; ++ ++ /* make sure that outbufs_list contains buffers from 0 to used_buffers-1 ++ * actually, it will have been already populated via v4l2_loopback_init() ++ * at this point */ ++ if (list_empty(&dev->outbufs_list)) { ++ for (i = 0; i < dev->used_buffers; ++i) ++ list_add_tail(&dev->buffers[i].list_head, ++ &dev->outbufs_list); ++ } ++ ++ /* also, if dev->used_buffers is going to be decreased, we should remove ++ * out-of-range buffers from outbufs_list, and fix bufpos2index mapping */ ++ if (b->count < dev->used_buffers) { ++ struct v4l2l_buffer *pos, *n; ++ ++ list_for_each_entry_safe(pos, n, &dev->outbufs_list, ++ list_head) { ++ if (pos->buffer.index >= b->count) ++ list_del(&pos->list_head); ++ } ++ ++ /* after we update dev->used_buffers, buffers in outbufs_list will ++ * correspond to dev->write_position + [0;b->count-1] range */ ++ i = dev->write_position; ++ list_for_each_entry(pos, &dev->outbufs_list, ++ list_head) { ++ dev->bufpos2index[mod_inc(&i, b->count)] = ++ pos->buffer.index; ++ } ++ } ++ ++ opener->buffers_number = b->count; ++ if (opener->buffers_number < dev->used_buffers) ++ dev->used_buffers = opener->buffers_number; ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++/* returns buffer asked for; ++ * give app as many buffers as it wants, if it less than MAX, ++ * but map them in 
our inner buffers ++ * called on VIDIOC_QUERYBUF ++ */ ++static int vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *b) ++{ ++ enum v4l2_buf_type type; ++ int index; ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ ++ MARK(); ++ ++ type = b->type; ++ index = b->index; ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ if ((b->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) && ++ (b->type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) { ++ return -EINVAL; ++ } ++ if (b->index > max_buffers) ++ return -EINVAL; ++ ++ if (opener->timeout_image_io) ++ *b = dev->timeout_image_buffer.buffer; ++ else ++ *b = dev->buffers[b->index % dev->used_buffers].buffer; ++ ++ b->type = type; ++ b->index = index; ++ dprintkrw("buffer type: %d (of %d with size=%ld)\n", b->memory, ++ dev->buffers_number, dev->buffer_size); ++ ++ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' ++ https://github.com/umlaeute/v4l2loopback/issues/60 */ ++ b->flags &= ~V4L2_BUF_FLAG_DONE; ++ b->flags |= V4L2_BUF_FLAG_QUEUED; ++ ++ return 0; ++} ++ ++static void buffer_written(struct v4l2_loopback_device *dev, ++ struct v4l2l_buffer *buf) ++{ ++ del_timer_sync(&dev->sustain_timer); ++ del_timer_sync(&dev->timeout_timer); ++ spin_lock_bh(&dev->lock); ++ ++ dev->bufpos2index[mod_inc(&dev->write_position, dev->used_buffers)] = ++ buf->buffer.index; ++ list_move_tail(&buf->list_head, &dev->outbufs_list); ++ dev->reread_count = 0; ++ ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++} ++ ++/* put buffer to queue ++ * called on VIDIOC_QBUF ++ */ ++static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ struct v4l2l_buffer *b; ++ int index; ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ if (buf->index > max_buffers) ++ return -EINVAL; ++ if (opener->timeout_image_io) ++ return 0; ++ ++ index = buf->index % dev->used_buffers; ++ b = &dev->buffers[index]; ++ ++ switch (buf->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ dprintkrw("capture QBUF index: %d\n", index); ++ set_queued(b); ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ dprintkrw("output QBUF pos: %d index: %d\n", ++ dev->write_position, index); ++ if (buf->timestamp.tv_sec == 0 && buf->timestamp.tv_usec == 0) ++ v4l2l_get_timestamp(&b->buffer); ++ else ++ b->buffer.timestamp = buf->timestamp; ++ b->buffer.bytesused = buf->bytesused; ++ set_done(b); ++ buffer_written(dev, b); ++ ++ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' ++ https://github.com/umlaeute/v4l2loopback/issues/60 */ ++ buf->flags &= ~V4L2_BUF_FLAG_DONE; ++ buf->flags |= V4L2_BUF_FLAG_QUEUED; ++ ++ wake_up_all(&dev->read_event); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++static int can_read(struct v4l2_loopback_device *dev, ++ struct v4l2_loopback_opener *opener) ++{ ++ int ret; ++ ++ spin_lock_bh(&dev->lock); ++ check_timers(dev); ++ ret = dev->write_position > opener->read_position || ++ dev->reread_count > opener->reread_count || dev->timeout_happened; ++ spin_unlock_bh(&dev->lock); ++ return ret; ++} ++ ++static int get_capture_buffer(struct file *file) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ int pos, ret; ++ int timeout_happened; ++ ++ if ((file->f_flags & O_NONBLOCK) && ++ (dev->write_position <= opener->read_position && ++ dev->reread_count 
<= opener->reread_count && ++ !dev->timeout_happened)) ++ return -EAGAIN; ++ wait_event_interruptible(dev->read_event, can_read(dev, opener)); ++ ++ spin_lock_bh(&dev->lock); ++ if (dev->write_position == opener->read_position) { ++ if (dev->reread_count > opener->reread_count + 2) ++ opener->reread_count = dev->reread_count - 1; ++ ++opener->reread_count; ++ pos = (opener->read_position + dev->used_buffers - 1) % ++ dev->used_buffers; ++ } else { ++ opener->reread_count = 0; ++ if (dev->write_position > ++ opener->read_position + dev->used_buffers) ++ opener->read_position = dev->write_position - 1; ++ pos = mod_inc(&opener->read_position, dev->used_buffers); ++ } ++ timeout_happened = dev->timeout_happened; ++ dev->timeout_happened = 0; ++ spin_unlock_bh(&dev->lock); ++ ++ ret = dev->bufpos2index[pos]; ++ if (timeout_happened) { ++ if (ret < 0) { ++ dprintk("trying to return not mapped buf[%d]\n", ret); ++ return -EFAULT; ++ } ++ /* although allocated on-demand, timeout_image is freed only ++ * in free_buffers(), so we don't need to worry about it being ++ * deallocated suddenly */ ++ memcpy(dev->image + dev->buffers[ret].buffer.m.offset, ++ dev->timeout_image, dev->buffer_size); ++ } ++ return ret; ++} ++ ++/* put buffer to dequeue ++ * called on VIDIOC_DQBUF ++ */ ++static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ int index; ++ struct v4l2l_buffer *b; ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ if (opener->timeout_image_io) { ++ *buf = dev->timeout_image_buffer.buffer; ++ return 0; ++ } ++ ++ switch (buf->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ index = get_capture_buffer(file); ++ if (index < 0) ++ return index; ++ dprintkrw("capture DQBUF pos: %d index: %d\n", ++ opener->read_position - 1, index); ++ if (!(dev->buffers[index].buffer.flags & ++ V4L2_BUF_FLAG_MAPPED)) { ++ dprintk("trying to return not mapped buf[%d]\n", index); ++ return -EINVAL; ++ } ++ unset_flags(&dev->buffers[index]); ++ *buf = dev->buffers[index].buffer; ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ b = list_entry(dev->outbufs_list.prev, struct v4l2l_buffer, ++ list_head); ++ list_move_tail(&b->list_head, &dev->outbufs_list); ++ dprintkrw("output DQBUF index: %d\n", b->buffer.index); ++ unset_flags(b); ++ *buf = b->buffer; ++ buf->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++/* ------------- STREAMING ------------------- */ ++ ++/* start streaming ++ * called on VIDIOC_STREAMON ++ */ ++static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (!dev->ready_for_capture) { ++ int ret = allocate_buffers(dev); ++ if (ret < 0) ++ return ret; ++ } ++ opener->type = WRITER; ++ dev->ready_for_output = 0; ++ dev->ready_for_capture++; ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (!dev->ready_for_capture) ++ return -EIO; ++ opener->type = READER; ++ dev->active_readers++; ++ client_usage_queue_event(dev->vdev); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ return -EINVAL; ++} ++ ++/* stop streaming ++ * called on VIDIOC_STREAMOFF ++ */ ++static int vidioc_streamoff(struct file *file, void *fh, ++ enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev; 
++ struct v4l2_loopback_opener *opener; ++ ++ MARK(); ++ dprintk("%d\n", type); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (dev->ready_for_capture > 0) ++ dev->ready_for_capture--; ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (opener->type == READER) { ++ opener->type = 0; ++ dev->active_readers--; ++ client_usage_queue_event(dev->vdev); ++ } ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ return -EINVAL; ++} ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ p->frames = dev->buffers_number; ++ p->offsets[0] = 0; ++ p->offsets[1] = 0; ++ p->size = dev->buffer_size; ++ return 0; ++} ++#endif ++ ++static void client_usage_queue_event(struct video_device *vdev) ++{ ++ struct v4l2_event ev; ++ struct v4l2_loopback_device *dev; ++ ++ dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device, ++ v4l2_dev); ++ ++ memset(&ev, 0, sizeof(ev)); ++ ev.type = V4L2_EVENT_PRI_CLIENT_USAGE; ++ ((struct v4l2_event_client_usage *)&ev.u)->count = dev->active_readers; ++ ++ v4l2_event_queue(vdev, &ev); ++} ++ ++static int client_usage_ops_add(struct v4l2_subscribed_event *sev, ++ unsigned elems) ++{ ++ if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL)) ++ return 0; ++ ++ client_usage_queue_event(sev->fh->vdev); ++ return 0; ++} ++ ++static void client_usage_ops_replace(struct v4l2_event *old, ++ const struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&old->u) = ++ *((struct v4l2_event_client_usage *)&new->u); ++} ++ ++static void client_usage_ops_merge(const struct v4l2_event *old, ++ struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&new->u) = ++ *((struct v4l2_event_client_usage *)&old->u); ++} ++ ++const struct v4l2_subscribed_event_ops client_usage_ops = { ++ .add = client_usage_ops_add, ++ .replace = client_usage_ops_replace, ++ .merge = client_usage_ops_merge, ++}; ++ ++static int vidioc_subscribe_event(struct v4l2_fh *fh, ++ const struct v4l2_event_subscription *sub) ++{ ++ switch (sub->type) { ++ case V4L2_EVENT_CTRL: ++ return v4l2_ctrl_subscribe_event(fh, sub); ++ case V4L2_EVENT_PRI_CLIENT_USAGE: ++ return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops); ++ } ++ ++ return -EINVAL; ++} ++ ++/* file operations */ ++static void vm_open(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ buf->use_count++; ++ ++ buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED; ++} ++ ++static void vm_close(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ buf->use_count--; ++ ++ if (buf->use_count <= 0) ++ buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED; ++} ++ ++static struct vm_operations_struct vm_ops = { ++ .open = vm_open, ++ .close = vm_close, ++}; ++ ++static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ u8 *addr; ++ unsigned long start; ++ unsigned long size; ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ struct v4l2l_buffer *buffer = NULL; ++ MARK(); ++ ++ start = (unsigned long)vma->vm_start; ++ size = (unsigned long)(vma->vm_end - vma->vm_start); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(file->private_data); ++ ++ if (size > dev->buffer_size) { ++ dprintk("userspace tries to mmap too much, fail\n"); ++ return 
-EINVAL; ++ } ++ if (opener->timeout_image_io) { ++ /* we are going to map the timeout_image_buffer */ ++ if ((vma->vm_pgoff << PAGE_SHIFT) != ++ dev->buffer_size * MAX_BUFFERS) { ++ dprintk("invalid mmap offset for timeout_image_io mode\n"); ++ return -EINVAL; ++ } ++ } else if ((vma->vm_pgoff << PAGE_SHIFT) > ++ dev->buffer_size * (dev->buffers_number - 1)) { ++ dprintk("userspace tries to mmap too far, fail\n"); ++ return -EINVAL; ++ } ++ ++ /* FIXXXXXME: allocation should not happen here! */ ++ if (NULL == dev->image) ++ if (allocate_buffers(dev) < 0) ++ return -EINVAL; ++ ++ if (opener->timeout_image_io) { ++ buffer = &dev->timeout_image_buffer; ++ addr = dev->timeout_image; ++ } else { ++ int i; ++ for (i = 0; i < dev->buffers_number; ++i) { ++ buffer = &dev->buffers[i]; ++ if ((buffer->buffer.m.offset >> PAGE_SHIFT) == ++ vma->vm_pgoff) ++ break; ++ } ++ ++ if (i >= dev->buffers_number) ++ return -EINVAL; ++ ++ addr = dev->image + (vma->vm_pgoff << PAGE_SHIFT); ++ } ++ ++ while (size > 0) { ++ struct page *page; ++ ++ page = vmalloc_to_page(addr); ++ ++ if (vm_insert_page(vma, start, page) < 0) ++ return -EAGAIN; ++ ++ start += PAGE_SIZE; ++ addr += PAGE_SIZE; ++ size -= PAGE_SIZE; ++ } ++ ++ vma->vm_ops = &vm_ops; ++ vma->vm_private_data = buffer; ++ ++ vm_open(vma); ++ ++ MARK(); ++ return 0; ++} ++ ++static unsigned int v4l2_loopback_poll(struct file *file, ++ struct poll_table_struct *pts) ++{ ++ struct v4l2_loopback_opener *opener; ++ struct v4l2_loopback_device *dev; ++ __poll_t req_events = poll_requested_events(pts); ++ int ret_mask = 0; ++ MARK(); ++ ++ opener = fh_to_opener(file->private_data); ++ dev = v4l2loopback_getdevice(file); ++ ++ if (req_events & POLLPRI) { ++ if (!v4l2_event_pending(&opener->fh)) ++ poll_wait(file, &opener->fh.wait, pts); ++ if (v4l2_event_pending(&opener->fh)) { ++ ret_mask |= POLLPRI; ++ if (!(req_events & DEFAULT_POLLMASK)) ++ return ret_mask; ++ } ++ } ++ ++ switch (opener->type) { ++ case WRITER: ++ ret_mask |= POLLOUT | POLLWRNORM; ++ break; ++ case READER: ++ if (!can_read(dev, opener)) { ++ if (ret_mask) ++ return ret_mask; ++ poll_wait(file, &dev->read_event, pts); ++ } ++ if (can_read(dev, opener)) ++ ret_mask |= POLLIN | POLLRDNORM; ++ if (v4l2_event_pending(&opener->fh)) ++ ret_mask |= POLLPRI; ++ break; ++ default: ++ break; ++ } ++ ++ MARK(); ++ return ret_mask; ++} ++ ++/* do not want to limit device opens, it can be as many readers as user want, ++ * writers are limited by means of setting writer field */ ++static int v4l2_loopback_open(struct file *file) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ MARK(); ++ dev = v4l2loopback_getdevice(file); ++ if (dev->open_count.counter >= dev->max_openers) ++ return -EBUSY; ++ /* kfree on close */ ++ opener = kzalloc(sizeof(*opener), GFP_KERNEL); ++ if (opener == NULL) ++ return -ENOMEM; ++ ++ atomic_inc(&dev->open_count); ++ ++ opener->timeout_image_io = dev->timeout_image_io; ++ if (opener->timeout_image_io) { ++ int r = allocate_timeout_image(dev); ++ ++ if (r < 0) { ++ dprintk("timeout image allocation failed\n"); ++ ++ atomic_dec(&dev->open_count); ++ ++ kfree(opener); ++ return r; ++ } ++ } ++ ++ dev->timeout_image_io = 0; ++ ++ v4l2_fh_init(&opener->fh, video_devdata(file)); ++ file->private_data = &opener->fh; ++ ++ v4l2_fh_add(&opener->fh); ++ dprintk("opened dev:%p with image:%p\n", dev, dev ? 
dev->image : NULL); ++ MARK(); ++ return 0; ++} ++ ++static int v4l2_loopback_close(struct file *file) ++{ ++ struct v4l2_loopback_opener *opener; ++ struct v4l2_loopback_device *dev; ++ int is_writer = 0, is_reader = 0; ++ MARK(); ++ ++ opener = fh_to_opener(file->private_data); ++ dev = v4l2loopback_getdevice(file); ++ ++ if (WRITER == opener->type) ++ is_writer = 1; ++ if (READER == opener->type) ++ is_reader = 1; ++ ++ atomic_dec(&dev->open_count); ++ if (dev->open_count.counter == 0) { ++ del_timer_sync(&dev->sustain_timer); ++ del_timer_sync(&dev->timeout_timer); ++ } ++ try_free_buffers(dev); ++ ++ v4l2_fh_del(&opener->fh); ++ v4l2_fh_exit(&opener->fh); ++ ++ kfree(opener); ++ if (is_writer) ++ dev->ready_for_output = 1; ++ if (is_reader) { ++ dev->active_readers--; ++ client_usage_queue_event(dev->vdev); ++ } ++ MARK(); ++ return 0; ++} ++ ++static ssize_t v4l2_loopback_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ int read_index; ++ struct v4l2_loopback_device *dev; ++ struct v4l2_buffer *b; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ read_index = get_capture_buffer(file); ++ if (read_index < 0) ++ return read_index; ++ if (count > dev->buffer_size) ++ count = dev->buffer_size; ++ b = &dev->buffers[read_index].buffer; ++ if (count > b->bytesused) ++ count = b->bytesused; ++ if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset), ++ count)) { ++ printk(KERN_ERR ++ "v4l2-loopback: failed copy_to_user() in read buf\n"); ++ return -EFAULT; ++ } ++ dprintkrw("leave v4l2_loopback_read()\n"); ++ return count; ++} ++ ++static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct v4l2_loopback_opener *opener; ++ struct v4l2_loopback_device *dev; ++ int write_index; ++ struct v4l2_buffer *b; ++ int err = 0; ++ ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(file->private_data); ++ ++ if (UNNEGOTIATED == opener->type) { ++ spin_lock(&dev->lock); ++ ++ if (dev->ready_for_output) { ++ err = vidioc_streamon(file, file->private_data, ++ V4L2_BUF_TYPE_VIDEO_OUTPUT); ++ } ++ ++ spin_unlock(&dev->lock); ++ ++ if (err < 0) ++ return err; ++ } ++ ++ if (WRITER != opener->type) ++ return -EINVAL; ++ ++ if (!dev->ready_for_capture) { ++ int ret = allocate_buffers(dev); ++ if (ret < 0) ++ return ret; ++ dev->ready_for_capture = 1; ++ } ++ dprintkrw("v4l2_loopback_write() trying to write %zu bytes\n", count); ++ if (count > dev->buffer_size) ++ count = dev->buffer_size; ++ ++ write_index = dev->write_position % dev->used_buffers; ++ b = &dev->buffers[write_index].buffer; ++ ++ if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf, ++ count)) { ++ printk(KERN_ERR ++ "v4l2-loopback: failed copy_from_user() in write buf, could not write %zu\n", ++ count); ++ return -EFAULT; ++ } ++ v4l2l_get_timestamp(b); ++ b->bytesused = count; ++ b->sequence = dev->write_position; ++ buffer_written(dev, &dev->buffers[write_index]); ++ wake_up_all(&dev->read_event); ++ dprintkrw("leave v4l2_loopback_write()\n"); ++ return count; ++} ++ ++/* init functions */ ++/* frees buffers, if already allocated */ ++static void free_buffers(struct v4l2_loopback_device *dev) ++{ ++ MARK(); ++ dprintk("freeing image@%p for dev:%p\n", dev ? 
dev->image : NULL, dev); ++ if (!dev) ++ return; ++ if (dev->image) { ++ vfree(dev->image); ++ dev->image = NULL; ++ } ++ if (dev->timeout_image) { ++ vfree(dev->timeout_image); ++ dev->timeout_image = NULL; ++ } ++ dev->imagesize = 0; ++} ++/* frees buffers, if they are no longer needed */ ++static void try_free_buffers(struct v4l2_loopback_device *dev) ++{ ++ MARK(); ++ if (0 == dev->open_count.counter && !dev->keep_format) { ++ free_buffers(dev); ++ dev->ready_for_capture = 0; ++ dev->buffer_size = 0; ++ dev->write_position = 0; ++ } ++} ++/* allocates buffers, if buffer_size is set */ ++static int allocate_buffers(struct v4l2_loopback_device *dev) ++{ ++ int err; ++ ++ MARK(); ++ /* vfree on close file operation in case no open handles left */ ++ ++ if (dev->buffer_size < 1 || dev->buffers_number < 1) ++ return -EINVAL; ++ ++ if ((__LONG_MAX__ / dev->buffer_size) < dev->buffers_number) ++ return -ENOSPC; ++ ++ if (dev->image) { ++ dprintk("allocating buffers again: %ld %ld\n", ++ dev->buffer_size * dev->buffers_number, dev->imagesize); ++ /* FIXME: prevent double allocation more intelligently! */ ++ if (dev->buffer_size * dev->buffers_number == dev->imagesize) ++ return 0; ++ ++ /* if there is only one writer, no problem should occur */ ++ if (dev->open_count.counter == 1) ++ free_buffers(dev); ++ else ++ return -EINVAL; ++ } ++ ++ dev->imagesize = (unsigned long)dev->buffer_size * ++ (unsigned long)dev->buffers_number; ++ ++ dprintk("allocating %ld = %ldx%d\n", dev->imagesize, dev->buffer_size, ++ dev->buffers_number); ++ err = -ENOMEM; ++ ++ if (dev->timeout_jiffies > 0) { ++ err = allocate_timeout_image(dev); ++ if (err < 0) ++ goto error; ++ } ++ ++ dev->image = vmalloc(dev->imagesize); ++ if (dev->image == NULL) ++ goto error; ++ ++ dprintk("vmallocated %ld bytes\n", dev->imagesize); ++ MARK(); ++ ++ init_buffers(dev); ++ return 0; ++ ++error: ++ free_buffers(dev); ++ return err; ++} ++ ++/* init inner buffers, they are capture mode and flags are set as ++ * for capture mod buffers */ ++static void init_buffers(struct v4l2_loopback_device *dev) ++{ ++ int i; ++ int buffer_size; ++ int bytesused; ++ MARK(); ++ ++ buffer_size = dev->buffer_size; ++ bytesused = dev->pix_format.sizeimage; ++ ++ for (i = 0; i < dev->buffers_number; ++i) { ++ struct v4l2_buffer *b = &dev->buffers[i].buffer; ++ b->index = i; ++ b->bytesused = bytesused; ++ b->length = buffer_size; ++ b->field = V4L2_FIELD_NONE; ++ b->flags = 0; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 6, 1) ++ b->input = 0; ++#endif ++ b->m.offset = i * buffer_size; ++ b->memory = V4L2_MEMORY_MMAP; ++ b->sequence = 0; ++ b->timestamp.tv_sec = 0; ++ b->timestamp.tv_usec = 0; ++ b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ ++ v4l2l_get_timestamp(b); ++ } ++ dev->timeout_image_buffer = dev->buffers[0]; ++ dev->timeout_image_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size; ++ MARK(); ++} ++ ++static int allocate_timeout_image(struct v4l2_loopback_device *dev) ++{ ++ MARK(); ++ if (dev->buffer_size <= 0) ++ return -EINVAL; ++ ++ if (dev->timeout_image == NULL) { ++ dev->timeout_image = vzalloc(dev->buffer_size); ++ if (dev->timeout_image == NULL) ++ return -ENOMEM; ++ } ++ return 0; ++} ++ ++/* fills and register video device */ ++static void init_vdev(struct video_device *vdev, int nr) ++{ ++ MARK(); ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ vdev->tvnorms = V4L2_STD_ALL; ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ vdev->vfl_type = VFL_TYPE_VIDEO; ++ vdev->fops = &v4l2_loopback_fops; ++ vdev->ioctl_ops = &v4l2_loopback_ioctl_ops; ++ 
vdev->release = &video_device_release; ++ vdev->minor = -1; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE | ++ V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE | ++ V4L2_CAP_STREAMING; ++#endif ++ ++ if (debug > 1) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 20, 0) ++ vdev->debug = V4L2_DEBUG_IOCTL | V4L2_DEBUG_IOCTL_ARG; ++#else ++ vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL | ++ V4L2_DEV_DEBUG_IOCTL_ARG; ++#endif ++ ++ /* since kernel-3.7, there is a new field 'vfl_dir' that has to be ++ * set to VFL_DIR_M2M for bidirectional devices */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 7, 0) ++ vdev->vfl_dir = VFL_DIR_M2M; ++#endif ++ ++ MARK(); ++} ++ ++/* init default capture parameters, only fps may be changed in future */ ++static void init_capture_param(struct v4l2_captureparm *capture_param) ++{ ++ MARK(); ++ capture_param->capability = 0; ++ capture_param->capturemode = 0; ++ capture_param->extendedmode = 0; ++ capture_param->readbuffers = max_buffers; ++ capture_param->timeperframe.numerator = 1; ++ capture_param->timeperframe.denominator = 30; ++} ++ ++static void check_timers(struct v4l2_loopback_device *dev) ++{ ++ if (!dev->ready_for_capture) ++ return; ++ ++ if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer)) ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer)) ++ mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies * 3 / 2); ++} ++#ifdef HAVE_TIMER_SETUP ++static void sustain_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = from_timer(dev, t, sustain_timer); ++#else ++static void sustain_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->sustain_framerate) { ++ dev->reread_count++; ++ dprintkrw("reread: %d %d\n", dev->write_position, ++ dev->reread_count); ++ if (dev->reread_count == 1) ++ mod_timer(&dev->sustain_timer, ++ jiffies + max(1UL, dev->frame_jiffies / 2)); ++ else ++ mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++#ifdef HAVE_TIMER_SETUP ++static void timeout_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = from_timer(dev, t, timeout_timer); ++#else ++static void timeout_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->timeout_jiffies > 0) { ++ dev->timeout_happened = 1; ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++ ++/* init loopback main structure */ ++#define DEFAULT_FROM_CONF(confmember, default_condition, default_value) \ ++ ((conf) ? \ ++ ((conf->confmember default_condition) ? (default_value) : \ ++ (conf->confmember)) : \ ++ default_value) ++ ++static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_ctrl_handler *hdl; ++ struct v4l2loopback_private *vdev_priv = NULL; ++ ++ int err = -ENOMEM; ++ ++ int _max_width = DEFAULT_FROM_CONF( ++ max_width, < V4L2LOOPBACK_SIZE_MIN_WIDTH, max_width); ++ int _max_height = DEFAULT_FROM_CONF( ++ max_height, < V4L2LOOPBACK_SIZE_MIN_HEIGHT, max_height); ++ bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ? 
++ (conf->announce_all_caps) : ++ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS; ++ int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers); ++ int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers); ++ ++ int nr = -1; ++ ++ _announce_all_caps = (!!_announce_all_caps); ++ ++ if (conf) { ++ if (conf->capture_nr >= 0 && ++ conf->output_nr == conf->capture_nr) { ++ nr = conf->capture_nr; ++ } else if (conf->capture_nr < 0 && conf->output_nr < 0) { ++ nr = -1; ++ } else if (conf->capture_nr < 0) { ++ nr = conf->output_nr; ++ } else if (conf->output_nr < 0) { ++ nr = conf->capture_nr; ++ } else { ++ printk(KERN_ERR ++ "split OUTPUT and CAPTURE devices not yet supported."); ++ printk(KERN_INFO ++ "both devices must have the same number (%d != %d).", ++ conf->output_nr, conf->capture_nr); ++ return -EINVAL; ++ } ++ } ++ ++ if (idr_find(&v4l2loopback_index_idr, nr)) ++ return -EEXIST; ++ ++ dprintk("creating v4l2loopback-device #%d\n", nr); ++ dev = kzalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) ++ return -ENOMEM; ++ ++ /* allocate id, if @id >= 0, we're requesting that specific id */ ++ if (nr >= 0) { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1, ++ GFP_KERNEL); ++ if (err == -ENOSPC) ++ err = -EEXIST; ++ } else { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL); ++ } ++ if (err < 0) ++ goto out_free_dev; ++ nr = err; ++ err = -ENOMEM; ++ ++ if (conf && conf->card_label[0]) { ++ snprintf(dev->card_label, sizeof(dev->card_label), "%s", ++ conf->card_label); ++ } else { ++ snprintf(dev->card_label, sizeof(dev->card_label), ++ "Dummy video device (0x%04X)", nr); ++ } ++ snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), ++ "v4l2loopback-%03d", nr); ++ ++ err = v4l2_device_register(NULL, &dev->v4l2_dev); ++ if (err) ++ goto out_free_idr; ++ MARK(); ++ ++ dev->vdev = video_device_alloc(); ++ if (dev->vdev == NULL) { ++ err = -ENOMEM; ++ goto out_unregister; ++ } ++ ++ vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL); ++ if (vdev_priv == NULL) { ++ err = -ENOMEM; ++ goto out_unregister; ++ } ++ ++ video_set_drvdata(dev->vdev, vdev_priv); ++ if (video_get_drvdata(dev->vdev) == NULL) { ++ err = -ENOMEM; ++ goto out_unregister; ++ } ++ ++ MARK(); ++ snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s", ++ dev->card_label); ++ ++ vdev_priv->device_nr = nr; ++ ++ init_vdev(dev->vdev, nr); ++ dev->vdev->v4l2_dev = &dev->v4l2_dev; ++ init_capture_param(&dev->capture_param); ++ err = set_timeperframe(dev, &dev->capture_param.timeperframe); ++ if (err) ++ goto out_unregister; ++ dev->keep_format = 0; ++ dev->sustain_framerate = 0; ++ ++ dev->announce_all_caps = _announce_all_caps; ++ dev->max_width = _max_width; ++ dev->max_height = _max_height; ++ dev->max_openers = _max_openers; ++ dev->buffers_number = dev->used_buffers = _max_buffers; ++ ++ dev->write_position = 0; ++ ++ MARK(); ++ spin_lock_init(&dev->lock); ++ INIT_LIST_HEAD(&dev->outbufs_list); ++ if (list_empty(&dev->outbufs_list)) { ++ int i; ++ ++ for (i = 0; i < dev->used_buffers; ++i) ++ list_add_tail(&dev->buffers[i].list_head, ++ &dev->outbufs_list); ++ } ++ memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index)); ++ atomic_set(&dev->open_count, 0); ++ dev->ready_for_capture = 0; ++ dev->ready_for_output = 1; ++ ++ dev->buffer_size = 0; ++ dev->image = NULL; ++ dev->imagesize = 0; ++#ifdef HAVE_TIMER_SETUP ++ timer_setup(&dev->sustain_timer, sustain_timer_clb, 0); ++ timer_setup(&dev->timeout_timer, timeout_timer_clb, 0); ++#else ++ setup_timer(&dev->sustain_timer, 
sustain_timer_clb, nr); ++ setup_timer(&dev->timeout_timer, timeout_timer_clb, nr); ++#endif ++ dev->reread_count = 0; ++ dev->timeout_jiffies = 0; ++ dev->timeout_image = NULL; ++ dev->timeout_happened = 0; ++ ++ hdl = &dev->ctrl_handler; ++ err = v4l2_ctrl_handler_init(hdl, 4); ++ if (err) ++ goto out_unregister; ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL); ++ if (hdl->error) { ++ err = hdl->error; ++ goto out_free_handler; ++ } ++ dev->v4l2_dev.ctrl_handler = hdl; ++ ++ err = v4l2_ctrl_handler_setup(hdl); ++ if (err) ++ goto out_free_handler; ++ ++ /* FIXME set buffers to 0 */ ++ ++ /* Set initial format */ ++ dev->pix_format.width = 0; /* V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; */ ++ dev->pix_format.height = 0; /* V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; */ ++ dev->pix_format.pixelformat = formats[0].fourcc; ++ dev->pix_format.colorspace = ++ V4L2_COLORSPACE_SRGB; /* do we need to set this ? */ ++ dev->pix_format.field = V4L2_FIELD_NONE; ++ ++ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); ++ dprintk("buffer_size = %ld (=%d)\n", dev->buffer_size, ++ dev->pix_format.sizeimage); ++ ++ if (dev->buffer_size && ((err = allocate_buffers(dev)) < 0)) ++ goto out_free_handler; ++ ++ init_waitqueue_head(&dev->read_event); ++ ++ /* register the device -> it creates /dev/video* */ ++ if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) { ++ printk(KERN_ERR ++ "v4l2loopback: failed video_register_device()\n"); ++ err = -EFAULT; ++ goto out_free_device; ++ } ++ v4l2loopback_create_sysfs(dev->vdev); ++ ++ MARK(); ++ if (ret_nr) ++ *ret_nr = dev->vdev->num; ++ return 0; ++ ++out_free_device: ++ video_device_release(dev->vdev); ++out_free_handler: ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++out_unregister: ++ video_set_drvdata(dev->vdev, NULL); ++ if (vdev_priv != NULL) ++ kfree(vdev_priv); ++ v4l2_device_unregister(&dev->v4l2_dev); ++out_free_idr: ++ idr_remove(&v4l2loopback_index_idr, nr); ++out_free_dev: ++ kfree(dev); ++ return err; ++} ++ ++static void v4l2_loopback_remove(struct v4l2_loopback_device *dev) ++{ ++ free_buffers(dev); ++ v4l2loopback_remove_sysfs(dev->vdev); ++ kfree(video_get_drvdata(dev->vdev)); ++ video_unregister_device(dev->vdev); ++ v4l2_device_unregister(&dev->v4l2_dev); ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++ kfree(dev); ++} ++ ++static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd, ++ unsigned long parm) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_config conf; ++ struct v4l2_loopback_config *confptr = &conf; ++ int device_nr; ++ int ret; ++ ++ ret = mutex_lock_killable(&v4l2loopback_ctl_mutex); ++ if (ret) ++ return ret; ++ ++ ret = -EINVAL; ++ switch (cmd) { ++ default: ++ ret = -ENOSYS; ++ break; ++ /* add a v4l2loopback device (pair), based on the user-provided specs */ ++ case V4L2LOOPBACK_CTL_ADD: ++ if (parm) { ++ if ((ret = copy_from_user(&conf, (void *)parm, ++ sizeof(conf))) < 0) ++ break; ++ } else ++ confptr = NULL; ++ ret = v4l2_loopback_add(confptr, &device_nr); ++ if (ret >= 0) ++ ret = device_nr; ++ break; ++ /* remove a v4l2loopback device (both capture and output) */ ++ case V4L2LOOPBACK_CTL_REMOVE: ++ ret = v4l2loopback_lookup((int)parm, &dev); ++ if (ret >= 0 && dev) { ++ int nr = ret; ++ ret = -EBUSY; ++ if (dev->open_count.counter > 0) ++ break; ++ 
idr_remove(&v4l2loopback_index_idr, nr); ++ v4l2_loopback_remove(dev); ++ ret = 0; ++ }; ++ break; ++ /* get information for a loopback device. ++ * this is mostly about limits (which cannot be queried directly with VIDIOC_G_FMT and friends ++ */ ++ case V4L2LOOPBACK_CTL_QUERY: ++ if (!parm) ++ break; ++ if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) < ++ 0) ++ break; ++ device_nr = (conf.output_nr < 0) ? conf.capture_nr : ++ conf.output_nr; ++ MARK(); ++ /* get the device from either capture_nr or output_nr (whatever is valid) */ ++ if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0) ++ break; ++ MARK(); ++ /* if we got the device from output_nr and there is a valid capture_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != conf.capture_nr) && (conf.capture_nr >= 0) && ++ (ret != v4l2loopback_lookup(conf.capture_nr, 0))) ++ break; ++ MARK(); ++ /* if otoh, we got the device from capture_nr and there is a valid output_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != conf.output_nr) && (conf.output_nr >= 0) && ++ (ret != v4l2loopback_lookup(conf.output_nr, 0))) ++ break; ++ MARK(); ++ ++ /* v4l2_loopback_config identified a single device, so fetch the data */ ++ snprintf(conf.card_label, sizeof(conf.card_label), "%s", ++ dev->card_label); ++ MARK(); ++ conf.output_nr = conf.capture_nr = dev->vdev->num; ++ conf.max_width = dev->max_width; ++ conf.max_height = dev->max_height; ++ conf.announce_all_caps = dev->announce_all_caps; ++ conf.max_buffers = dev->buffers_number; ++ conf.max_openers = dev->max_openers; ++ conf.debug = debug; ++ MARK(); ++ if (copy_to_user((void *)parm, &conf, sizeof(conf))) { ++ ret = -EFAULT; ++ break; ++ } ++ MARK(); ++ ret = 0; ++ ; ++ break; ++ } ++ ++ MARK(); ++ mutex_unlock(&v4l2loopback_ctl_mutex); ++ MARK(); ++ return ret; ++} ++ ++/* LINUX KERNEL */ ++ ++static const struct file_operations v4l2loopback_ctl_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = nonseekable_open, ++ .unlocked_ioctl = v4l2loopback_control_ioctl, ++ .compat_ioctl = v4l2loopback_control_ioctl, ++ .llseek = noop_llseek, ++ // clang-format on ++}; ++ ++static struct miscdevice v4l2loopback_misc = { ++ // clang-format off ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "v4l2loopback", ++ .fops = &v4l2loopback_ctl_fops, ++ // clang-format on ++}; ++ ++static const struct v4l2_file_operations v4l2_loopback_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = v4l2_loopback_open, ++ .release = v4l2_loopback_close, ++ .read = v4l2_loopback_read, ++ .write = v4l2_loopback_write, ++ .poll = v4l2_loopback_poll, ++ .mmap = v4l2_loopback_mmap, ++ .unlocked_ioctl = video_ioctl2, ++ // clang-format on ++}; ++ ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = { ++ // clang-format off ++ .vidioc_querycap = &vidioc_querycap, ++ .vidioc_enum_framesizes = &vidioc_enum_framesizes, ++ .vidioc_enum_frameintervals = &vidioc_enum_frameintervals, ++ ++ .vidioc_enum_output = &vidioc_enum_output, ++ .vidioc_g_output = &vidioc_g_output, ++ .vidioc_s_output = &vidioc_s_output, ++ ++ .vidioc_enum_input = &vidioc_enum_input, ++ .vidioc_g_input = &vidioc_g_input, ++ .vidioc_s_input = &vidioc_s_input, ++ ++ .vidioc_enum_fmt_vid_cap = &vidioc_enum_fmt_cap, ++ .vidioc_g_fmt_vid_cap = &vidioc_g_fmt_cap, ++ .vidioc_s_fmt_vid_cap = &vidioc_s_fmt_cap, ++ .vidioc_try_fmt_vid_cap = &vidioc_try_fmt_cap, ++ ++ .vidioc_enum_fmt_vid_out = &vidioc_enum_fmt_out, ++ .vidioc_s_fmt_vid_out = 
&vidioc_s_fmt_out, ++ .vidioc_g_fmt_vid_out = &vidioc_g_fmt_out, ++ .vidioc_try_fmt_vid_out = &vidioc_try_fmt_out, ++ ++#ifdef V4L2L_OVERLAY ++ .vidioc_s_fmt_vid_overlay = &vidioc_s_fmt_overlay, ++ .vidioc_g_fmt_vid_overlay = &vidioc_g_fmt_overlay, ++#endif ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ .vidioc_s_std = &vidioc_s_std, ++ .vidioc_g_std = &vidioc_g_std, ++ .vidioc_querystd = &vidioc_querystd, ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ .vidioc_g_parm = &vidioc_g_parm, ++ .vidioc_s_parm = &vidioc_s_parm, ++ ++ .vidioc_reqbufs = &vidioc_reqbufs, ++ .vidioc_querybuf = &vidioc_querybuf, ++ .vidioc_qbuf = &vidioc_qbuf, ++ .vidioc_dqbuf = &vidioc_dqbuf, ++ ++ .vidioc_streamon = &vidioc_streamon, ++ .vidioc_streamoff = &vidioc_streamoff, ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++ .vidiocgmbuf = &vidiocgmbuf, ++#endif ++ ++ .vidioc_subscribe_event = &vidioc_subscribe_event, ++ .vidioc_unsubscribe_event = &v4l2_event_unsubscribe, ++ // clang-format on ++}; ++ ++static int free_device_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *dev = ptr; ++ v4l2_loopback_remove(dev); ++ return 0; ++} ++static void free_devices(void) ++{ ++ idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL); ++ idr_destroy(&v4l2loopback_index_idr); ++} ++ ++static int __init v4l2loopback_init_module(void) ++{ ++ int err; ++ int i; ++ MARK(); ++ ++ err = misc_register(&v4l2loopback_misc); ++ if (err < 0) ++ return err; ++ ++ if (devices < 0) { ++ devices = 1; ++ ++ /* try guessing the devices from the "video_nr" parameter */ ++ for (i = MAX_DEVICES - 1; i >= 0; i--) { ++ if (video_nr[i] >= 0) { ++ devices = i + 1; ++ break; ++ } ++ } ++ } ++ ++ if (devices > MAX_DEVICES) { ++ devices = MAX_DEVICES; ++ printk(KERN_INFO ++ "v4l2loopback: number of initial devices is limited to: %d\n", ++ MAX_DEVICES); ++ } ++ ++ if (max_buffers > MAX_BUFFERS) { ++ max_buffers = MAX_BUFFERS; ++ printk(KERN_INFO ++ "v4l2loopback: number of buffers is limited to: %d\n", ++ MAX_BUFFERS); ++ } ++ ++ if (max_openers < 0) { ++ printk(KERN_INFO ++ "v4l2loopback: allowing %d openers rather than %d\n", ++ 2, max_openers); ++ max_openers = 2; ++ } ++ ++ if (max_width < V4L2LOOPBACK_SIZE_MIN_WIDTH) { ++ max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++ printk(KERN_INFO "v4l2loopback: using max_width %d\n", ++ max_width); ++ } ++ if (max_height < V4L2LOOPBACK_SIZE_MIN_HEIGHT) { ++ max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++ printk(KERN_INFO "v4l2loopback: using max_height %d\n", ++ max_height); ++ } ++ ++ /* kfree on module release */ ++ for (i = 0; i < devices; i++) { ++ struct v4l2_loopback_config cfg = { ++ // clang-format off ++ .output_nr = video_nr[i], ++ .capture_nr = video_nr[i], ++ .max_width = max_width, ++ .max_height = max_height, ++ .announce_all_caps = (!exclusive_caps[i]), ++ .max_buffers = max_buffers, ++ .max_openers = max_openers, ++ .debug = debug, ++ // clang-format on ++ }; ++ cfg.card_label[0] = 0; ++ if (card_label[i]) ++ snprintf(cfg.card_label, sizeof(cfg.card_label), "%s", ++ card_label[i]); ++ err = v4l2_loopback_add(&cfg, 0); ++ if (err) { ++ free_devices(); ++ goto error; ++ } ++ } ++ ++ dprintk("module installed\n"); ++ ++ printk(KERN_INFO "v4l2loopback driver version %d.%d.%d%s loaded\n", ++ // clang-format off ++ (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE >> 8) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE ) & 0xff, ++#ifdef SNAPSHOT_VERSION ++ " (" __stringify(SNAPSHOT_VERSION) ")" ++#else ++ "" ++#endif ++ ); ++ // clang-format on ++ ++ return 0; ++error: ++ 
misc_deregister(&v4l2loopback_misc); ++ return err; ++} ++ ++static void v4l2loopback_cleanup_module(void) ++{ ++ MARK(); ++ /* unregister the device -> it deletes /dev/video* */ ++ free_devices(); ++ /* and get rid of /dev/v4l2loopback */ ++ misc_deregister(&v4l2loopback_misc); ++ dprintk("module removed\n"); ++} ++ ++MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR); ++ ++module_init(v4l2loopback_init_module); ++module_exit(v4l2loopback_cleanup_module); +diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h +new file mode 100644 +index 000000000000..10f8e662d37a +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.h +@@ -0,0 +1,96 @@ ++/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ ++/* ++ * v4l2loopback.h ++ * ++ * Written by IOhannes m zmölnig, 7/1/20. ++ * ++ * Copyright 2020 by IOhannes m zmölnig. Redistribution of this file is ++ * permitted under the GNU General Public License. ++ */ ++#ifndef _V4L2LOOPBACK_H ++#define _V4L2LOOPBACK_H ++ ++#define V4L2LOOPBACK_VERSION_MAJOR 0 ++#define V4L2LOOPBACK_VERSION_MINOR 12 ++#define V4L2LOOPBACK_VERSION_BUGFIX 7 ++ ++/* /dev/v4l2loopback interface */ ++ ++struct v4l2_loopback_config { ++ /** ++ * the device-number (/dev/video) ++ * V4L2LOOPBACK_CTL_ADD: ++ * setting this to a value<0, will allocate an available one ++ * if nr>=0 and the device already exists, the ioctl will EEXIST ++ * if output_nr and capture_nr are the same, only a single device will be created ++ * NOTE: currently split-devices (where output_nr and capture_nr differ) ++ * are not implemented yet. ++ * until then, requesting different device-IDs will result in EINVAL. ++ * ++ * V4L2LOOPBACK_CTL_QUERY: ++ * either both output_nr and capture_nr must refer to the same loopback, ++ * or one (and only one) of them must be -1 ++ * ++ */ ++ int output_nr; ++ int capture_nr; ++ ++ /** ++ * a nice name for your device ++ * if (*card_label)==0, an automatic name is assigned ++ */ ++ char card_label[32]; ++ ++ /** ++ * maximum allowed frame size ++ * if too low, default values are used ++ */ ++ int max_width; ++ int max_height; ++ ++ /** ++ * number of buffers to allocate for the queue ++ * if set to <=0, default values are used ++ */ ++ int max_buffers; ++ ++ /** ++ * how many consumers are allowed to open this device concurrently ++ * if set to <=0, default values are used ++ */ ++ int max_openers; ++ ++ /** ++ * set the debugging level for this device ++ */ ++ int debug; ++ ++ /** ++ * whether to announce OUTPUT/CAPTURE capabilities exclusively ++ * for this device or not ++ * (!exclusive_caps) ++ * NOTE: this is going to be removed once separate output/capture ++ * devices are implemented ++ */ ++ int announce_all_caps; ++}; ++ ++/* a pointer to a (struct v4l2_loopback_config) that has all values you wish to impose on the ++ * to-be-created device set. ++ * if the ptr is NULL, a new device is created with default values at the driver's discretion. 
++ * ++ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY, ++ * to get more information on the device) ++ */ ++#define V4L2LOOPBACK_CTL_ADD 0x4C80 ++ ++/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set ++ * (the two values must either refer to video-devices associated with the same loopback device ++ * or exactly one of them must be <0 ++ */ ++#define V4L2LOOPBACK_CTL_QUERY 0x4C82 ++ ++/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */ ++#define V4L2LOOPBACK_CTL_REMOVE 0x4C81 ++ ++#endif /* _V4L2LOOPBACK_H */ +diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h +new file mode 100644 +index 000000000000..d855a3796554 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback_formats.h +@@ -0,0 +1,445 @@ ++static const struct v4l2l_format formats[] = { ++#ifndef V4L2_PIX_FMT_VP9 ++#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0') ++#endif ++#ifndef V4L2_PIX_FMT_HEVC ++#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C') ++#endif ++ ++ /* here come the packed formats */ ++ { ++ .name = "32 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "32 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR24, ++ .depth = 24, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB24, ++ .depth = 24, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_ABGR32 ++ { ++ .name = "32 bpp RGBA, le", ++ .fourcc = V4L2_PIX_FMT_ABGR32, ++ .depth = 32, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_RGBA32 ++ { ++ .name = "32 bpp RGBA", ++ .fourcc = V4L2_PIX_FMT_RGBA32, ++ .depth = 32, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_RGB332 ++ { ++ .name = "8 bpp RGB-3-3-2", ++ .fourcc = V4L2_PIX_FMT_RGB332, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB332 */ ++#ifdef V4L2_PIX_FMT_RGB444 ++ { ++ .name = "16 bpp RGB (xxxxrrrr ggggbbbb)", ++ .fourcc = V4L2_PIX_FMT_RGB444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB444 */ ++#ifdef V4L2_PIX_FMT_RGB555 ++ { ++ .name = "16 bpp RGB-5-5-5", ++ .fourcc = V4L2_PIX_FMT_RGB555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555 */ ++#ifdef V4L2_PIX_FMT_RGB565 ++ { ++ .name = "16 bpp RGB-5-6-5", ++ .fourcc = V4L2_PIX_FMT_RGB565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565 */ ++#ifdef V4L2_PIX_FMT_RGB555X ++ { ++ .name = "16 bpp RGB-5-5-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB555X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555X */ ++#ifdef V4L2_PIX_FMT_RGB565X ++ { ++ .name = "16 bpp RGB-5-6-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB565X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565X */ ++#ifdef V4L2_PIX_FMT_BGR666 ++ { ++ .name = "18 bpp BGR-6-6-6", ++ .fourcc = V4L2_PIX_FMT_BGR666, ++ .depth = 18, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_BGR666 */ ++ { ++ .name = "4:2:2, packed, YUYV", ++ .fourcc = V4L2_PIX_FMT_YUYV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "4:2:2, packed, UYVY", ++ .fourcc = V4L2_PIX_FMT_UYVY, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YVYU ++ { ++ .name = "4:2:2, packed YVYU", ++ .fourcc = V4L2_PIX_FMT_YVYU, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_VYUY ++ { ++ .name = "4:2:2, packed VYUY", ++ .fourcc = V4L2_PIX_FMT_VYUY, ++ .depth 
= 16, ++ .flags = 0, ++ }, ++#endif ++ { ++ .name = "4:2:2, packed YYUV", ++ .fourcc = V4L2_PIX_FMT_YYUV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "YUV-8-8-8-8", ++ .fourcc = V4L2_PIX_FMT_YUV32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "8 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_GREY, ++ .depth = 8, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_Y4 ++ { ++ .name = "4 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y4, ++ .depth = 4, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y4 */ ++#ifdef V4L2_PIX_FMT_Y6 ++ { ++ .name = "6 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y6, ++ .depth = 6, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y6 */ ++#ifdef V4L2_PIX_FMT_Y10 ++ { ++ .name = "10 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y10, ++ .depth = 10, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y10 */ ++#ifdef V4L2_PIX_FMT_Y12 ++ { ++ .name = "12 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y12, ++ .depth = 12, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y12 */ ++ { ++ .name = "16 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y16, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YUV444 ++ { ++ .name = "16 bpp xxxxyyyy uuuuvvvv", ++ .fourcc = V4L2_PIX_FMT_YUV444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV444 */ ++#ifdef V4L2_PIX_FMT_YUV555 ++ { ++ .name = "16 bpp YUV-5-5-5", ++ .fourcc = V4L2_PIX_FMT_YUV555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV555 */ ++#ifdef V4L2_PIX_FMT_YUV565 ++ { ++ .name = "16 bpp YUV-5-6-5", ++ .fourcc = V4L2_PIX_FMT_YUV565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV565 */ ++ ++/* bayer formats */ ++#ifdef V4L2_PIX_FMT_SRGGB8 ++ { ++ .name = "Bayer RGGB 8bit", ++ .fourcc = V4L2_PIX_FMT_SRGGB8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SRGGB8 */ ++#ifdef V4L2_PIX_FMT_SGRBG8 ++ { ++ .name = "Bayer GRBG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGRBG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGRBG8 */ ++#ifdef V4L2_PIX_FMT_SGBRG8 ++ { ++ .name = "Bayer GBRG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGBRG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGBRG8 */ ++#ifdef V4L2_PIX_FMT_SBGGR8 ++ { ++ .name = "Bayer BA81 8bit", ++ .fourcc = V4L2_PIX_FMT_SBGGR8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SBGGR8 */ ++ ++ /* here come the planar formats */ ++ { ++ .name = "4:1:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:1:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#ifdef V4L2_PIX_FMT_YUV422P ++ { ++ .name = "16 bpp YVU422 planar", ++ .fourcc = V4L2_PIX_FMT_YUV422P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV422P */ ++#ifdef V4L2_PIX_FMT_YUV411P ++ { ++ .name = "16 bpp YVU411 planar", ++ .fourcc = V4L2_PIX_FMT_YUV411P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV411P */ ++#ifdef V4L2_PIX_FMT_Y41P ++ { ++ .name = "12 bpp YUV 4:1:1", ++ .fourcc = V4L2_PIX_FMT_Y41P, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_Y41P */ ++#ifdef V4L2_PIX_FMT_NV12 ++ { ++ .name = "12 bpp Y/CbCr 4:2:0 ", ++ .fourcc = V4L2_PIX_FMT_NV12, ++ 
.depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_NV12 */ ++ ++/* here come the compressed formats */ ++ ++#ifdef V4L2_PIX_FMT_MJPEG ++ { ++ .name = "Motion-JPEG", ++ .fourcc = V4L2_PIX_FMT_MJPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MJPEG */ ++#ifdef V4L2_PIX_FMT_JPEG ++ { ++ .name = "JFIF JPEG", ++ .fourcc = V4L2_PIX_FMT_JPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_JPEG */ ++#ifdef V4L2_PIX_FMT_DV ++ { ++ .name = "DV1394", ++ .fourcc = V4L2_PIX_FMT_DV, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_DV */ ++#ifdef V4L2_PIX_FMT_MPEG ++ { ++ .name = "MPEG-1/2/4 Multiplexed", ++ .fourcc = V4L2_PIX_FMT_MPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG */ ++#ifdef V4L2_PIX_FMT_H264 ++ { ++ .name = "H264 with start codes", ++ .fourcc = V4L2_PIX_FMT_H264, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264 */ ++#ifdef V4L2_PIX_FMT_H264_NO_SC ++ { ++ .name = "H264 without start codes", ++ .fourcc = V4L2_PIX_FMT_H264_NO_SC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_NO_SC */ ++#ifdef V4L2_PIX_FMT_H264_MVC ++ { ++ .name = "H264 MVC", ++ .fourcc = V4L2_PIX_FMT_H264_MVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_MVC */ ++#ifdef V4L2_PIX_FMT_H263 ++ { ++ .name = "H263", ++ .fourcc = V4L2_PIX_FMT_H263, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H263 */ ++#ifdef V4L2_PIX_FMT_MPEG1 ++ { ++ .name = "MPEG-1 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG1, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG1 */ ++#ifdef V4L2_PIX_FMT_MPEG2 ++ { ++ .name = "MPEG-2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG2, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG2 */ ++#ifdef V4L2_PIX_FMT_MPEG4 ++ { ++ .name = "MPEG-4 part 2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG4, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG4 */ ++#ifdef V4L2_PIX_FMT_XVID ++ { ++ .name = "Xvid", ++ .fourcc = V4L2_PIX_FMT_XVID, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_XVID */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_G ++ { ++ .name = "SMPTE 421M Annex G compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_G, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_L ++ { ++ .name = "SMPTE 421M Annex L compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_L, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */ ++#ifdef V4L2_PIX_FMT_VP8 ++ { ++ .name = "VP8", ++ .fourcc = V4L2_PIX_FMT_VP8, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP8 */ ++#ifdef V4L2_PIX_FMT_VP9 ++ { ++ .name = "VP9", ++ .fourcc = V4L2_PIX_FMT_VP9, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP9 */ ++#ifdef V4L2_PIX_FMT_HEVC ++ { ++ .name = "HEVC", ++ .fourcc = V4L2_PIX_FMT_HEVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_HEVC */ ++}; +-- +2.40.0.rc2 diff --git a/patches/0003-bore.patch b/patches/0003-bore.patch index 9e5bc88..2ba8854 100644 --- a/patches/0003-bore.patch +++ b/patches/0003-bore.patch @@ -1,23 +1,22 @@ -From 
f169eabeb1ba8f339ab9bebec8d503c70c5f5879 Mon Sep 17 00:00:00 2001 +From e016cce088886f56617becc8fcc598a0114e4faa Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Fri, 17 Feb 2023 15:39:23 +0100 -Subject: [PATCH] bore-cachy +Date: Sat, 11 Mar 2023 18:44:19 +0100 +Subject: [PATCH] bore-eevdf Signed-off-by: Peter Jung --- - include/linux/sched.h | 5 ++ - init/Kconfig | 20 ++++++ - kernel/sched/core.c | 29 +++++++++ - kernel/sched/debug.c | 3 + - kernel/sched/fair.c | 132 +++++++++++++++++++++++++++++++++++++++- - kernel/sched/features.h | 4 ++ - 6 files changed, 190 insertions(+), 3 deletions(-) + include/linux/sched.h | 5 ++ + init/Kconfig | 20 +++++++ + kernel/sched/core.c | 29 ++++++++++ + kernel/sched/debug.c | 3 + + kernel/sched/fair.c | 124 +++++++++++++++++++++++++++++++++++++++++- + 5 files changed, 180 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index df219c7cd6aa..a3538eacb095 100644 +index 764df627c243..f912da35db34 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h -@@ -556,6 +556,11 @@ struct sched_entity { +@@ -558,6 +558,11 @@ struct sched_entity { u64 sum_exec_runtime; u64 vruntime; u64 prev_sum_exec_runtime; @@ -26,11 +25,11 @@ index df219c7cd6aa..a3538eacb095 100644 + u64 burst_time; + u8 burst_score; +#endif // CONFIG_SCHED_BORE + s64 lag; + u64 slice; - u64 nr_migrations; - u64 prev_sleep_sum_runtime; diff --git a/init/Kconfig b/init/Kconfig -index 85a602dba878..bc69f062ca76 100644 +index 748a9491ca12..d10f1e6257cd 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1318,6 +1318,26 @@ config CHECKPOINT_RESTORE @@ -61,10 +60,10 @@ index 85a602dba878..bc69f062ca76 100644 bool "Automatic process group scheduling" select CGROUPS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 919edb034108..fd52870a002f 100644 +index 9db5f9ec9022..1f1e1f586407 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4420,6 +4420,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4418,6 +4418,21 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } @@ -86,16 +85,16 @@ index 919edb034108..fd52870a002f 100644 /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. -@@ -4438,6 +4453,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4434,6 +4449,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.vruntime = 0; - p->se.dur_avg = 0; - p->se.prev_sleep_sum_runtime = 0; +#ifdef CONFIG_SCHED_BORE + p->se.burst_time = 0; +#endif // CONFIG_SCHED_BORE - INIT_LIST_HEAD(&p->se.group_node); - RB_CLEAR_NODE(&p->se.latency_node); - + p->se.dur_avg = 0; + p->se.prev_sleep_sum_runtime = 0; + p->se.lag = 0; @@ -4664,6 +4682,10 @@ late_initcall(sched_core_sysctl_init); int sched_fork(unsigned long clone_flags, struct task_struct *p) { @@ -107,7 +106,7 @@ index 919edb034108..fd52870a002f 100644 /* * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external -@@ -9154,6 +9176,9 @@ void __init init_idle(struct task_struct *idle, int cpu) +@@ -9153,6 +9175,9 @@ void __init init_idle(struct task_struct *idle, int cpu) idle->__state = TASK_RUNNING; idle->se.exec_start = sched_clock(); @@ -117,22 +116,22 @@ index 919edb034108..fd52870a002f 100644 /* * PF_KTHREAD should already be set at this point; regardless, make it * look like a proper per-CPU kthread. 
-@@ -9821,6 +9846,10 @@ void __init sched_init(void) +@@ -9820,6 +9845,10 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 1.7.10 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification for 1.7-eevdf2 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 177934290ec4..2f40a238cdad 100644 +index fe9edfa43f65..3672df7c1f6a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c -@@ -547,6 +547,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -551,6 +551,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); @@ -143,7 +142,7 @@ index 177934290ec4..2f40a238cdad 100644 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 5ef893ce5734..590adb9a3e37 100644 +index c40b775452bc..1e4ca5419a11 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -156,14 +155,14 @@ index 5ef893ce5734..590adb9a3e37 100644 */ #include #include -@@ -140,6 +143,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; +@@ -141,6 +144,16 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#ifdef CONFIG_SCHED_BORE +unsigned int __read_mostly sched_bore = 1; +unsigned int __read_mostly sched_burst_penalty_scale = 1280; -+unsigned int __read_mostly sched_burst_granularity = 12; ++unsigned int __read_mostly sched_burst_granularity = 6; +unsigned int __read_mostly sched_burst_smoothness = 2; +static int three = 3; +static int sixty_four = 64; @@ -173,7 +172,7 @@ index 5ef893ce5734..590adb9a3e37 100644 int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) { -@@ -203,6 +216,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -204,6 +217,44 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -185,7 +184,7 @@ index 5ef893ce5734..590adb9a3e37 100644 + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, -+ .extra2 = &three, ++ .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_burst_penalty_scale", @@ -218,7 +217,7 @@ index 5ef893ce5734..590adb9a3e37 100644 { .procname = "sched_child_runs_first", .data = &sysctl_sched_child_runs_first, -@@ -978,6 +1029,39 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) +@@ -1182,6 +1233,39 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ @@ -258,21 +257,21 @@ index 5ef893ce5734..590adb9a3e37 100644 /* * Update the current task's runtime statistics. 
*/ -@@ -1007,6 +1091,13 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1211,6 +1295,13 @@ static void update_curr(struct cfs_rq *cfs_rq) curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq->exec_clock, delta_exec); +#ifdef CONFIG_SCHED_BORE + curr->burst_time += delta_exec; + update_burst_score(curr); -+ if (sched_bore & 1) ++ if (sched_bore) + curr->vruntime += calc_delta_fair_bscale(delta_exec, curr); + else +#endif // CONFIG_SCHED_BORE curr->vruntime += calc_delta_fair(delta_exec, curr); - update_min_vruntime(cfs_rq); - -@@ -5057,6 +5148,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + /* + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i +@@ -5283,6 +5374,11 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } @@ -284,7 +283,7 @@ index 5ef893ce5734..590adb9a3e37 100644 static int wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); -@@ -5101,7 +5197,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +@@ -5330,7 +5426,13 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) se = second; } @@ -299,7 +298,7 @@ index 5ef893ce5734..590adb9a3e37 100644 /* * Someone really wants this to run. If it's not unfair, run it. */ -@@ -6394,6 +6496,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -6615,6 +6717,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) util_est_dequeue(&rq->cfs, p); for_each_sched_entity(se) { @@ -309,7 +308,7 @@ index 5ef893ce5734..590adb9a3e37 100644 cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); -@@ -7856,7 +7961,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) +@@ -8070,7 +8175,12 @@ static unsigned long wakeup_gran(struct sched_entity *se) * */ static int @@ -321,12 +320,11 @@ index 5ef893ce5734..590adb9a3e37 100644 +#endif // CONFIG_SCHED_BORE { s64 gran, vdiff = curr->vruntime - se->vruntime; - s64 offset = wakeup_latency_gran(curr, se); -@@ -7876,12 +7986,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) - * chance to preempt current. - */ - gran = min_t(s64, gran, get_latency_max()); -- + +@@ -8078,11 +8188,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) + return -1; + + gran = wakeup_gran(se); +#ifdef CONFIG_SCHED_BORE + if (do_scale) gran = burst_scale(gran, se); +#endif // CONFIG_SCHED_BORE @@ -344,21 +342,7 @@ index 5ef893ce5734..590adb9a3e37 100644 static void set_last_buddy(struct sched_entity *se) { -@@ -7981,7 +8099,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - return; - - update_curr(cfs_rq_of(se)); -- if (wakeup_preempt_entity(se, pse) == 1) { -+#ifdef CONFIG_SCHED_BORE -+ if (wakeup_preempt_entity_bscale(se, pse, sched_bore & 2) == 1) -+#else // CONFIG_SCHED_BORE -+ if (wakeup_preempt_entity(se, pse) == 1) -+#endif // CONFIG_SCHED_BORE -+ { - /* - * Bias pick_next to pick the sched entity that is - * triggering this preemption. -@@ -8217,6 +8340,9 @@ static void yield_task_fair(struct rq *rq) +@@ -8430,6 +8549,9 @@ static void yield_task_fair(struct rq *rq) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se; @@ -368,21 +352,5 @@ index 5ef893ce5734..590adb9a3e37 100644 /* * Are we the only task in the tree? 
-diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index efdc29c42161..0f28637ce1aa 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -4,7 +4,11 @@ - * them to run sooner, but does not allow tons of sleepers to - * rip the spread apart. - */ -+#ifdef CONFIG_SCHED_BORE -+SCHED_FEAT(GENTLE_FAIR_SLEEPERS, false) -+#else // CONFIG_SCHED_BORE - SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) -+#endif // CONFIG_SCHED_BORE - - /* - * Place new tasks ahead so that they do not starve already running -- -2.39.2 +2.40.0.rc2 diff --git a/patches/0004-eevdf.patch b/patches/0004-eevdf.patch new file mode 100644 index 0000000..11213cb --- /dev/null +++ b/patches/0004-eevdf.patch @@ -0,0 +1,1326 @@ +From b6d3ec3be2639fe928a09b558e979c36b41ea63b Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sat, 11 Mar 2023 18:42:39 +0100 +Subject: [PATCH] EEVDF + +Ever since looking at the latency-nice patches, I've wondered if EEVDF would +not make more sense, and I did point Vincent at some older patches I had for +that (which is here his augmented rbtree thing comes from). + +Also, since I really dislike the dual tree, I also figured we could dynamically +switch between an augmented tree and not (and while I have code for that, +that's not included in this posting because with the current results I don't +think we actually need this). + +Anyway, since I'm somewhat under the weather, I spend last week desperately +trying to connect a small cluster of neurons in defiance of the snot overlord +and bring back the EEVDF patches from the dark crypts where they'd been +gathering cobwebs for the past 13 odd years. + +By friday they worked well enough, and this morning (because obviously I forgot +the weekend is ideal to run benchmarks) I ran a bunch of hackbenck, netperf, +tbench and sysbench -- there's a bunch of wins and losses, but nothing that +indicates a total fail. + +( in fact, some of the schbench results seem to indicate EEVDF schedules a lot + more consistent than CFS and has a bunch of latency wins ) + +( hackbench also doesn't show the augmented tree and generally more expensive + pick to be a loss, in fact it shows a slight win here ) + + hackbech load + cyclictest --policy other results: + + EEVDF CFS + + # Min Latencies: 00053 + LNICE(19) # Avg Latencies: 04350 + # Max Latencies: 76019 + + # Min Latencies: 00052 00053 + LNICE(0) # Avg Latencies: 00690 00687 + # Max Latencies: 14145 13913 + + # Min Latencies: 00019 + LNICE(-19) # Avg Latencies: 00261 + # Max Latencies: 05642 + +The nice -19 numbers aren't as pretty as Vincent's, but at the end I was going +cross-eyed from staring at tree prints and I just couldn't figure out where it +was going side-ways. + +There's definitely more benchmarking/tweaking to be done (0-day already +reported a stress-ng loss), but if we can pull this off we can delete a whole +much of icky heuristics code. EEVDF is a much better defined policy than what +we currently have. 
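
The pick rule this patch implements can be illustrated without any kernel context. Below is a minimal user-space sketch under simplifying assumptions (a flat array instead of the augmented rbtree, floating point instead of fixed-point weights, made-up numbers); it only demonstrates the two criteria spelled out later in the patch: a task is eligible while its lag is non-negative (v_i <= V, with V the load-weighted average vruntime), and among eligible tasks the earliest virtual deadline v_i + r_i/w_i wins. It is not the kernel code.

/*
 * Toy model of the EEVDF pick (illustration only; the real patch uses an
 * augmented rbtree and integer arithmetic).
 */
#include <stddef.h>
#include <stdio.h>

struct toy_se {
	double vruntime;	/* v_i */
	double weight;		/* w_i */
	double slice;		/* requested service r_i */
};

/* V = \Sum v_i * w_i / \Sum w_i */
static double toy_avg_vruntime(const struct toy_se *q, size_t n)
{
	double num = 0.0, den = 0.0;
	size_t i;

	for (i = 0; i < n; i++) {
		num += q[i].vruntime * q[i].weight;
		den += q[i].weight;
	}
	return den ? num / den : 0.0;
}

static const struct toy_se *toy_pick_eevdf(const struct toy_se *q, size_t n)
{
	double V = toy_avg_vruntime(q, n);
	const struct toy_se *best = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		double deadline = q[i].vruntime + q[i].slice / q[i].weight;

		if (q[i].vruntime > V)	/* lag < 0: not eligible */
			continue;
		if (!best ||
		    deadline < best->vruntime + best->slice / best->weight)
			best = &q[i];
	}
	return best;
}

int main(void)
{
	/* three runnable tasks; the numbers are arbitrary */
	struct toy_se q[] = {
		{ .vruntime = 100.0, .weight = 1.0, .slice = 3.0 },
		{ .vruntime =  98.0, .weight = 2.0, .slice = 6.0 },
		{ .vruntime = 105.0, .weight = 1.0, .slice = 1.0 },
	};
	const struct toy_se *se = toy_pick_eevdf(q, sizeof(q) / sizeof(q[0]));

	printf("picked vruntime=%.1f\n", se ? se->vruntime : -1.0);
	return 0;
}

Here V = 401/4 = 100.25, so the task at vruntime 105 is skipped even though it has the shortest deadline (it is not yet owed service), and the pick falls to the eligible task with deadline 98 + 6/2 = 101.
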
+ +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/cgroup-v2.rst | 10 + + include/linux/rbtree_augmented.h | 26 ++ + include/linux/sched.h | 8 + + include/linux/sched/prio.h | 27 ++ + include/uapi/linux/sched.h | 4 +- + include/uapi/linux/sched/types.h | 19 + + init/init_task.c | 1 + + kernel/sched/core.c | 66 ++++ + kernel/sched/debug.c | 39 +- + kernel/sched/fair.c | 486 ++++++++++++++++++++---- + kernel/sched/features.h | 10 +- + kernel/sched/sched.h | 12 + + tools/include/uapi/linux/sched.h | 4 +- + 13 files changed, 614 insertions(+), 98 deletions(-) + +diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst +index 74cec76be9f2..2e511d4a4c6a 100644 +--- a/Documentation/admin-guide/cgroup-v2.rst ++++ b/Documentation/admin-guide/cgroup-v2.rst +@@ -1118,6 +1118,16 @@ All time durations are in microseconds. + values similar to the sched_setattr(2). This maximum utilization + value is used to clamp the task specific maximum utilization clamp. + ++ cpu.latency.nice ++ A read-write single value file which exists on non-root ++ cgroups. The default is "0". ++ ++ The nice value is in the range [-20, 19]. ++ ++ This interface file allows reading and setting latency using the ++ same values used by sched_setattr(2). The latency_nice of a group is ++ used to limit the impact of the latency_nice of a task outside the ++ group. + + + Memory +diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h +index d1c53e9d8c75..a78e692a9ff5 100644 +--- a/include/linux/rbtree_augmented.h ++++ b/include/linux/rbtree_augmented.h +@@ -60,6 +60,32 @@ rb_insert_augmented_cached(struct rb_node *node, + rb_insert_augmented(node, &root->rb_root, augment); + } + ++static __always_inline struct rb_node * ++rb_add_augmented_cached(struct rb_node *node, struct rb_root_cached *tree, ++ bool (*less)(struct rb_node *, const struct rb_node *), ++ const struct rb_augment_callbacks *augment) ++{ ++ struct rb_node **link = &tree->rb_root.rb_node; ++ struct rb_node *parent = NULL; ++ bool leftmost = true; ++ ++ while (*link) { ++ parent = *link; ++ if (less(node, parent)) { ++ link = &parent->rb_left; ++ } else { ++ link = &parent->rb_right; ++ leftmost = false; ++ } ++ } ++ ++ rb_link_node(node, parent, link); ++ augment->propagate(parent, NULL); /* suboptimal */ ++ rb_insert_augmented_cached(node, tree, leftmost, augment); ++ ++ return leftmost ? 
node : NULL; ++} ++ + /* + * Template for declaring augmented rbtree callbacks (generic case) + * +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 28ce1be0ba47..764df627c243 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -548,6 +548,9 @@ struct sched_entity { + /* For load-balancing: */ + struct load_weight load; + struct rb_node run_node; ++ u64 deadline; ++ u64 min_deadline; ++ + struct list_head group_node; + unsigned int on_rq; + +@@ -555,6 +558,8 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 vruntime; + u64 prev_sum_exec_runtime; ++ s64 lag; ++ u64 slice; + + u64 nr_migrations; + u64 prev_sleep_sum_runtime; +@@ -571,6 +576,8 @@ struct sched_entity { + /* cached value of my_q->h_nr_running */ + unsigned long runnable_weight; + #endif ++ /* preemption offset in ns */ ++ long latency_offset; + + #ifdef CONFIG_SMP + /* +@@ -787,6 +794,7 @@ struct task_struct { + int static_prio; + int normal_prio; + unsigned int rt_priority; ++ int latency_prio; + + struct sched_entity se; + struct sched_rt_entity rt; +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index ab83d85e1183..be79503d86af 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -42,4 +42,31 @@ static inline long rlimit_to_nice(long prio) + return (MAX_NICE - prio + 1); + } + ++/* ++ * Latency nice is meant to provide scheduler hints about the relative ++ * latency requirements of a task with respect to other tasks. ++ * Thus a task with latency_nice == 19 can be hinted as the task with no ++ * latency requirements, in contrast to the task with latency_nice == -20 ++ * which should be given priority in terms of lower latency. ++ */ ++#define MAX_LATENCY_NICE 19 ++#define MIN_LATENCY_NICE -20 ++ ++#define LATENCY_NICE_WIDTH \ ++ (MAX_LATENCY_NICE - MIN_LATENCY_NICE + 1) ++ ++/* ++ * Default tasks should be treated as a task with latency_nice = 0. ++ */ ++#define DEFAULT_LATENCY_NICE 0 ++#define DEFAULT_LATENCY_PRIO (DEFAULT_LATENCY_NICE + LATENCY_NICE_WIDTH/2) ++ ++/* ++ * Convert user-nice values [ -20 ... 0 ... 19 ] ++ * to static latency [ 0..39 ], ++ * and back. ++ */ ++#define NICE_TO_LATENCY(nice) ((nice) + DEFAULT_LATENCY_PRIO) ++#define LATENCY_TO_NICE(prio) ((prio) - DEFAULT_LATENCY_PRIO) ++ + #endif /* _LINUX_SCHED_PRIO_H */ +diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +index 3bac0a8ceab2..b2e932c25be6 100644 +--- a/include/uapi/linux/sched.h ++++ b/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h +index f2c4589d4dbf..db1e8199e8c8 100644 +--- a/include/uapi/linux/sched/types.h ++++ b/include/uapi/linux/sched/types.h +@@ -10,6 +10,7 @@ struct sched_param { + + #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ ++#define SCHED_ATTR_SIZE_VER2 60 /* add: latency_nice */ + + /* + * Extended scheduling parameters data structure. 
+@@ -98,6 +99,22 @@ struct sched_param { + * scheduled on a CPU with no more capacity than the specified value. + * + * A task utilization boundary can be reset by setting the attribute to -1. ++ * ++ * Latency Tolerance Attributes ++ * =========================== ++ * ++ * A subset of sched_attr attributes allows to specify the relative latency ++ * requirements of a task with respect to the other tasks running/queued in the ++ * system. ++ * ++ * @ sched_latency_nice task's latency_nice value ++ * ++ * The latency_nice of a task can have any value in a range of ++ * [MIN_LATENCY_NICE..MAX_LATENCY_NICE]. ++ * ++ * A task with latency_nice with the value of LATENCY_NICE_MIN can be ++ * taken for a task requiring a lower latency as opposed to the task with ++ * higher latency_nice. + */ + struct sched_attr { + __u32 size; +@@ -120,6 +137,8 @@ struct sched_attr { + __u32 sched_util_min; + __u32 sched_util_max; + ++ /* latency requirement hints */ ++ __s32 sched_latency_nice; + }; + + #endif /* _UAPI_LINUX_SCHED_TYPES_H */ +diff --git a/init/init_task.c b/init/init_task.c +index ff6c4b9bfe6b..071deff8dbd1 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -78,6 +78,7 @@ struct task_struct init_task + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++ .latency_prio = DEFAULT_LATENCY_PRIO, + .policy = SCHED_NORMAL, + .cpus_ptr = &init_task.cpus_mask, + .user_cpus_ptr = NULL, +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 5237639786b7..9db5f9ec9022 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -1283,6 +1283,11 @@ static void set_load_weight(struct task_struct *p, bool update_load) + } + } + ++static void set_latency_offset(struct task_struct *p) ++{ ++ p->se.latency_offset = calc_latency_offset(p->latency_prio); ++} ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * Serializes updates of utilization clamp values +@@ -4431,8 +4436,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.vruntime = 0; + p->se.dur_avg = 0; + p->se.prev_sleep_sum_runtime = 0; ++ p->se.lag = 0; + INIT_LIST_HEAD(&p->se.group_node); + ++ set_latency_offset(p); ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; + #endif +@@ -4684,6 +4692,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); + ++ p->latency_prio = NICE_TO_LATENCY(0); ++ set_latency_offset(p); ++ + /* + * We don't need the reset flag anymore after the fork. 
It has + * fulfilled its duty: +@@ -7446,6 +7457,15 @@ static void __setscheduler_params(struct task_struct *p, + set_load_weight(p, true); + } + ++static void __setscheduler_latency(struct task_struct *p, ++ const struct sched_attr *attr) ++{ ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { ++ p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice); ++ set_latency_offset(p); ++ } ++} ++ + /* + * Check the target process has a UID that matches the current process's: + */ +@@ -7586,6 +7606,13 @@ static int __sched_setscheduler(struct task_struct *p, + return retval; + } + ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) { ++ if (attr->sched_latency_nice > MAX_LATENCY_NICE) ++ return -EINVAL; ++ if (attr->sched_latency_nice < MIN_LATENCY_NICE) ++ return -EINVAL; ++ } ++ + if (pi) + cpuset_read_lock(); + +@@ -7620,6 +7647,9 @@ static int __sched_setscheduler(struct task_struct *p, + goto change; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) + goto change; ++ if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE && ++ attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio)) ++ goto change; + + p->sched_reset_on_fork = reset_on_fork; + retval = 0; +@@ -7708,6 +7738,7 @@ static int __sched_setscheduler(struct task_struct *p, + __setscheduler_params(p, attr); + __setscheduler_prio(p, newprio); + } ++ __setscheduler_latency(p, attr); + __setscheduler_uclamp(p, attr); + + if (queued) { +@@ -7918,6 +7949,9 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + size < SCHED_ATTR_SIZE_VER1) + return -EINVAL; + ++ if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) && ++ size < SCHED_ATTR_SIZE_VER2) ++ return -EINVAL; + /* + * XXX: Do we want to be lenient like existing syscalls; or do we want + * to be strict and return an error on out-of-bounds values? 
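
To show what the sched_attr plumbing above means for user space, here is a hedged sketch of setting the latency hint on the calling thread. It assumes the struct layout from the types.h hunk on top of the unchanged mainline sched_attr fields, uses the raw syscall because glibc ships no sched_setattr wrapper, and the value -15 is an arbitrary choice inside the documented [-20, 19] range.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* mirrors include/uapi/linux/sched/types.h with this patch applied */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE */
	uint64_t sched_deadline;
	uint64_t sched_period;
	uint32_t sched_util_min;	/* utilization clamps */
	uint32_t sched_util_max;
	int32_t  sched_latency_nice;	/* new in SCHED_ATTR_SIZE_VER2 */
};

#define SCHED_FLAG_LATENCY_NICE	0x80
#define SCHED_ATTR_SIZE_VER2	60

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = SCHED_ATTR_SIZE_VER2;
	attr.sched_policy = 0;				/* SCHED_NORMAL */
	attr.sched_flags = SCHED_FLAG_LATENCY_NICE;
	attr.sched_latency_nice = -15;			/* range [-20, 19] */

	/* pid 0 targets the calling thread */
	if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0) {
		perror("sched_setattr");
		return 1;
	}
	printf("latency_nice set to %d\n", attr.sched_latency_nice);
	return 0;
}

Reading the value back goes through sched_getattr, which the next hunk extends to report sched_latency_nice.
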
+@@ -8155,6 +8189,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; + ++ kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio); ++ + #ifdef CONFIG_UCLAMP_TASK + /* + * This could race with another potential updater, but this is fine +@@ -11027,6 +11063,25 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, + { + return sched_group_set_idle(css_tg(css), idle); + } ++ ++static s64 cpu_latency_nice_read_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft) ++{ ++ return LATENCY_TO_NICE(css_tg(css)->latency_prio); ++} ++ ++static int cpu_latency_nice_write_s64(struct cgroup_subsys_state *css, ++ struct cftype *cft, s64 nice) ++{ ++ int prio; ++ ++ if (nice < MIN_LATENCY_NICE || nice > MAX_LATENCY_NICE) ++ return -ERANGE; ++ ++ prio = NICE_TO_LATENCY(nice); ++ ++ return sched_group_set_latency(css_tg(css), prio); ++} + #endif + + static struct cftype cpu_legacy_files[] = { +@@ -11041,6 +11096,11 @@ static struct cftype cpu_legacy_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { +@@ -11258,6 +11318,12 @@ static struct cftype cpu_files[] = { + .read_s64 = cpu_idle_read_s64, + .write_s64 = cpu_idle_write_s64, + }, ++ { ++ .name = "latency.nice", ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .read_s64 = cpu_latency_nice_read_s64, ++ .write_s64 = cpu_latency_nice_write_s64, ++ }, + #endif + #ifdef CONFIG_CFS_BANDWIDTH + { +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 8d64fba16cfe..fe9edfa43f65 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -535,9 +535,13 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, " %15s %5d %9Ld.%06ld %9Ld %5d ", ++ SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + p->comm, task_pid_nr(p), + SPLIT_NS(p->se.vruntime), ++ entity_eligible(cfs_rq_of(&p->se), &p->se) ? 
'E' : 'N', ++ SPLIT_NS(p->se.deadline), ++ SPLIT_NS(p->se.slice), ++ SPLIT_NS(p->se.sum_exec_runtime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); + +@@ -580,10 +584,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) + + void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + { +- s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, +- spread, rq0_min_vruntime, spread0; ++ s64 left_vruntime = -1, min_vruntime, right_vruntime = -1, spread; ++ struct sched_entity *last, *first; + struct rq *rq = cpu_rq(cpu); +- struct sched_entity *last; + unsigned long flags; + + #ifdef CONFIG_FAIR_GROUP_SCHED +@@ -597,26 +600,25 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SPLIT_NS(cfs_rq->exec_clock)); + + raw_spin_rq_lock_irqsave(rq, flags); +- if (rb_first_cached(&cfs_rq->tasks_timeline)) +- MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; ++ first = __pick_first_entity(cfs_rq); ++ if (first) ++ left_vruntime = first->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) +- max_vruntime = last->vruntime; ++ right_vruntime = last->vruntime; + min_vruntime = cfs_rq->min_vruntime; +- rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; + raw_spin_rq_unlock_irqrestore(rq, flags); +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", +- SPLIT_NS(MIN_vruntime)); ++ ++ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "left_vruntime", ++ SPLIT_NS(left_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", +- SPLIT_NS(max_vruntime)); +- spread = max_vruntime - MIN_vruntime; +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", +- SPLIT_NS(spread)); +- spread0 = min_vruntime - rq0_min_vruntime; +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", +- SPLIT_NS(spread0)); ++ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "avg_vruntime", ++ SPLIT_NS(avg_vruntime(cfs_rq))); ++ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "right_vruntime", ++ SPLIT_NS(right_vruntime)); ++ spread = right_vruntime - left_vruntime; ++ SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", + cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); +@@ -1044,6 +1046,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + #endif + P(policy); + P(prio); ++ P(latency_prio); + if (task_has_dl_policy(p)) { + P(dl.runtime); + P(dl.deadline); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 84254f52c56a..c40b775452bc 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -47,6 +47,7 @@ + #include + #include + #include ++#include + + #include + +@@ -619,13 +620,134 @@ static inline bool entity_before(struct sched_entity *a, + return (s64)(a->vruntime - b->vruntime) < 0; + } + ++static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ return (s64)(se->vruntime - cfs_rq->min_vruntime); ++} ++ + #define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + ++/* ++ * Compute virtual time from the per-task service numbers: ++ * ++ * Fair schedulers conserve lag: \Sum lag_i = 0 ++ * ++ * lag_i = S - s_i = w_i * (V - v_i) ++ * ++ * \Sum lag_i = 0 -> \Sum w_i * (V - v_i) = V * \Sum w_i - \Sum w_i * v_i = 0 ++ * ++ * From which we solve V: ++ * ++ * \Sum v_i * w_i ++ * V = -------------- ++ * \Sum w_i ++ * ++ * However, since v_i is u64, and the multiplcation could easily overflow ++ * transform it into a relative 
form that uses smaller quantities: ++ * ++ * Substitute: v_i == (v_i - v) + v ++ * ++ * \Sum ((v_i - v) + v) * w_i \Sum (v_i - v) * w_i ++ * V = -------------------------- = -------------------- + v ++ * \Sum w_i \Sum w_i ++ * ++ * min_vruntime = v ++ * avg_vruntime = \Sum (v_i - v) * w_i ++ * cfs_rq->load = \Sum w_i ++ * ++ * Since min_vruntime is a monotonic increasing variable that closely tracks ++ * the per-task service, these deltas: (v_i - v), will be in the order of the ++ * maximal (virtual) lag induced in the system due to quantisation. ++ */ ++static void ++avg_vruntime_add(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ s64 key = entity_key(cfs_rq, se); ++ cfs_rq->avg_vruntime += key * se->load.weight; ++ cfs_rq->avg_load += se->load.weight; ++} ++ ++static void ++avg_vruntime_sub(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ s64 key = entity_key(cfs_rq, se); ++ cfs_rq->avg_vruntime -= key * se->load.weight; ++ cfs_rq->avg_load -= se->load.weight; ++} ++ ++static inline ++void avg_vruntime_update(struct cfs_rq *cfs_rq, s64 delta) ++{ ++ /* ++ * v' = v + d ==> avg_vruntime' = avg_runtime - d*avg_load ++ */ ++ cfs_rq->avg_vruntime -= cfs_rq->avg_load * delta; ++} ++ ++u64 avg_vruntime(struct cfs_rq *cfs_rq) ++{ ++ struct sched_entity *curr = cfs_rq->curr; ++ s64 lag = cfs_rq->avg_vruntime; ++ long load = cfs_rq->avg_load; ++ ++ if (curr && curr->on_rq) { ++ lag += entity_key(cfs_rq, curr) * curr->load.weight; ++ load += curr->load.weight; ++ } ++ ++ if (load) ++ lag = div_s64(lag, load); ++ ++ return cfs_rq->min_vruntime + lag; ++} ++ ++/* ++ * Entity is eligible once it received less service than it ought to have, ++ * eg. lag >= 0. ++ * ++ * lag_i = S - s_i = w_i*(V - w_i) ++ * ++ * lag_i >= 0 -> V >= v_i ++ * ++ * \Sum (v_i - v)*w_i ++ * V = ------------------ + v ++ * \Sum w_i ++ * ++ * lag_i >= 0 -> \Sum (v_i - v)*w_i >= (v_i - v)*(\Sum w_i) ++ */ ++int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se) ++{ ++ struct sched_entity *curr = cfs_rq->curr; ++ s64 avg_vruntime = cfs_rq->avg_vruntime; ++ long avg_load = cfs_rq->avg_load; ++ ++ if (curr && curr->on_rq) { ++ avg_vruntime += entity_key(cfs_rq, curr) * curr->load.weight; ++ avg_load += curr->load.weight; ++ } ++ ++ return avg_vruntime >= entity_key(cfs_rq, se) * avg_load; ++} ++ ++static u64 __update_min_vruntime(struct cfs_rq *cfs_rq, u64 vruntime) ++{ ++ u64 min_vruntime = cfs_rq->min_vruntime; ++ /* ++ * open coded max_vruntime() to allow updating avg_vruntime ++ */ ++ s64 delta = (s64)(vruntime - min_vruntime); ++ if (delta > 0) { ++ avg_vruntime_update(cfs_rq, delta); ++ min_vruntime = vruntime; ++ } ++ return min_vruntime; ++} ++ + static void update_min_vruntime(struct cfs_rq *cfs_rq) + { ++ struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; +- struct rb_node *leftmost = rb_first_cached(&cfs_rq->tasks_timeline); + + u64 vruntime = cfs_rq->min_vruntime; + +@@ -636,9 +758,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + curr = NULL; + } + +- if (leftmost) { /* non-empty tree */ +- struct sched_entity *se = __node_2_se(leftmost); +- ++ if (se) { + if (!curr) + vruntime = se->vruntime; + else +@@ -647,7 +767,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + + /* ensure we never gain time by being placed backwards. 
*/ + u64_u32_store(cfs_rq->min_vruntime, +- max_vruntime(cfs_rq->min_vruntime, vruntime)); ++ __update_min_vruntime(cfs_rq, vruntime)); + } + + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -655,17 +775,51 @@ static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) + return entity_before(__node_2_se(a), __node_2_se(b)); + } + ++#define deadline_gt(field, lse, rse) ({ (s64)((lse)->field - (rse)->field) > 0; }) ++ ++static inline void __update_min_deadline(struct sched_entity *se, struct rb_node *node) ++{ ++ if (node) { ++ struct sched_entity *rse = __node_2_se(node); ++ if (deadline_gt(min_deadline, se, rse)) ++ se->min_deadline = rse->min_deadline; ++ } ++} ++ ++/* ++ * se->min_deadline = min(se->deadline, left->min_deadline, right->min_deadline) ++ */ ++static inline bool min_deadline_update(struct sched_entity *se, bool exit) ++{ ++ u64 old_min_deadline = se->min_deadline; ++ struct rb_node *node = &se->run_node; ++ ++ se->min_deadline = se->deadline; ++ __update_min_deadline(se, node->rb_right); ++ __update_min_deadline(se, node->rb_left); ++ ++ return se->min_deadline == old_min_deadline; ++} ++ ++RB_DECLARE_CALLBACKS(static, min_deadline_cb, struct sched_entity, ++ run_node, min_deadline, min_deadline_update); ++ + /* + * Enqueue an entity into the rb-tree: + */ + static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); ++ avg_vruntime_add(cfs_rq, se); ++ se->min_deadline = se->deadline; ++ rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, ++ __entity_less, &min_deadline_cb); + } + + static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- rb_erase_cached(&se->run_node, &cfs_rq->tasks_timeline); ++ rb_erase_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, ++ &min_deadline_cb); ++ avg_vruntime_sub(cfs_rq, se); + } + + struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +@@ -688,6 +842,101 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) + return __node_2_se(next); + } + ++static struct sched_entity *pick_cfs(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ struct sched_entity *left = __pick_first_entity(cfs_rq); ++ ++ /* ++ * If curr is set we have to see if its left of the leftmost entity ++ * still in the tree, provided there was anything in the tree at all. ++ */ ++ if (!left || (curr && entity_before(curr, left))) ++ left = curr; ++ ++ return left; ++} ++ ++/* ++ * Earliest Eligible Virtual Deadline First ++ * ++ * In order to provide latency guarantees for different request sizes ++ * EEVDF selects the best runnable task from two criteria: ++ * ++ * 1) the task must be eligible (must be owed service) ++ * ++ * 2) from those tasks that meet 1), we select the one ++ * with the earliest virtual deadline. ++ * ++ * We can do this in O(log n) time due to an augmented RB-tree. The ++ * tree keeps the entries sorted on service, but also functions as a ++ * heap based on the deadline by keeping: ++ * ++ * se->min_deadline = min(se->deadline, se->{left,right}->min_deadline) ++ * ++ * Which allows an EDF like search on (sub)trees. 
++ */ ++static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) ++{ ++ struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; ++ struct sched_entity *curr = cfs_rq->curr; ++ struct sched_entity *best = NULL; ++ ++ if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) ++ curr = NULL; ++ ++ while (node) { ++ struct sched_entity *se = __node_2_se(node); ++ ++ /* ++ * If this entity is not eligible, try the left subtree. ++ * ++ * XXX: would it be worth it to do the single division for ++ * avg_vruntime() once, instead of the multiplication ++ * in entity_eligible() O(log n) times? ++ */ ++ if (!entity_eligible(cfs_rq, se)) { ++ node = node->rb_left; ++ continue; ++ } ++ ++ /* ++ * If this entity has an earlier deadline than the previous ++ * best, take this one. If it also has the earliest deadline ++ * of its subtree, we're done. ++ */ ++ if (!best || deadline_gt(deadline, best, se)) { ++ best = se; ++ if (best->deadline == best->min_deadline) ++ break; ++ } ++ ++ /* ++ * If the earlest deadline in this subtree is in the fully ++ * eligible left half of our space, go there. ++ */ ++ if (node->rb_left && ++ __node_2_se(node->rb_left)->min_deadline == se->min_deadline) { ++ node = node->rb_left; ++ continue; ++ } ++ ++ node = node->rb_right; ++ } ++ ++ if (!best || (curr && deadline_gt(deadline, best, curr))) ++ best = curr; ++ ++ if (unlikely(!best)) { ++ struct sched_entity *left = __pick_first_entity(cfs_rq); ++ if (left) { ++ pr_err("EEVDF scheduling fail, picking leftmost\n"); ++ return left; ++ } ++ } ++ ++ return best; ++} ++ + #ifdef CONFIG_SCHED_DEBUG + struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + { +@@ -721,6 +970,14 @@ int sched_update_scaling(void) + } + #endif + ++long calc_latency_offset(int prio) ++{ ++ u32 weight = sched_prio_to_weight[prio]; ++ u64 base = sysctl_sched_min_granularity; ++ ++ return div_u64(base << SCHED_FIXEDPOINT_SHIFT, weight); ++} ++ + /* + * delta /= w + */ +@@ -797,14 +1054,30 @@ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) + return slice; + } + +-/* +- * We calculate the vruntime slice of a to-be-inserted task. +- * +- * vs = s/w +- */ +-static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) ++static void set_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) + { +- return calc_delta_fair(sched_slice(cfs_rq, se), se); ++ if (sched_feat(EEVDF)) { ++ /* ++ * For EEVDF the virtual time slope is determined by w_i (iow. ++ * nice) while the request time r_i is determined by ++ * latency-nice. ++ */ ++ se->slice = se->latency_offset; ++ } else { ++ /* ++ * When many tasks blow up the sched_period; it is possible ++ * that sched_slice() reports unusually large results (when ++ * many tasks are very light for example). Therefore impose a ++ * maximum. ++ */ ++ se->slice = min_t(u64, sched_slice(cfs_rq, se), sysctl_sched_latency); ++ } ++ ++ /* ++ * vd_i = ve_i + r_i / w_i ++ */ ++ se->deadline = se->vruntime + calc_delta_fair(se->slice, se); ++ se->min_deadline = se->deadline; + } + + #include "pelt.h" +@@ -939,6 +1212,13 @@ static void update_curr(struct cfs_rq *cfs_rq) + schedstat_add(cfs_rq->exec_clock, delta_exec); + + curr->vruntime += calc_delta_fair(delta_exec, curr); ++ /* ++ * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i ++ * this is probably good enough. 
++ */ ++ if ((s64)(curr->vruntime - curr->deadline) > 0) ++ set_slice(cfs_rq, curr); ++ + update_min_vruntime(cfs_rq); + + if (entity_is_task(curr)) { +@@ -3340,6 +3620,8 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); ++ else ++ avg_vruntime_sub(cfs_rq, se); + update_load_sub(&cfs_rq->load, se->load.weight); + } + dequeue_load_avg(cfs_rq, se); +@@ -3355,9 +3637,11 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + #endif + + enqueue_load_avg(cfs_rq, se); +- if (se->on_rq) ++ if (se->on_rq) { + update_load_add(&cfs_rq->load, se->load.weight); +- ++ if (cfs_rq->curr != se) ++ avg_vruntime_add(cfs_rq, se); ++ } + } + + void reweight_task(struct task_struct *p, int prio) +@@ -4669,49 +4953,49 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) + static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) + { +- u64 vruntime = cfs_rq->min_vruntime; +- u64 sleep_time; ++ u64 vruntime = avg_vruntime(cfs_rq); + +- /* +- * The 'current' period is already promised to the current tasks, +- * however the extra weight of the new task will slow them down a +- * little, place the new task so that it fits in the slot that +- * stays open at the end. +- */ +- if (initial && sched_feat(START_DEBIT)) +- vruntime += sched_vslice(cfs_rq, se); ++ if (sched_feat(PRESERVE_LAG)) ++ vruntime -= se->lag; + +- /* sleeps up to a single latency don't count. */ +- if (!initial) { +- unsigned long thresh; ++ if (sched_feat(FAIR_SLEEPERS)) { ++// u64 sleep_time; + +- if (se_is_idle(se)) +- thresh = sysctl_sched_min_granularity; +- else +- thresh = sysctl_sched_latency; ++ /* sleeps up to a single latency don't count. */ ++ if (!initial) { ++ unsigned long thresh = TICK_NSEC; ++ ++ if (!sched_feat(EEVDF)) { ++ if (se_is_idle(se)) ++ thresh = sysctl_sched_min_granularity; ++ else ++ thresh = sysctl_sched_latency; ++ } ++ ++ /* ++ * Halve their sleep time's effect, to allow ++ * for a gentler effect of sleepers: ++ */ ++ if (sched_feat(GENTLE_FAIR_SLEEPERS)) ++ thresh >>= 1; ++ ++ vruntime -= calc_delta_fair(thresh, se); ++ } + + /* +- * Halve their sleep time's effect, to allow +- * for a gentler effect of sleepers: ++ * Pull vruntime of the entity being placed to the base level of ++ * cfs_rq, to prevent boosting it if placed backwards. If the entity ++ * slept for a long time, don't even try to compare its vruntime with ++ * the base as it may be too far off and the comparison may get ++ * inversed due to s64 overflow. ++ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; ++ if ((s64)sleep_time < 60LL * NSEC_PER_SEC) + */ +- if (sched_feat(GENTLE_FAIR_SLEEPERS)) +- thresh >>= 1; +- +- vruntime -= thresh; ++ vruntime = max_vruntime(se->vruntime, vruntime); + } + +- /* +- * Pull vruntime of the entity being placed to the base level of +- * cfs_rq, to prevent boosting it if placed backwards. If the entity +- * slept for a long time, don't even try to compare its vruntime with +- * the base as it may be too far off and the comparison may get +- * inversed due to s64 overflow. 
+- */ +- sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start; +- if ((s64)sleep_time > 60LL * NSEC_PER_SEC) +- se->vruntime = vruntime; +- else +- se->vruntime = max_vruntime(se->vruntime, vruntime); ++ se->vruntime = vruntime; ++ set_slice(cfs_rq, se); + } + + static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +@@ -4879,6 +5163,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + clear_buddies(cfs_rq, se); + ++ if (sched_feat(PRESERVE_LAG) && (flags & DEQUEUE_SLEEP)) ++ se->lag = avg_vruntime(cfs_rq) - se->vruntime; ++ + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; +@@ -4917,19 +5204,20 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + static void + check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + { +- unsigned long ideal_runtime, delta_exec; ++ unsigned long delta_exec; + struct sched_entity *se; + s64 delta; + +- /* +- * When many tasks blow up the sched_period; it is possible that +- * sched_slice() reports unusually large results (when many tasks are +- * very light for example). Therefore impose a maximum. +- */ +- ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency); ++ if (sched_feat(EEVDF)) { ++ if (pick_eevdf(cfs_rq) != curr) ++ goto preempt; ++ ++ return; ++ } + + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; +- if (delta_exec > ideal_runtime) { ++ if (delta_exec > curr->slice) { ++preempt: + resched_curr(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get +@@ -4953,7 +5241,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + if (delta < 0) + return; + +- if (delta > ideal_runtime) ++ if (delta > curr->slice) + resched_curr(rq_of(cfs_rq)); + } + +@@ -5008,17 +5296,20 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + static struct sched_entity * + pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) + { +- struct sched_entity *left = __pick_first_entity(cfs_rq); +- struct sched_entity *se; ++ struct sched_entity *left, *se; + +- /* +- * If curr is set we have to see if its left of the leftmost entity +- * still in the tree, provided there was anything in the tree at all. +- */ +- if (!left || (curr && entity_before(curr, left))) +- left = curr; ++ if (sched_feat(EEVDF)) { ++ /* ++ * Enabling NEXT_BUDDY will affect latency but not fairness. 
++ */ ++ if (sched_feat(NEXT_BUDDY) && ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ return cfs_rq->next; + +- se = left; /* ideally we run the leftmost entity */ ++ return pick_eevdf(cfs_rq); ++ } ++ ++ se = left = pick_cfs(cfs_rq, curr); + + /* + * Avoid running the skip buddy, if running something else can +@@ -6113,13 +6404,12 @@ static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {} + static void hrtick_start_fair(struct rq *rq, struct task_struct *p) + { + struct sched_entity *se = &p->se; +- struct cfs_rq *cfs_rq = cfs_rq_of(se); + + SCHED_WARN_ON(task_rq(p) != rq); + + if (rq->cfs.h_nr_running > 1) { +- u64 slice = sched_slice(cfs_rq, se); + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ u64 slice = se->slice; + s64 delta = slice - ran; + + if (delta < 0) { +@@ -7891,7 +8181,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + if (cse_is_idle != pse_is_idle) + return; + +- update_curr(cfs_rq_of(se)); ++ cfs_rq = cfs_rq_of(se); ++ update_curr(cfs_rq); ++ ++ if (sched_feat(EEVDF)) { ++ /* ++ * XXX pick_eevdf(cfs_rq) != se ? ++ */ ++ if (pick_eevdf(cfs_rq) == pse) ++ goto preempt; ++ ++ return; ++ } ++ + if (wakeup_preempt_entity(se, pse) == 1) { + /* + * Bias pick_next to pick the sched entity that is +@@ -8137,7 +8439,7 @@ static void yield_task_fair(struct rq *rq) + + clear_buddies(cfs_rq, se); + +- if (curr->policy != SCHED_BATCH) { ++ if (sched_feat(EEVDF) || curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. +@@ -8150,6 +8452,8 @@ static void yield_task_fair(struct rq *rq) + */ + rq_clock_skip_update(rq); + } ++ if (sched_feat(EEVDF)) ++ se->deadline += calc_delta_fair(se->slice, se); + + set_skip_buddy(se); + } +@@ -11902,8 +12206,8 @@ static void rq_offline_fair(struct rq *rq) + static inline bool + __entity_slice_used(struct sched_entity *se, int min_nr_tasks) + { +- u64 slice = sched_slice(cfs_rq_of(se), se); + u64 rtime = se->sum_exec_runtime - se->prev_sum_exec_runtime; ++ u64 slice = se->slice; + + return (rtime * min_nr_tasks > slice); + } +@@ -12330,6 +12634,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) + goto err; + + tg->shares = NICE_0_LOAD; ++ tg->latency_prio = DEFAULT_LATENCY_PRIO; + + init_cfs_bandwidth(tg_cfs_bandwidth(tg)); + +@@ -12428,6 +12733,9 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + } + + se->my_q = cfs_rq; ++ ++ se->latency_offset = calc_latency_offset(tg->latency_prio); ++ + /* guarantee group entities always have weight */ + update_load_set(&se->load, NICE_0_LOAD); + se->parent = parent; +@@ -12558,6 +12866,34 @@ int sched_group_set_idle(struct task_group *tg, long idle) + return 0; + } + ++int sched_group_set_latency(struct task_group *tg, int prio) ++{ ++ long latency_offset; ++ int i; ++ ++ if (tg == &root_task_group) ++ return -EINVAL; ++ ++ mutex_lock(&shares_mutex); ++ ++ if (tg->latency_prio == prio) { ++ mutex_unlock(&shares_mutex); ++ return 0; ++ } ++ ++ tg->latency_prio = prio; ++ latency_offset = calc_latency_offset(prio); ++ ++ for_each_possible_cpu(i) { ++ struct sched_entity *se = tg->se[i]; ++ ++ WRITE_ONCE(se->latency_offset, latency_offset); ++ } ++ ++ mutex_unlock(&shares_mutex); ++ return 0; ++} ++ + #else /* CONFIG_FAIR_GROUP_SCHED */ + + void free_fair_sched_group(struct task_group *tg) { } +@@ -12584,7 +12920,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task + * idle runqueue: + */ + if 
(rq->cfs.load.weight) +- rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); ++ rr_interval = NS_TO_JIFFIES(se->slice); + + return rr_interval; + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index efdc29c42161..49c7e6fa4c71 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -1,16 +1,18 @@ + /* SPDX-License-Identifier: GPL-2.0 */ ++ + /* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ ++SCHED_FEAT(FAIR_SLEEPERS, false) + SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true) + + /* +- * Place new tasks ahead so that they do not starve already running +- * tasks ++ * Using the avg_vruntime, do the right thing and preserve lag ++ * across sleep+wake cycles. + */ +-SCHED_FEAT(START_DEBIT, true) ++SCHED_FEAT(PRESERVE_LAG, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +@@ -102,3 +104,5 @@ SCHED_FEAT(LATENCY_WARN, false) + + SCHED_FEAT(ALT_PERIOD, true) + SCHED_FEAT(BASE_SLICE, true) ++ ++SCHED_FEAT(EEVDF, true) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 9e8bb6278604..fe5af7aaa931 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -378,6 +378,8 @@ struct task_group { + + /* A positive value indicates that this is a SCHED_IDLE group. */ + int idle; ++ /* latency priority of the group. */ ++ int latency_prio; + + #ifdef CONFIG_SMP + /* +@@ -488,6 +490,8 @@ extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); + + extern int sched_group_set_idle(struct task_group *tg, long idle); + ++extern int sched_group_set_latency(struct task_group *tg, int prio); ++ + #ifdef CONFIG_SMP + extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +@@ -554,6 +558,9 @@ struct cfs_rq { + unsigned int idle_nr_running; /* SCHED_IDLE */ + unsigned int idle_h_nr_running; /* SCHED_IDLE */ + ++ s64 avg_vruntime; ++ u64 avg_load; ++ + u64 exec_clock; + u64 min_vruntime; + #ifdef CONFIG_SCHED_CORE +@@ -2478,6 +2485,8 @@ extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + #endif + ++extern long calc_latency_offset(int prio); ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +@@ -3251,4 +3260,7 @@ static inline void update_current_exec_runtime(struct task_struct *curr, + cgroup_account_cputime(curr, delta_exec); + } + ++extern u64 avg_vruntime(struct cfs_rq *cfs_rq); ++extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); ++ + #endif /* _KERNEL_SCHED_SCHED_H */ +diff --git a/tools/include/uapi/linux/sched.h b/tools/include/uapi/linux/sched.h +index 3bac0a8ceab2..b2e932c25be6 100644 +--- a/tools/include/uapi/linux/sched.h ++++ b/tools/include/uapi/linux/sched.h +@@ -132,6 +132,7 @@ struct clone_args { + #define SCHED_FLAG_KEEP_PARAMS 0x10 + #define SCHED_FLAG_UTIL_CLAMP_MIN 0x20 + #define SCHED_FLAG_UTIL_CLAMP_MAX 0x40 ++#define SCHED_FLAG_LATENCY_NICE 0x80 + + #define SCHED_FLAG_KEEP_ALL (SCHED_FLAG_KEEP_POLICY | \ + SCHED_FLAG_KEEP_PARAMS) +@@ -143,6 +144,7 @@ struct clone_args { + SCHED_FLAG_RECLAIM | \ + SCHED_FLAG_DL_OVERRUN | \ + SCHED_FLAG_KEEP_ALL | \ +- SCHED_FLAG_UTIL_CLAMP) ++ SCHED_FLAG_UTIL_CLAMP | \ ++ SCHED_FLAG_LATENCY_NICE) + + #endif /* _UAPI_LINUX_SCHED_H */ +-- +2.40.0.rc2 diff --git a/patches/0004-hdr.patch b/patches/0005-hdr.patch similarity index 100% rename from patches/0004-hdr.patch rename to patches/0005-hdr.patch diff --git a/scripts/patch.sh 
b/scripts/patch.sh index 0948c18..fb03347 100755 --- a/scripts/patch.sh +++ b/scripts/patch.sh @@ -7,10 +7,12 @@ echo "Pika Kernel - Applying patches" patch -Np1 < "../patches/0001-cachy-all.patch" # orig patch from cachy - 0001-Add-latency-priority-for-CFS-class.patch patch -Np1 < "../patches/0002-cfs-nice.patch" -# orig patch from cachy - 0001-bore-cachy.patch -patch -Np1 < "../patches/0003-bore.patch" +# orig patch from cachy +patch -Np1 < "../patches/0003-eevdf.patch" +# orig patch from cachy - 0001-bore-eevdf.patch +patch -Np1 < "../patches/0004-bore.patch" # HDR patch - from cachy (but they deleted it) -patch -Np1 < "../patches/0004-hdr.patch" +patch -Np1 < "../patches/0005-hdr.patch" # Nobara patches are here: https://github.com/sammilucia/nobara-kernel-fork # Extra Leigon laptop goodies patch -Np1 < "../patches/0001-Add-legion-laptop-v0.1.patch" diff --git a/scripts/source.sh b/scripts/source.sh index d25ac4f..493c981 100755 --- a/scripts/source.sh +++ b/scripts/source.sh @@ -2,7 +2,7 @@ echo "Pika Kernel - Getting source" -wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.2.1.tar.gz -tar -zxf ./linux-6.2.1.tar.gz +wget -nv https://cdn.kernel.org/pub/linux/kernel/v6.x/linux-6.2.5.tar.gz +tar -zxf ./linux-6.2.5.tar.gz -cd linux-6.2.1 \ No newline at end of file +cd linux-6.2.5 \ No newline at end of file