From 53d26f1e843c6117e14bf9d0b41ca7f986f4ff5b Mon Sep 17 00:00:00 2001 From: Piotr Gorski Date: Sun, 16 Jul 2023 11:24:25 +0200 Subject: [PATCH] bcachefs Signed-off-by: Piotr Gorski --- Documentation/admin-guide/sysctl/vm.rst | 16 + Documentation/filesystems/proc.rst | 28 + MAINTAINERS | 55 + arch/arm64/include/asm/spectre.h | 4 +- arch/powerpc/mm/book3s64/radix_pgtable.c | 2 +- arch/x86/kernel/amd_gart_64.c | 2 +- block/bdev.c | 2 +- block/bio.c | 18 +- block/blk-core.c | 1 + block/blk.h | 1 - drivers/block/virtio_blk.c | 4 +- drivers/gpu/drm/gud/gud_drv.c | 2 +- drivers/iommu/dma-iommu.c | 2 +- drivers/md/bcache/Kconfig | 10 +- drivers/md/bcache/Makefile | 4 +- drivers/md/bcache/bcache.h | 2 +- drivers/md/bcache/super.c | 1 - drivers/md/bcache/util.h | 3 +- drivers/mmc/core/block.c | 4 +- drivers/mtd/spi-nor/debugfs.c | 6 +- .../ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 4 +- drivers/scsi/sd.c | 8 +- drivers/xen/grant-dma-ops.c | 2 +- drivers/xen/swiotlb-xen.c | 2 +- fs/Kconfig | 1 + fs/Makefile | 1 + fs/aio.c | 70 +- fs/bcachefs/Kconfig | 77 + fs/bcachefs/Makefile | 74 + fs/bcachefs/acl.c | 412 ++ fs/bcachefs/acl.h | 58 + fs/bcachefs/alloc_background.c | 2209 +++++++++ fs/bcachefs/alloc_background.h | 257 ++ fs/bcachefs/alloc_foreground.c | 1536 +++++++ fs/bcachefs/alloc_foreground.h | 224 + fs/bcachefs/alloc_types.h | 126 + fs/bcachefs/backpointers.c | 889 ++++ fs/bcachefs/backpointers.h | 131 + fs/bcachefs/bbpos.h | 48 + fs/bcachefs/bcachefs.h | 1185 +++++ fs/bcachefs/bcachefs_format.h | 2319 ++++++++++ fs/bcachefs/bcachefs_ioctl.h | 368 ++ fs/bcachefs/bkey.c | 1063 +++++ fs/bcachefs/bkey.h | 774 ++++ fs/bcachefs/bkey_buf.h | 61 + fs/bcachefs/bkey_cmp.h | 129 + fs/bcachefs/bkey_methods.c | 519 +++ fs/bcachefs/bkey_methods.h | 191 + fs/bcachefs/bkey_sort.c | 201 + fs/bcachefs/bkey_sort.h | 44 + fs/bcachefs/bset.c | 1587 +++++++ fs/bcachefs/bset.h | 541 +++ fs/bcachefs/btree_cache.c | 1277 ++++++ fs/bcachefs/btree_cache.h | 130 + fs/bcachefs/btree_gc.c | 2144 +++++++++ fs/bcachefs/btree_gc.h | 112 + fs/bcachefs/btree_io.c | 2266 ++++++++++ fs/bcachefs/btree_io.h | 228 + fs/bcachefs/btree_iter.c | 3214 +++++++++++++ fs/bcachefs/btree_iter.h | 924 ++++ fs/bcachefs/btree_key_cache.c | 1088 +++++ fs/bcachefs/btree_key_cache.h | 48 + fs/bcachefs/btree_locking.c | 797 ++++ fs/bcachefs/btree_locking.h | 424 ++ fs/bcachefs/btree_types.h | 742 +++ fs/bcachefs/btree_update.h | 357 ++ fs/bcachefs/btree_update_interior.c | 2488 ++++++++++ fs/bcachefs/btree_update_interior.h | 328 ++ fs/bcachefs/btree_update_leaf.c | 2065 +++++++++ fs/bcachefs/btree_write_buffer.c | 346 ++ fs/bcachefs/btree_write_buffer.h | 14 + fs/bcachefs/btree_write_buffer_types.h | 44 + fs/bcachefs/buckets.c | 2171 +++++++++ fs/bcachefs/buckets.h | 357 ++ fs/bcachefs/buckets_types.h | 92 + fs/bcachefs/buckets_waiting_for_journal.c | 166 + fs/bcachefs/buckets_waiting_for_journal.h | 15 + .../buckets_waiting_for_journal_types.h | 23 + fs/bcachefs/chardev.c | 769 ++++ fs/bcachefs/chardev.h | 31 + fs/bcachefs/checksum.c | 709 +++ fs/bcachefs/checksum.h | 209 + fs/bcachefs/clock.c | 193 + fs/bcachefs/clock.h | 38 + fs/bcachefs/clock_types.h | 37 + fs/bcachefs/compress.c | 712 +++ fs/bcachefs/compress.h | 55 + fs/bcachefs/counters.c | 107 + fs/bcachefs/counters.h | 17 + fs/bcachefs/darray.h | 87 + fs/bcachefs/data_update.c | 562 +++ fs/bcachefs/data_update.h | 43 + fs/bcachefs/debug.c | 957 ++++ fs/bcachefs/debug.h | 32 + fs/bcachefs/dirent.c | 565 +++ fs/bcachefs/dirent.h | 70 + fs/bcachefs/disk_groups.c | 555 +++ 
fs/bcachefs/disk_groups.h | 106 + fs/bcachefs/ec.c | 1960 ++++++++ fs/bcachefs/ec.h | 263 ++ fs/bcachefs/ec_types.h | 41 + fs/bcachefs/errcode.c | 63 + fs/bcachefs/errcode.h | 246 + fs/bcachefs/error.c | 297 ++ fs/bcachefs/error.h | 206 + fs/bcachefs/extent_update.c | 173 + fs/bcachefs/extent_update.h | 12 + fs/bcachefs/extents.c | 1394 ++++++ fs/bcachefs/extents.h | 757 ++++ fs/bcachefs/extents_types.h | 40 + fs/bcachefs/eytzinger.h | 281 ++ fs/bcachefs/fifo.h | 127 + fs/bcachefs/fs-common.c | 501 +++ fs/bcachefs/fs-common.h | 43 + fs/bcachefs/fs-io.c | 3982 +++++++++++++++++ fs/bcachefs/fs-io.h | 54 + fs/bcachefs/fs-ioctl.c | 556 +++ fs/bcachefs/fs-ioctl.h | 81 + fs/bcachefs/fs.c | 1943 ++++++++ fs/bcachefs/fs.h | 208 + fs/bcachefs/fsck.c | 2452 ++++++++++ fs/bcachefs/fsck.h | 14 + fs/bcachefs/inode.c | 872 ++++ fs/bcachefs/inode.h | 196 + fs/bcachefs/io.c | 3056 +++++++++++++ fs/bcachefs/io.h | 202 + fs/bcachefs/io_types.h | 165 + fs/bcachefs/journal.c | 1438 ++++++ fs/bcachefs/journal.h | 526 +++ fs/bcachefs/journal_io.c | 1863 ++++++++ fs/bcachefs/journal_io.h | 64 + fs/bcachefs/journal_reclaim.c | 873 ++++ fs/bcachefs/journal_reclaim.h | 86 + fs/bcachefs/journal_sb.c | 219 + fs/bcachefs/journal_sb.h | 24 + fs/bcachefs/journal_seq_blacklist.c | 322 ++ fs/bcachefs/journal_seq_blacklist.h | 22 + fs/bcachefs/journal_types.h | 345 ++ fs/bcachefs/keylist.c | 52 + fs/bcachefs/keylist.h | 74 + fs/bcachefs/keylist_types.h | 16 + fs/bcachefs/lru.c | 178 + fs/bcachefs/lru.h | 64 + fs/bcachefs/migrate.c | 182 + fs/bcachefs/migrate.h | 7 + fs/bcachefs/move.c | 1168 +++++ fs/bcachefs/move.h | 96 + fs/bcachefs/move_types.h | 36 + fs/bcachefs/movinggc.c | 421 ++ fs/bcachefs/movinggc.h | 12 + fs/bcachefs/nocow_locking.c | 123 + fs/bcachefs/nocow_locking.h | 49 + fs/bcachefs/nocow_locking_types.h | 20 + fs/bcachefs/opts.c | 592 +++ fs/bcachefs/opts.h | 563 +++ fs/bcachefs/printbuf.c | 415 ++ fs/bcachefs/printbuf.h | 284 ++ fs/bcachefs/quota.c | 981 ++++ fs/bcachefs/quota.h | 74 + fs/bcachefs/quota_types.h | 43 + fs/bcachefs/rebalance.c | 364 ++ fs/bcachefs/rebalance.h | 28 + fs/bcachefs/rebalance_types.h | 26 + fs/bcachefs/recovery.c | 1669 +++++++ fs/bcachefs/recovery.h | 60 + fs/bcachefs/reflink.c | 399 ++ fs/bcachefs/reflink.h | 81 + fs/bcachefs/replicas.c | 1059 +++++ fs/bcachefs/replicas.h | 91 + fs/bcachefs/replicas_types.h | 27 + fs/bcachefs/seqmutex.h | 48 + fs/bcachefs/siphash.c | 173 + fs/bcachefs/siphash.h | 87 + fs/bcachefs/str_hash.h | 370 ++ fs/bcachefs/subvolume.c | 1734 +++++++ fs/bcachefs/subvolume.h | 251 ++ fs/bcachefs/subvolume_types.h | 31 + fs/bcachefs/super-io.c | 1711 +++++++ fs/bcachefs/super-io.h | 142 + fs/bcachefs/super.c | 2006 +++++++++ fs/bcachefs/super.h | 266 ++ fs/bcachefs/super_types.h | 51 + fs/bcachefs/sysfs.c | 1064 +++++ fs/bcachefs/sysfs.h | 48 + fs/bcachefs/tests.c | 939 ++++ fs/bcachefs/tests.h | 15 + fs/bcachefs/trace.c | 16 + fs/bcachefs/trace.h | 1247 ++++++ fs/bcachefs/two_state_shared_lock.c | 8 + fs/bcachefs/two_state_shared_lock.h | 59 + fs/bcachefs/util.c | 1137 +++++ fs/bcachefs/util.h | 846 ++++ fs/bcachefs/varint.c | 122 + fs/bcachefs/varint.h | 11 + fs/bcachefs/vstructs.h | 63 + fs/bcachefs/xattr.c | 648 +++ fs/bcachefs/xattr.h | 50 + fs/dcache.c | 12 +- fs/inode.c | 218 +- fs/iomap/buffered-io.c | 45 +- fs/super.c | 40 +- fs/xfs/xfs_iomap.c | 3 + fs/xfs/xfs_mount.h | 2 + fs/xfs/xfs_super.c | 6 +- include/asm-generic/codetag.lds.h | 15 + include/asm-generic/vmlinux.lds.h | 3 + include/linux/alloc_tag.h | 160 + include/linux/bio.h | 7 +- 
include/linux/blkdev.h | 1 + .../md/bcache => include/linux}/closure.h | 46 +- include/linux/codetag.h | 110 + include/linux/dcache.h | 1 + include/linux/dma-map-ops.h | 2 +- include/linux/dynamic_fault.h | 79 + include/linux/exportfs.h | 6 + include/linux/fortify-string.h | 5 +- include/linux/fs.h | 16 +- include/linux/generic-radix-tree.h | 68 +- include/linux/gfp.h | 111 +- include/linux/gfp_types.h | 101 +- include/linux/hrtimer.h | 2 +- include/linux/iomap.h | 1 + include/linux/list_bl.h | 22 + include/linux/lockdep.h | 10 + include/linux/lockdep_types.h | 2 +- include/linux/mean_and_variance.h | 198 + include/linux/memcontrol.h | 56 +- include/linux/mempool.h | 73 +- include/linux/mm.h | 8 + include/linux/mm_types.h | 4 +- include/linux/nodemask.h | 2 +- include/linux/nodemask_types.h | 9 + include/linux/page_ext.h | 1 - include/linux/pagemap.h | 9 +- include/linux/percpu.h | 19 +- include/linux/pgalloc_tag.h | 105 + include/linux/prandom.h | 1 - include/linux/rhashtable-types.h | 9 +- include/linux/sched.h | 29 +- include/linux/seq_buf.h | 2 + include/linux/shrinker.h | 9 +- include/linux/six.h | 388 ++ include/linux/slab.h | 180 +- include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 4 +- include/linux/string.h | 5 +- include/linux/string_helpers.h | 13 +- include/linux/time_namespace.h | 2 + include/linux/vmalloc.h | 60 +- init/Kconfig | 4 + init/init_task.c | 1 + kernel/Kconfig.locks | 3 + kernel/dma/mapping.c | 4 +- kernel/locking/Makefile | 1 + kernel/locking/lockdep.c | 46 + kernel/locking/osq_lock.c | 2 + kernel/locking/six.c | 893 ++++ kernel/module/main.c | 25 +- kernel/stacktrace.c | 2 + lib/Kconfig | 3 + lib/Kconfig.debug | 54 + lib/Makefile | 9 +- lib/alloc_tag.c | 225 + {drivers/md/bcache => lib}/closure.c | 36 +- lib/codetag.c | 393 ++ lib/dynamic_fault.c | 371 ++ lib/errname.c | 1 + lib/generic-radix-tree.c | 76 +- lib/iov_iter.c | 43 +- lib/math/Kconfig | 3 + lib/math/Makefile | 2 + lib/math/mean_and_variance.c | 158 + lib/math/mean_and_variance_test.c | 239 + lib/rhashtable.c | 42 +- lib/seq_buf.c | 10 + lib/string.c | 19 + lib/string_helpers.c | 26 +- lib/test-string_helpers.c | 4 +- mm/Makefile | 2 +- mm/compaction.c | 10 +- mm/filemap.c | 6 +- mm/huge_memory.c | 2 + mm/hugetlb.c | 8 +- mm/kfence/core.c | 14 +- mm/kfence/kfence.h | 4 +- mm/madvise.c | 61 + mm/memcontrol.c | 56 +- mm/mempolicy.c | 42 +- mm/mempool.c | 34 +- mm/mm_init.c | 1 + mm/oom_kill.c | 23 - mm/page_alloc.c | 66 +- mm/page_ext.c | 13 + mm/page_owner.c | 2 +- mm/percpu-internal.h | 26 +- mm/percpu.c | 120 +- {lib => mm}/show_mem.c | 37 + mm/slab.c | 24 +- mm/slab.h | 252 +- mm/slab_common.c | 148 +- mm/slub.c | 26 +- mm/util.c | 44 +- mm/vmalloc.c | 88 +- mm/vmscan.c | 99 +- scripts/Kbuild.include | 10 + scripts/Makefile.lib | 2 +- scripts/kallsyms.c | 13 + scripts/module.lds.S | 7 + 308 files changed, 96643 insertions(+), 930 deletions(-) create mode 100644 fs/bcachefs/Kconfig create mode 100644 fs/bcachefs/Makefile create mode 100644 fs/bcachefs/acl.c create mode 100644 fs/bcachefs/acl.h create mode 100644 fs/bcachefs/alloc_background.c create mode 100644 fs/bcachefs/alloc_background.h create mode 100644 fs/bcachefs/alloc_foreground.c create mode 100644 fs/bcachefs/alloc_foreground.h create mode 100644 fs/bcachefs/alloc_types.h create mode 100644 fs/bcachefs/backpointers.c create mode 100644 fs/bcachefs/backpointers.h create mode 100644 fs/bcachefs/bbpos.h create mode 100644 fs/bcachefs/bcachefs.h create mode 100644 fs/bcachefs/bcachefs_format.h create mode 100644 
fs/bcachefs/bcachefs_ioctl.h create mode 100644 fs/bcachefs/bkey.c create mode 100644 fs/bcachefs/bkey.h create mode 100644 fs/bcachefs/bkey_buf.h create mode 100644 fs/bcachefs/bkey_cmp.h create mode 100644 fs/bcachefs/bkey_methods.c create mode 100644 fs/bcachefs/bkey_methods.h create mode 100644 fs/bcachefs/bkey_sort.c create mode 100644 fs/bcachefs/bkey_sort.h create mode 100644 fs/bcachefs/bset.c create mode 100644 fs/bcachefs/bset.h create mode 100644 fs/bcachefs/btree_cache.c create mode 100644 fs/bcachefs/btree_cache.h create mode 100644 fs/bcachefs/btree_gc.c create mode 100644 fs/bcachefs/btree_gc.h create mode 100644 fs/bcachefs/btree_io.c create mode 100644 fs/bcachefs/btree_io.h create mode 100644 fs/bcachefs/btree_iter.c create mode 100644 fs/bcachefs/btree_iter.h create mode 100644 fs/bcachefs/btree_key_cache.c create mode 100644 fs/bcachefs/btree_key_cache.h create mode 100644 fs/bcachefs/btree_locking.c create mode 100644 fs/bcachefs/btree_locking.h create mode 100644 fs/bcachefs/btree_types.h create mode 100644 fs/bcachefs/btree_update.h create mode 100644 fs/bcachefs/btree_update_interior.c create mode 100644 fs/bcachefs/btree_update_interior.h create mode 100644 fs/bcachefs/btree_update_leaf.c create mode 100644 fs/bcachefs/btree_write_buffer.c create mode 100644 fs/bcachefs/btree_write_buffer.h create mode 100644 fs/bcachefs/btree_write_buffer_types.h create mode 100644 fs/bcachefs/buckets.c create mode 100644 fs/bcachefs/buckets.h create mode 100644 fs/bcachefs/buckets_types.h create mode 100644 fs/bcachefs/buckets_waiting_for_journal.c create mode 100644 fs/bcachefs/buckets_waiting_for_journal.h create mode 100644 fs/bcachefs/buckets_waiting_for_journal_types.h create mode 100644 fs/bcachefs/chardev.c create mode 100644 fs/bcachefs/chardev.h create mode 100644 fs/bcachefs/checksum.c create mode 100644 fs/bcachefs/checksum.h create mode 100644 fs/bcachefs/clock.c create mode 100644 fs/bcachefs/clock.h create mode 100644 fs/bcachefs/clock_types.h create mode 100644 fs/bcachefs/compress.c create mode 100644 fs/bcachefs/compress.h create mode 100644 fs/bcachefs/counters.c create mode 100644 fs/bcachefs/counters.h create mode 100644 fs/bcachefs/darray.h create mode 100644 fs/bcachefs/data_update.c create mode 100644 fs/bcachefs/data_update.h create mode 100644 fs/bcachefs/debug.c create mode 100644 fs/bcachefs/debug.h create mode 100644 fs/bcachefs/dirent.c create mode 100644 fs/bcachefs/dirent.h create mode 100644 fs/bcachefs/disk_groups.c create mode 100644 fs/bcachefs/disk_groups.h create mode 100644 fs/bcachefs/ec.c create mode 100644 fs/bcachefs/ec.h create mode 100644 fs/bcachefs/ec_types.h create mode 100644 fs/bcachefs/errcode.c create mode 100644 fs/bcachefs/errcode.h create mode 100644 fs/bcachefs/error.c create mode 100644 fs/bcachefs/error.h create mode 100644 fs/bcachefs/extent_update.c create mode 100644 fs/bcachefs/extent_update.h create mode 100644 fs/bcachefs/extents.c create mode 100644 fs/bcachefs/extents.h create mode 100644 fs/bcachefs/extents_types.h create mode 100644 fs/bcachefs/eytzinger.h create mode 100644 fs/bcachefs/fifo.h create mode 100644 fs/bcachefs/fs-common.c create mode 100644 fs/bcachefs/fs-common.h create mode 100644 fs/bcachefs/fs-io.c create mode 100644 fs/bcachefs/fs-io.h create mode 100644 fs/bcachefs/fs-ioctl.c create mode 100644 fs/bcachefs/fs-ioctl.h create mode 100644 fs/bcachefs/fs.c create mode 100644 fs/bcachefs/fs.h create mode 100644 fs/bcachefs/fsck.c create mode 100644 fs/bcachefs/fsck.h create mode 100644 
fs/bcachefs/inode.c create mode 100644 fs/bcachefs/inode.h create mode 100644 fs/bcachefs/io.c create mode 100644 fs/bcachefs/io.h create mode 100644 fs/bcachefs/io_types.h create mode 100644 fs/bcachefs/journal.c create mode 100644 fs/bcachefs/journal.h create mode 100644 fs/bcachefs/journal_io.c create mode 100644 fs/bcachefs/journal_io.h create mode 100644 fs/bcachefs/journal_reclaim.c create mode 100644 fs/bcachefs/journal_reclaim.h create mode 100644 fs/bcachefs/journal_sb.c create mode 100644 fs/bcachefs/journal_sb.h create mode 100644 fs/bcachefs/journal_seq_blacklist.c create mode 100644 fs/bcachefs/journal_seq_blacklist.h create mode 100644 fs/bcachefs/journal_types.h create mode 100644 fs/bcachefs/keylist.c create mode 100644 fs/bcachefs/keylist.h create mode 100644 fs/bcachefs/keylist_types.h create mode 100644 fs/bcachefs/lru.c create mode 100644 fs/bcachefs/lru.h create mode 100644 fs/bcachefs/migrate.c create mode 100644 fs/bcachefs/migrate.h create mode 100644 fs/bcachefs/move.c create mode 100644 fs/bcachefs/move.h create mode 100644 fs/bcachefs/move_types.h create mode 100644 fs/bcachefs/movinggc.c create mode 100644 fs/bcachefs/movinggc.h create mode 100644 fs/bcachefs/nocow_locking.c create mode 100644 fs/bcachefs/nocow_locking.h create mode 100644 fs/bcachefs/nocow_locking_types.h create mode 100644 fs/bcachefs/opts.c create mode 100644 fs/bcachefs/opts.h create mode 100644 fs/bcachefs/printbuf.c create mode 100644 fs/bcachefs/printbuf.h create mode 100644 fs/bcachefs/quota.c create mode 100644 fs/bcachefs/quota.h create mode 100644 fs/bcachefs/quota_types.h create mode 100644 fs/bcachefs/rebalance.c create mode 100644 fs/bcachefs/rebalance.h create mode 100644 fs/bcachefs/rebalance_types.h create mode 100644 fs/bcachefs/recovery.c create mode 100644 fs/bcachefs/recovery.h create mode 100644 fs/bcachefs/reflink.c create mode 100644 fs/bcachefs/reflink.h create mode 100644 fs/bcachefs/replicas.c create mode 100644 fs/bcachefs/replicas.h create mode 100644 fs/bcachefs/replicas_types.h create mode 100644 fs/bcachefs/seqmutex.h create mode 100644 fs/bcachefs/siphash.c create mode 100644 fs/bcachefs/siphash.h create mode 100644 fs/bcachefs/str_hash.h create mode 100644 fs/bcachefs/subvolume.c create mode 100644 fs/bcachefs/subvolume.h create mode 100644 fs/bcachefs/subvolume_types.h create mode 100644 fs/bcachefs/super-io.c create mode 100644 fs/bcachefs/super-io.h create mode 100644 fs/bcachefs/super.c create mode 100644 fs/bcachefs/super.h create mode 100644 fs/bcachefs/super_types.h create mode 100644 fs/bcachefs/sysfs.c create mode 100644 fs/bcachefs/sysfs.h create mode 100644 fs/bcachefs/tests.c create mode 100644 fs/bcachefs/tests.h create mode 100644 fs/bcachefs/trace.c create mode 100644 fs/bcachefs/trace.h create mode 100644 fs/bcachefs/two_state_shared_lock.c create mode 100644 fs/bcachefs/two_state_shared_lock.h create mode 100644 fs/bcachefs/util.c create mode 100644 fs/bcachefs/util.h create mode 100644 fs/bcachefs/varint.c create mode 100644 fs/bcachefs/varint.h create mode 100644 fs/bcachefs/vstructs.h create mode 100644 fs/bcachefs/xattr.c create mode 100644 fs/bcachefs/xattr.h create mode 100644 include/asm-generic/codetag.lds.h create mode 100644 include/linux/alloc_tag.h rename {drivers/md/bcache => include/linux}/closure.h (93%) create mode 100644 include/linux/codetag.h create mode 100644 include/linux/dynamic_fault.h create mode 100644 include/linux/mean_and_variance.h create mode 100644 include/linux/nodemask_types.h create mode 100644 
include/linux/pgalloc_tag.h create mode 100644 include/linux/six.h create mode 100644 kernel/locking/six.c create mode 100644 lib/alloc_tag.c rename {drivers/md/bcache => lib}/closure.c (88%) create mode 100644 lib/codetag.c create mode 100644 lib/dynamic_fault.c create mode 100644 lib/math/mean_and_variance.c create mode 100644 lib/math/mean_and_variance_test.c rename {lib => mm}/show_mem.c (57%) diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 45ba1f4dc..0a012ac13 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -43,6 +43,7 @@ Currently, these files are in /proc/sys/vm: - legacy_va_layout - lowmem_reserve_ratio - max_map_count +- mem_profiling (only if CONFIG_MEM_ALLOC_PROFILING=y) - memory_failure_early_kill - memory_failure_recovery - min_free_kbytes @@ -425,6 +426,21 @@ e.g., up to one or two maps per allocation. The default value is 65530. +mem_profiling +============== + +Enable memory profiling (when CONFIG_MEM_ALLOC_PROFILING=y) + +1: Enable memory profiling. + +0: Disable memory profiling. + +Enabling memory profiling introduces a small performance overhead for all +memory allocations. + +The default value depends on CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT. + + memory_failure_early_kill: ========================== diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 7897a7daf..810f851e6 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -683,6 +683,7 @@ files are there, and which are missing. ============ =============================================================== File Content ============ =============================================================== + allocinfo Memory allocations profiling information apm Advanced power management info buddyinfo Kernel memory allocator information (see text) (2.5) bus Directory containing bus specific information @@ -942,6 +943,33 @@ also be allocatable although a lot of filesystem metadata may have to be reclaimed to achieve this. +allocinfo +~~~~~~~~~ + +Provides information about memory allocations at all locations in the code +base. Each allocation in the code is identified by its source file, line +number, module and the function calling the allocation. The number of bytes +allocated at each location is reported. + +Example output. + +:: + + > cat /proc/allocinfo + + 153MiB mm/slub.c:1826 module:slub func:alloc_slab_page + 6.08MiB mm/slab_common.c:950 module:slab_common func:_kmalloc_order + 5.09MiB mm/memcontrol.c:2814 module:memcontrol func:alloc_slab_obj_exts + 4.54MiB mm/page_alloc.c:5777 module:page_alloc func:alloc_pages_exact + 1.32MiB include/asm-generic/pgalloc.h:63 module:pgtable func:__pte_alloc_one + 1.16MiB fs/xfs/xfs_log_priv.h:700 module:xfs func:xlog_kvmalloc + 1.00MiB mm/swap_cgroup.c:48 module:swap_cgroup func:swap_cgroup_prepare + 734KiB fs/xfs/kmem.c:20 module:xfs func:kmem_alloc + 640KiB kernel/rcu/tree.c:3184 module:tree func:fill_page_cache_func + 640KiB drivers/char/virtio_console.c:452 module:virtio_console func:alloc_buf + ...
+ + meminfo ~~~~~~~ diff --git a/MAINTAINERS b/MAINTAINERS index 35e195946..48763cc35 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3522,6 +3522,13 @@ W: http://bcache.evilpiepirate.org C: irc://irc.oftc.net/bcache F: drivers/md/bcache/ +BCACHEFS +M: Kent Overstreet +L: linux-bcachefs@vger.kernel.org +S: Supported +C: irc://irc.oftc.net/bcache +F: fs/bcachefs/ + BDISP ST MEDIA DRIVER M: Fabien Dessenne L: linux-media@vger.kernel.org @@ -5064,6 +5071,14 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git timers/core F: Documentation/devicetree/bindings/timer/ F: drivers/clocksource/ +CLOSURES +M: Kent Overstreet +L: linux-bcachefs@vger.kernel.org +S: Supported +C: irc://irc.oftc.net/bcache +F: include/linux/closure.h +F: lib/closure.c + CMPC ACPI DRIVER M: Thadeu Lima de Souza Cascardo M: Daniel Oliveira Nascimento @@ -5114,6 +5129,13 @@ S: Supported F: Documentation/process/code-of-conduct-interpretation.rst F: Documentation/process/code-of-conduct.rst +CODE TAGGING +M: Suren Baghdasaryan +M: Kent Overstreet +S: Maintained +F: include/linux/codetag.h +F: lib/codetag.c + COMEDI DRIVERS M: Ian Abbott M: H Hartley Sweeten @@ -8662,6 +8684,13 @@ F: Documentation/devicetree/bindings/power/power?domain* F: drivers/base/power/domain*.c F: include/linux/pm_domain.h +GENERIC RADIX TREE +M: Kent Overstreet +S: Supported +C: irc://irc.oftc.net/bcache +F: include/linux/generic-radix-tree.h +F: lib/generic-radix-tree.c + GENERIC RESISTIVE TOUCHSCREEN ADC DRIVER M: Eugen Hristev L: linux-input@vger.kernel.org @@ -12850,6 +12879,15 @@ F: Documentation/devicetree/bindings/net/ieee802154/mcr20a.txt F: drivers/net/ieee802154/mcr20a.c F: drivers/net/ieee802154/mcr20a.h +MEAN AND VARIANCE LIBRARY +M: Daniel B. Hill +M: Kent Overstreet +S: Maintained +T: git https://github.com/YellowOnion/linux/ +F: include/linux/mean_and_variance.h +F: lib/math/mean_and_variance.c +F: lib/math/mean_and_variance_test.c + MEASUREMENT COMPUTING CIO-DAC IIO DRIVER M: William Breathitt Gray L: linux-iio@vger.kernel.org @@ -13489,6 +13527,15 @@ F: mm/memblock.c F: mm/mm_init.c F: tools/testing/memblock/ +MEMORY ALLOCATION PROFILING +M: Suren Baghdasaryan +M: Kent Overstreet +S: Maintained +F: include/linux/alloc_tag.h +F: include/linux/codetag_ctx.h +F: lib/alloc_tag.c +F: lib/pgalloc_tag.c + MEMORY CONTROLLER DRIVERS M: Krzysztof Kozlowski L: linux-kernel@vger.kernel.org @@ -19376,6 +19423,14 @@ S: Maintained W: http://www.winischhofer.at/linuxsisusbvga.shtml F: drivers/usb/misc/sisusbvga/ +SIX LOCKS +M: Kent Overstreet +L: linux-bcachefs@vger.kernel.org +S: Supported +C: irc://irc.oftc.net/bcache +F: include/linux/six.h +F: kernel/locking/six.c + SL28 CPLD MFD DRIVER M: Michael Walle S: Maintained diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index db7b371b3..31823d971 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -13,8 +13,8 @@ #define __BP_HARDEN_HYP_VECS_SZ ((BP_HARDEN_EL2_SLOTS - 1) * SZ_2K) #ifndef __ASSEMBLY__ - -#include +#include +#include #include #include diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c index 2297aa764..4f8d43b74 100644 --- a/arch/powerpc/mm/book3s64/radix_pgtable.c +++ b/arch/powerpc/mm/book3s64/radix_pgtable.c @@ -261,7 +261,7 @@ print_mapping(unsigned long start, unsigned long end, unsigned long size, bool e if (end <= start) return; - string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf)); + string_get_size(size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); 
pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf, exec ? " (exec)" : ""); diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index 56a917df4..842a0ec5e 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -676,7 +676,7 @@ static const struct dma_map_ops gart_dma_ops = { .get_sgtable = dma_common_get_sgtable, .dma_supported = dma_direct_supported, .get_required_mask = dma_direct_get_required_mask, - .alloc_pages = dma_direct_alloc_pages, + .alloc_pages_op = dma_direct_alloc_pages, .free_pages = dma_direct_free_pages, }; diff --git a/block/bdev.c b/block/bdev.c index 21c63bfef..a4d7e8732 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -934,7 +934,7 @@ EXPORT_SYMBOL(lookup_bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty) { - struct super_block *sb = get_super(bdev); + struct super_block *sb = try_get_super(bdev); int res = 0; if (sb) { diff --git a/block/bio.c b/block/bio.c index 043944fd4..70b5c987b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -606,15 +606,15 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) } EXPORT_SYMBOL(bio_kmalloc); -void zero_fill_bio(struct bio *bio) +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; - bio_for_each_segment(bv, bio, iter) + __bio_for_each_segment(bv, bio, iter, start) memzero_bvec(&bv); } -EXPORT_SYMBOL(zero_fill_bio); +EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size @@ -1245,7 +1245,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) struct page **pages = (struct page **)bv; ssize_t size, left; unsigned len, i = 0; - size_t offset, trim; + size_t offset; int ret = 0; /* @@ -1274,10 +1274,12 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); - iov_iter_revert(iter, trim); + if (bio->bi_bdev) { + size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); + iov_iter_revert(iter, trim); + size -= trim; + } - size -= trim; if (unlikely(!size)) { ret = -EFAULT; goto out; @@ -1481,6 +1483,7 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } +EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. 
@@ -1540,6 +1543,7 @@ void bio_check_pages_dirty(struct bio *bio) spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } +EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { diff --git a/block/blk-core.c b/block/blk-core.c index 1da77e7d6..b7b0237c3 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -205,6 +205,7 @@ const char *blk_status_to_str(blk_status_t status) return ""; return blk_errors[idx].name; } +EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue diff --git a/block/blk.h b/block/blk.h index 45547bcf1..f20f9ca03 100644 --- a/block/blk.h +++ b/block/blk.h @@ -251,7 +251,6 @@ static inline void bio_integrity_free(struct bio *bio) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index b47358da9..be10661f1 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -990,9 +990,9 @@ static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize) nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9); string_get_size(nblocks, queue_logical_block_size(q), - STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); + STRING_SIZE_BASE2, cap_str_2, sizeof(cap_str_2)); string_get_size(nblocks, queue_logical_block_size(q), - STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); + 0, cap_str_10, sizeof(cap_str_10)); dev_notice(&vdev->dev, "[%s] %s%llu %d-byte logical blocks (%s/%s)\n", diff --git a/drivers/gpu/drm/gud/gud_drv.c b/drivers/gpu/drm/gud/gud_drv.c index 9d7bf8ee4..6b1748e1f 100644 --- a/drivers/gpu/drm/gud/gud_drv.c +++ b/drivers/gpu/drm/gud/gud_drv.c @@ -329,7 +329,7 @@ static int gud_stats_debugfs(struct seq_file *m, void *data) struct gud_device *gdrm = to_gud_device(entry->dev); char buf[10]; - string_get_size(gdrm->bulk_len, 1, STRING_UNITS_2, buf, sizeof(buf)); + string_get_size(gdrm->bulk_len, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); seq_printf(m, "Max buffer size: %s\n", buf); seq_printf(m, "Number of errors: %u\n", gdrm->stats_num_errors); diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 7a9f0b0bd..76a9d5ca4 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1556,7 +1556,7 @@ static const struct dma_map_ops iommu_dma_ops = { .flags = DMA_F_PCI_P2PDMA_SUPPORTED, .alloc = iommu_dma_alloc, .free = iommu_dma_free, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, .alloc_noncontiguous = iommu_dma_alloc_noncontiguous, .free_noncontiguous = iommu_dma_free_noncontiguous, diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig index 529c9d04e..b2d10063d 100644 --- a/drivers/md/bcache/Kconfig +++ b/drivers/md/bcache/Kconfig @@ -4,6 +4,7 @@ config BCACHE tristate "Block device as cache" select BLOCK_HOLDER_DEPRECATED if SYSFS select CRC64 + select CLOSURES help Allows a block device to be used as cache for other devices; uses a btree for indexing and the layout is optimized for SSDs. @@ -19,15 +20,6 @@ config BCACHE_DEBUG Enables extra debugging tools, allows expensive runtime checks to be turned on. 
-config BCACHE_CLOSURES_DEBUG - bool "Debug closures" - depends on BCACHE - select DEBUG_FS - help - Keeps all active closures in a linked list and provides a debugfs - interface to list them, which makes it possible to see asynchronous - operations that get stuck. - config BCACHE_ASYNC_REGISTRATION bool "Asynchronous device registration" depends on BCACHE diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile index 5b87e5967..054e8a33a 100644 --- a/drivers/md/bcache/Makefile +++ b/drivers/md/bcache/Makefile @@ -2,6 +2,6 @@ obj-$(CONFIG_BCACHE) += bcache.o -bcache-y := alloc.o bset.o btree.o closure.o debug.o extents.o\ - io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ +bcache-y := alloc.o bset.o btree.o debug.o extents.o io.o\ + journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\ util.o writeback.o features.o diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index aebb7ef10..c8b4914ad 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -179,6 +179,7 @@ #define pr_fmt(fmt) "bcache: %s() " fmt, __func__ #include +#include #include #include #include @@ -192,7 +193,6 @@ #include "bcache_ondisk.h" #include "bset.h" #include "util.h" -#include "closure.h" struct bucket { atomic_t pin; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7e9d19fd2..35c701d54 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -2911,7 +2911,6 @@ static int __init bcache_init(void) goto err; bch_debug_init(); - closure_debug_init(); bcache_is_reboot = false; diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h index 6f3cb7c92..f61ab1bad 100644 --- a/drivers/md/bcache/util.h +++ b/drivers/md/bcache/util.h @@ -4,6 +4,7 @@ #define _BCACHE_UTIL_H #include +#include #include #include #include @@ -13,8 +14,6 @@ #include #include -#include "closure.h" - struct closure; #ifdef CONFIG_BCACHE_DEBUG diff --git a/drivers/mmc/core/block.c b/drivers/mmc/core/block.c index d920c4178..ae9ab7816 100644 --- a/drivers/mmc/core/block.c +++ b/drivers/mmc/core/block.c @@ -2503,7 +2503,7 @@ static struct mmc_blk_data *mmc_blk_alloc_req(struct mmc_card *card, blk_queue_write_cache(md->queue.queue, cache_enabled, fua_enabled); - string_get_size((u64)size, 512, STRING_UNITS_2, + string_get_size((u64)size, 512, STRING_SIZE_BASE2, cap_str, sizeof(cap_str)); pr_info("%s: %s %s %s %s\n", md->disk->disk_name, mmc_card_id(card), mmc_card_name(card), @@ -2699,7 +2699,7 @@ static int mmc_blk_alloc_rpmb_part(struct mmc_card *card, list_add(&rpmb->node, &md->rpmbs); - string_get_size((u64)size, 512, STRING_UNITS_2, + string_get_size((u64)size, 512, STRING_SIZE_BASE2, cap_str, sizeof(cap_str)); pr_info("%s: %s %s %s, chardev (%d:%d)\n", diff --git a/drivers/mtd/spi-nor/debugfs.c b/drivers/mtd/spi-nor/debugfs.c index e11536fff..9f1ea83e2 100644 --- a/drivers/mtd/spi-nor/debugfs.c +++ b/drivers/mtd/spi-nor/debugfs.c @@ -84,7 +84,7 @@ static int spi_nor_params_show(struct seq_file *s, void *data) seq_printf(s, "name\t\t%s\n", info->name); seq_printf(s, "id\t\t%*ph\n", SPI_NOR_MAX_ID_LEN, nor->id); - string_get_size(params->size, 1, STRING_UNITS_2, buf, sizeof(buf)); + string_get_size(params->size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); seq_printf(s, "size\t\t%s\n", buf); seq_printf(s, "write size\t%u\n", params->writesize); seq_printf(s, "page size\t%u\n", params->page_size); @@ -129,14 +129,14 @@ static int spi_nor_params_show(struct seq_file *s, void *data) struct spi_nor_erase_type *et = 
&erase_map->erase_type[i]; if (et->size) { - string_get_size(et->size, 1, STRING_UNITS_2, buf, + string_get_size(et->size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); seq_printf(s, " %02x (%s) [%d]\n", et->opcode, buf, i); } } if (!(nor->flags & SNOR_F_NO_OP_CHIP_ERASE)) { - string_get_size(params->size, 1, STRING_UNITS_2, buf, sizeof(buf)); + string_get_size(params->size, 1, STRING_SIZE_BASE2, buf, sizeof(buf)); seq_printf(s, " %02x (%s)\n", SPINOR_OP_CHIP_ERASE, buf); } diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index 14e0d989c..7d5fbebd3 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -3457,8 +3457,8 @@ static void mem_region_show(struct seq_file *seq, const char *name, { char buf[40]; - string_get_size((u64)to - from + 1, 1, STRING_UNITS_2, buf, - sizeof(buf)); + string_get_size((u64)to - from + 1, 1, STRING_SIZE_BASE2, + buf, sizeof(buf)); seq_printf(seq, "%-15s %#x-%#x [%s]\n", name, from, to, buf); } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 1624d528a..bf0a1907b 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2580,10 +2580,10 @@ sd_print_capacity(struct scsi_disk *sdkp, if (!sdkp->first_scan && old_capacity == sdkp->capacity) return; - string_get_size(sdkp->capacity, sector_size, - STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); - string_get_size(sdkp->capacity, sector_size, - STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); + string_get_size(sdkp->capacity, sector_size, STRING_SIZE_BASE2, + cap_str_2, sizeof(cap_str_2)); + string_get_size(sdkp->capacity, sector_size, 0, + cap_str_10, sizeof(cap_str_10)); sd_printk(KERN_NOTICE, sdkp, "%llu %d-byte logical blocks: (%s/%s)\n", diff --git a/drivers/xen/grant-dma-ops.c b/drivers/xen/grant-dma-ops.c index 9784a77fa..6c7d984f1 100644 --- a/drivers/xen/grant-dma-ops.c +++ b/drivers/xen/grant-dma-ops.c @@ -282,7 +282,7 @@ static int xen_grant_dma_supported(struct device *dev, u64 mask) static const struct dma_map_ops xen_grant_dma_ops = { .alloc = xen_grant_dma_alloc, .free = xen_grant_dma_free, - .alloc_pages = xen_grant_dma_alloc_pages, + .alloc_pages_op = xen_grant_dma_alloc_pages, .free_pages = xen_grant_dma_free_pages, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 67aa74d20..5ab261615 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -403,6 +403,6 @@ const struct dma_map_ops xen_swiotlb_dma_ops = { .dma_supported = xen_swiotlb_dma_supported, .mmap = dma_common_mmap, .get_sgtable = dma_common_get_sgtable, - .alloc_pages = dma_common_alloc_pages, + .alloc_pages_op = dma_common_alloc_pages, .free_pages = dma_common_free_pages, }; diff --git a/fs/Kconfig b/fs/Kconfig index 18d034ec7..b05c45f63 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -44,6 +44,7 @@ source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" source "fs/nilfs2/Kconfig" source "fs/f2fs/Kconfig" +source "fs/bcachefs/Kconfig" source "fs/zonefs/Kconfig" endif # BLOCK diff --git a/fs/Makefile b/fs/Makefile index 5bfdbf0d7..977a05cae 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -129,6 +129,7 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/ obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ obj-$(CONFIG_F2FS_FS) += f2fs/ +obj-$(CONFIG_BCACHEFS_FS) += bcachefs/ obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ diff --git a/fs/aio.c b/fs/aio.c index 
b0b17bd09..b3e14a9fe 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1109,6 +1109,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb) kmem_cache_free(kiocb_cachep, iocb); } +struct aio_waiter { + struct wait_queue_entry w; + size_t min_nr; +}; + /* aio_complete * Called when the io request on the given iocb is complete. */ @@ -1117,7 +1122,7 @@ static void aio_complete(struct aio_kiocb *iocb) struct kioctx *ctx = iocb->ki_ctx; struct aio_ring *ring; struct io_event *ev_page, *event; - unsigned tail, pos, head; + unsigned tail, pos, head, avail; unsigned long flags; /* @@ -1161,6 +1166,10 @@ static void aio_complete(struct aio_kiocb *iocb) ctx->completed_events++; if (ctx->completed_events > 1) refill_reqs_available(ctx, head, tail); + + avail = tail > head + ? tail - head + : tail + ctx->nr_events - head; spin_unlock_irqrestore(&ctx->completion_lock, flags); pr_debug("added to ring %p at [%u]\n", iocb, tail); @@ -1181,8 +1190,18 @@ static void aio_complete(struct aio_kiocb *iocb) */ smp_mb(); - if (waitqueue_active(&ctx->wait)) - wake_up(&ctx->wait); + if (waitqueue_active(&ctx->wait)) { + struct aio_waiter *curr, *next; + unsigned long flags; + + spin_lock_irqsave(&ctx->wait.lock, flags); + list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry) + if (avail >= curr->min_nr) { + list_del_init_careful(&curr->w.entry); + wake_up_process(curr->w.private); + } + spin_unlock_irqrestore(&ctx->wait.lock, flags); + } } static inline void iocb_put(struct aio_kiocb *iocb) @@ -1250,10 +1269,10 @@ static long aio_read_events_ring(struct kioctx *ctx, avail = min(avail, nr - ret); avail = min_t(long, avail, AIO_EVENTS_PER_PAGE - pos); - ev = kmap(page); + ev = kmap_local_page(page); copy_ret = copy_to_user(event + ret, ev + pos, sizeof(*ev) * avail); - kunmap(page); + kunmap_local(ev); if (unlikely(copy_ret)) { ret = -EFAULT; @@ -1298,7 +1317,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, struct io_event __user *event, ktime_t until) { - long ret = 0; + struct hrtimer_sleeper t; + struct aio_waiter w; + long ret = 0, ret2 = 0; /* * Note that aio_read_events() is being called as the conditional - i.e. @@ -1314,12 +1335,37 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr, * the ringbuffer empty. So in practice we should be ok, but it's * something to be aware of when touching this code. */ - if (until == 0) - aio_read_events(ctx, min_nr, nr, event, &ret); - else - wait_event_interruptible_hrtimeout(ctx->wait, - aio_read_events(ctx, min_nr, nr, event, &ret), - until); + aio_read_events(ctx, min_nr, nr, event, &ret); + if (until == 0 || ret < 0 || ret >= min_nr) + return ret; + + hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + if (until != KTIME_MAX) { + hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns); + hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL); + } + + init_wait(&w.w); + + while (1) { + unsigned long nr_got = ret; + + w.min_nr = min_nr - ret; + + ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE) ?: + !t.task ? 
-ETIME : 0; + + if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2) + break; + + if (nr_got == ret) + schedule(); + } + + finish_wait(&ctx->wait, &w.w); + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + return ret; } diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig new file mode 100644 index 000000000..6c698b3b3 --- /dev/null +++ b/fs/bcachefs/Kconfig @@ -0,0 +1,77 @@ + +config BCACHEFS_FS + tristate "bcachefs filesystem support (EXPERIMENTAL)" + depends on BLOCK + select EXPORTFS + select CLOSURES + select LIBCRC32C + select CRC64 + select FS_POSIX_ACL + select LZ4_COMPRESS + select LZ4_DECOMPRESS + select LZ4HC_COMPRESS + select LZ4HC_DECOMPRESS + select ZLIB_DEFLATE + select ZLIB_INFLATE + select ZSTD_COMPRESS + select ZSTD_DECOMPRESS + select CRYPTO_SHA256 + select CRYPTO_CHACHA20 + select CRYPTO_POLY1305 + select KEYS + select SIXLOCKS + select RAID6_PQ + select XOR_BLOCKS + select XXHASH + select SRCU + select SYMBOLIC_ERRNAME + select MEAN_AND_VARIANCE + help + The bcachefs filesystem - a modern, copy on write filesystem, with + support for multiple devices, compression, checksumming, etc. + +config BCACHEFS_QUOTA + bool "bcachefs quota support" + depends on BCACHEFS_FS + select QUOTACTL + +config BCACHEFS_POSIX_ACL + bool "bcachefs POSIX ACL support" + depends on BCACHEFS_FS + select FS_POSIX_ACL + +config BCACHEFS_DEBUG_TRANSACTIONS + bool "bcachefs runtime info" + depends on BCACHEFS_FS + default y + help + This makes the list of running btree transactions available in debugfs. + + This is a highly useful debugging feature but does add a small amount of overhead. + +config BCACHEFS_DEBUG + bool "bcachefs debugging" + depends on BCACHEFS_FS + help + Enables many extra debugging checks and assertions. + + The resulting code will be significantly slower than normal; you + probably shouldn't select this option unless you're a developer. 
+ +config BCACHEFS_TESTS + bool "bcachefs unit and performance tests" + depends on BCACHEFS_FS + help + Include some unit and performance tests for the core btree code + +config BCACHEFS_LOCK_TIME_STATS + bool "bcachefs lock time statistics" + depends on BCACHEFS_FS + help + Expose statistics for how long we held a lock in debugfs + +config BCACHEFS_NO_LATENCY_ACCT + bool "disable latency accounting and time stats" + depends on BCACHEFS_FS + help + This disables device latency tracking and time stats, only for performance testing diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile new file mode 100644 index 000000000..a71956048 --- /dev/null +++ b/fs/bcachefs/Makefile @@ -0,0 +1,74 @@ + +obj-$(CONFIG_BCACHEFS_FS) += bcachefs.o + +bcachefs-y := \ + alloc_background.o \ + alloc_foreground.o \ + backpointers.o \ + bkey.o \ + bkey_methods.o \ + bkey_sort.o \ + bset.o \ + btree_cache.o \ + btree_gc.o \ + btree_io.o \ + btree_iter.o \ + btree_key_cache.o \ + btree_locking.o \ + btree_update_interior.o \ + btree_update_leaf.o \ + btree_write_buffer.o \ + buckets.o \ + buckets_waiting_for_journal.o \ + chardev.o \ + checksum.o \ + clock.o \ + compress.o \ + counters.o \ + debug.o \ + dirent.o \ + disk_groups.o \ + data_update.o \ + ec.o \ + errcode.o \ + error.o \ + extents.o \ + extent_update.o \ + fs.o \ + fs-common.o \ + fs-ioctl.o \ + fs-io.o \ + fsck.o \ + inode.o \ + io.o \ + journal.o \ + journal_io.o \ + journal_reclaim.o \ + journal_sb.o \ + journal_seq_blacklist.o \ + keylist.o \ + lru.o \ + migrate.o \ + move.o \ + movinggc.o \ + nocow_locking.o \ + opts.o \ + printbuf.o \ + quota.o \ + rebalance.o \ + recovery.o \ + reflink.o \ + replicas.o \ + siphash.o \ + subvolume.o \ + super.o \ + super-io.o \ + sysfs.o \ + tests.o \ + trace.o \ + two_state_shared_lock.o \ + util.o \ + varint.o \ + xattr.o + +bcachefs-$(CONFIG_BCACHEFS_POSIX_ACL) += acl.o diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c new file mode 100644 index 000000000..b1a488860 --- /dev/null +++ b/fs/bcachefs/acl.c @@ -0,0 +1,412 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_POSIX_ACL + +#include "bcachefs.h" + +#include +#include +#include +#include +#include + +#include "acl.h" +#include "fs.h" +#include "xattr.h" + +static inline size_t bch2_acl_size(unsigned nr_short, unsigned nr_long) +{ + return sizeof(bch_acl_header) + + sizeof(bch_acl_entry_short) * nr_short + + sizeof(bch_acl_entry) * nr_long; +} + +static inline int acl_to_xattr_type(int type) +{ + switch (type) { + case ACL_TYPE_ACCESS: + return KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS; + case ACL_TYPE_DEFAULT: + return KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT; + default: + BUG(); + } +} + +/* + * Convert from filesystem to in-memory representation. 
+ */ +static struct posix_acl *bch2_acl_from_disk(struct btree_trans *trans, + const void *value, size_t size) +{ + const void *p, *end = value + size; + struct posix_acl *acl; + struct posix_acl_entry *out; + unsigned count = 0; + int ret; + + if (!value) + return NULL; + if (size < sizeof(bch_acl_header)) + goto invalid; + if (((bch_acl_header *)value)->a_version != + cpu_to_le32(BCH_ACL_VERSION)) + goto invalid; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *entry = p; + + if (p + sizeof(bch_acl_entry_short) > end) + goto invalid; + + switch (le16_to_cpu(entry->e_tag)) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + case ACL_GROUP: + p += sizeof(bch_acl_entry); + break; + default: + goto invalid; + } + + count++; + } + + if (p > end) + goto invalid; + + if (!count) + return NULL; + + acl = allocate_dropping_locks(trans, ret, + posix_acl_alloc(count, _gfp)); + if (!acl) + return ERR_PTR(-ENOMEM); + if (ret) { + kfree(acl); + return ERR_PTR(ret); + } + + out = acl->a_entries; + + p = value + sizeof(bch_acl_header); + while (p < end) { + const bch_acl_entry *in = p; + + out->e_tag = le16_to_cpu(in->e_tag); + out->e_perm = le16_to_cpu(in->e_perm); + + switch (out->e_tag) { + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + p += sizeof(bch_acl_entry_short); + break; + case ACL_USER: + out->e_uid = make_kuid(&init_user_ns, + le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + out->e_gid = make_kgid(&init_user_ns, + le32_to_cpu(in->e_id)); + p += sizeof(bch_acl_entry); + break; + } + + out++; + } + + BUG_ON(out != acl->a_entries + acl->a_count); + + return acl; +invalid: + pr_err("invalid acl entry"); + return ERR_PTR(-EINVAL); +} + +#define acl_for_each_entry(acl, acl_e) \ + for (acl_e = acl->a_entries; \ + acl_e < acl->a_entries + acl->a_count; \ + acl_e++) + +/* + * Convert from in-memory to filesystem representation. 
+ */ +static struct bkey_i_xattr * +bch2_acl_to_xattr(struct btree_trans *trans, + const struct posix_acl *acl, + int type) +{ + struct bkey_i_xattr *xattr; + bch_acl_header *acl_header; + const struct posix_acl_entry *acl_e; + void *outptr; + unsigned nr_short = 0, nr_long = 0, acl_len, u64s; + + acl_for_each_entry(acl, acl_e) { + switch (acl_e->e_tag) { + case ACL_USER: + case ACL_GROUP: + nr_long++; + break; + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + nr_short++; + break; + default: + return ERR_PTR(-EINVAL); + } + } + + acl_len = bch2_acl_size(nr_short, nr_long); + u64s = BKEY_U64s + xattr_val_u64s(0, acl_len); + + if (u64s > U8_MAX) + return ERR_PTR(-E2BIG); + + xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(xattr)) + return xattr; + + bkey_xattr_init(&xattr->k_i); + xattr->k.u64s = u64s; + xattr->v.x_type = acl_to_xattr_type(type); + xattr->v.x_name_len = 0; + xattr->v.x_val_len = cpu_to_le16(acl_len); + + acl_header = xattr_val(&xattr->v); + acl_header->a_version = cpu_to_le32(BCH_ACL_VERSION); + + outptr = (void *) acl_header + sizeof(*acl_header); + + acl_for_each_entry(acl, acl_e) { + bch_acl_entry *entry = outptr; + + entry->e_tag = cpu_to_le16(acl_e->e_tag); + entry->e_perm = cpu_to_le16(acl_e->e_perm); + switch (acl_e->e_tag) { + case ACL_USER: + entry->e_id = cpu_to_le32( + from_kuid(&init_user_ns, acl_e->e_uid)); + outptr += sizeof(bch_acl_entry); + break; + case ACL_GROUP: + entry->e_id = cpu_to_le32( + from_kgid(&init_user_ns, acl_e->e_gid)); + outptr += sizeof(bch_acl_entry); + break; + + case ACL_USER_OBJ: + case ACL_GROUP_OBJ: + case ACL_MASK: + case ACL_OTHER: + outptr += sizeof(bch_acl_entry_short); + break; + } + } + + BUG_ON(outptr != xattr_val(&xattr->v) + acl_len); + + return xattr; +} + +struct posix_acl *bch2_get_acl(struct mnt_idmap *idmap, + struct dentry *dentry, int type) +{ + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_s_c_xattr xattr; + struct posix_acl *acl = NULL; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_hash_lookup(&trans, &iter, bch2_xattr_hash_desc, + &hash, inode_inum(inode), &search, 0); + if (ret) { + if (!bch2_err_matches(ret, ENOENT)) + acl = ERR_PTR(ret); + goto out; + } + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) { + acl = ERR_PTR(ret); + goto out; + } + + xattr = bkey_s_c_to_xattr(k); + acl = bch2_acl_from_disk(&trans, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + + if (!IS_ERR(acl)) + set_cached_acl(&inode->v, type, acl); +out: + if (bch2_err_matches(PTR_ERR_OR_ZERO(acl), BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return acl; +} + +int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, + struct posix_acl *acl, int type) +{ + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode_u); + int ret; + + if (type == ACL_TYPE_DEFAULT && + !S_ISDIR(inode_u->bi_mode)) + return acl ? 
-EACCES : 0; + + if (acl) { + struct bkey_i_xattr *xattr = + bch2_acl_to_xattr(trans, acl, type); + if (IS_ERR(xattr)) + return PTR_ERR(xattr); + + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, &hash_info, + inum, &xattr->k_i, 0); + } else { + struct xattr_search_key search = + X_SEARCH(acl_to_xattr_type(type), "", 0); + + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, &hash_info, + inum, &search); + } + + return bch2_err_matches(ret, ENOENT) ? 0 : ret; +} + +int bch2_set_acl(struct mnt_idmap *idmap, + struct dentry *dentry, + struct posix_acl *_acl, int type) +{ + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl; + umode_t mode; + int ret; + + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + acl = _acl; + + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); + if (ret) + goto btree_err; + + mode = inode_u.bi_mode; + + if (type == ACL_TYPE_ACCESS) { + ret = posix_acl_update_mode(idmap, &inode->v, &mode, &acl); + if (ret) + goto btree_err; + } + + ret = bch2_set_acl_trans(&trans, inode_inum(inode), &inode_u, acl, type); + if (ret) + goto btree_err; + + inode_u.bi_ctime = bch2_current_time(c); + inode_u.bi_mode = mode; + + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, 0); +btree_err: + bch2_trans_iter_exit(&trans, &inode_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) + goto err; + + bch2_inode_update_after_write(&trans, inode, &inode_u, + ATTR_CTIME|ATTR_MODE); + + set_cached_acl(&inode->v, type, acl); +err: + bch2_trans_exit(&trans); + mutex_unlock(&inode->ei_update_lock); + + return ret; +} + +int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode, + umode_t mode, + struct posix_acl **new_acl) +{ + struct bch_hash_info hash_info = bch2_hash_info_init(trans->c, inode); + struct xattr_search_key search = X_SEARCH(KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS, "", 0); + struct btree_iter iter; + struct bkey_s_c_xattr xattr; + struct bkey_i_xattr *new; + struct posix_acl *acl; + struct bkey_s_c k; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, + &hash_info, inum, &search, BTREE_ITER_INTENT); + if (ret) + return bch2_err_matches(ret, ENOENT) ? 
0 : ret; + + k = bch2_btree_iter_peek_slot(&iter); + xattr = bkey_s_c_to_xattr(k); + if (ret) + goto err; + + acl = bch2_acl_from_disk(trans, xattr_val(xattr.v), + le16_to_cpu(xattr.v->x_val_len)); + ret = PTR_ERR_OR_ZERO(acl); + if (IS_ERR_OR_NULL(acl)) + goto err; + + ret = allocate_dropping_locks_errcode(trans, + __posix_acl_chmod(&acl, _gfp, mode)); + if (ret) + goto err; + + new = bch2_acl_to_xattr(trans, acl, ACL_TYPE_ACCESS); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto err; + } + + new->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); + *new_acl = acl; + acl = NULL; +err: + bch2_trans_iter_exit(trans, &iter); + if (!IS_ERR_OR_NULL(acl)) + kfree(acl); + return ret; +} + +#endif /* CONFIG_BCACHEFS_POSIX_ACL */ diff --git a/fs/bcachefs/acl.h b/fs/bcachefs/acl.h new file mode 100644 index 000000000..bb21d8d69 --- /dev/null +++ b/fs/bcachefs/acl.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ACL_H +#define _BCACHEFS_ACL_H + +struct bch_inode_unpacked; +struct bch_hash_info; +struct bch_inode_info; +struct posix_acl; + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + +#define BCH_ACL_VERSION 0x0001 + +typedef struct { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +} bch_acl_entry; + +typedef struct { + __le16 e_tag; + __le16 e_perm; +} bch_acl_entry_short; + +typedef struct { + __le32 a_version; +} bch_acl_header; + +struct posix_acl *bch2_get_acl(struct mnt_idmap *, struct dentry *, int); + +int bch2_set_acl_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + struct posix_acl *, int); +int bch2_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); +int bch2_acl_chmod(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + umode_t, struct posix_acl **); + +#else + +static inline int bch2_set_acl_trans(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, + struct posix_acl *acl, int type) +{ + return 0; +} + +static inline int bch2_acl_chmod(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode, + umode_t mode, + struct posix_acl **new_acl) +{ + return 0; +} + +#endif /* CONFIG_BCACHEFS_POSIX_ACL */ + +#endif /* _BCACHEFS_ACL_H */ diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c new file mode 100644 index 000000000..8d8481fc1 --- /dev/null +++ b/fs/bcachefs/alloc_background.c @@ -0,0 +1,2209 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "buckets_waiting_for_journal.h" +#include "clock.h" +#include "debug.h" +#include "ec.h" +#include "error.h" +#include "lru.h" +#include "recovery.h" +#include "trace.h" +#include "varint.h" + +#include +#include +#include +#include +#include +#include +#include + +/* Persistent alloc info: */ + +static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { +#define x(name, bits) [BCH_ALLOC_FIELD_V1_##name] = bits / 8, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + +struct bkey_alloc_unpacked { + u64 journal_seq; + u8 gen; + u8 oldest_gen; + u8 data_type; + bool need_discard:1; + bool need_inc_gen:1; +#define x(_name, _bits) u##_bits _name; + BCH_ALLOC_FIELDS_V2() +#undef x +}; + +static inline u64 alloc_field_v1_get(const struct bch_alloc *a, + const void **p, 
unsigned field) +{ + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; + u64 v; + + if (!(a->fields & (1 << field))) + return 0; + + switch (bytes) { + case 1: + v = *((const u8 *) *p); + break; + case 2: + v = le16_to_cpup(*p); + break; + case 4: + v = le32_to_cpup(*p); + break; + case 8: + v = le64_to_cpup(*p); + break; + default: + BUG(); + } + + *p += bytes; + return v; +} + +static inline void alloc_field_v1_put(struct bkey_i_alloc *a, void **p, + unsigned field, u64 v) +{ + unsigned bytes = BCH_ALLOC_V1_FIELD_BYTES[field]; + + if (!v) + return; + + a->v.fields |= 1 << field; + + switch (bytes) { + case 1: + *((u8 *) *p) = v; + break; + case 2: + *((__le16 *) *p) = cpu_to_le16(v); + break; + case 4: + *((__le32 *) *p) = cpu_to_le32(v); + break; + case 8: + *((__le64 *) *p) = cpu_to_le64(v); + break; + default: + BUG(); + } + + *p += bytes; +} + +static void bch2_alloc_unpack_v1(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + const struct bch_alloc *in = bkey_s_c_to_alloc(k).v; + const void *d = in->data; + unsigned idx = 0; + + out->gen = in->gen; + +#define x(_name, _bits) out->_name = alloc_field_v1_get(in, &d, idx++); + BCH_ALLOC_FIELDS_V1() +#undef x +} + +static int bch2_alloc_unpack_v2(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v2 a = bkey_s_c_to_alloc_v2(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static int bch2_alloc_unpack_v3(struct bkey_alloc_unpacked *out, + struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v3 a = bkey_s_c_to_alloc_v3(k); + const u8 *in = a.v->data; + const u8 *end = bkey_val_end(a); + unsigned fieldnr = 0; + int ret; + u64 v; + + out->gen = a.v->gen; + out->oldest_gen = a.v->oldest_gen; + out->data_type = a.v->data_type; + out->need_discard = BCH_ALLOC_V3_NEED_DISCARD(a.v); + out->need_inc_gen = BCH_ALLOC_V3_NEED_INC_GEN(a.v); + out->journal_seq = le64_to_cpu(a.v->journal_seq); + +#define x(_name, _bits) \ + if (fieldnr < a.v->nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v = 0; \ + } \ + out->_name = v; \ + if (v != out->_name) \ + return -1; \ + fieldnr++; + + BCH_ALLOC_FIELDS_V2() +#undef x + return 0; +} + +static struct bkey_alloc_unpacked bch2_alloc_unpack(struct bkey_s_c k) +{ + struct bkey_alloc_unpacked ret = { .gen = 0 }; + + switch (k.k->type) { + case KEY_TYPE_alloc: + bch2_alloc_unpack_v1(&ret, k); + break; + case KEY_TYPE_alloc_v2: + bch2_alloc_unpack_v2(&ret, k); + break; + case KEY_TYPE_alloc_v3: + bch2_alloc_unpack_v3(&ret, k); + break; + } + + return ret; +} + +static unsigned bch_alloc_v1_val_u64s(const struct bch_alloc *a) +{ + unsigned i, bytes = offsetof(struct bch_alloc, data); + + for (i = 0; i < ARRAY_SIZE(BCH_ALLOC_V1_FIELD_BYTES); i++) + if (a->fields & (1 << i)) + bytes += BCH_ALLOC_V1_FIELD_BYTES[i]; + + return DIV_ROUND_UP(bytes, sizeof(u64)); +} + +int bch2_alloc_v1_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_alloc a = bkey_s_c_to_alloc(k); + + 
/* allow for unknown fields */ + if (bkey_val_u64s(a.k) < bch_alloc_v1_val_u64s(a.v)) { + prt_printf(err, "incorrect value size (%zu < %u)", + bkey_val_u64s(a.k), bch_alloc_v1_val_u64s(a.v)); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +int bch2_alloc_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_alloc_unpacked u; + + if (bch2_alloc_unpack_v2(&u, k)) { + prt_printf(err, "unpack error"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +int bch2_alloc_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_alloc_unpacked u; + + if (bch2_alloc_unpack_v3(&u, k)) { + prt_printf(err, "unpack error"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +int bch2_alloc_v4_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + struct bkey_s_c_alloc_v4 a = bkey_s_c_to_alloc_v4(k); + int rw = flags & WRITE; + + if (alloc_v4_u64s(a.v) > bkey_val_u64s(k.k)) { + prt_printf(err, "bad val size (%u > %lu)", + alloc_v4_u64s(a.v), bkey_val_u64s(k.k)); + return -BCH_ERR_invalid_bkey; + } + + if (!BCH_ALLOC_V4_BACKPOINTERS_START(a.v) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v)) { + prt_printf(err, "invalid backpointers_start"); + return -BCH_ERR_invalid_bkey; + } + + if (rw == WRITE && + !(flags & BKEY_INVALID_JOURNAL) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_btree_backpointers) { + unsigned i, bp_len = 0; + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a.v); i++) + bp_len += alloc_v4_backpointers_c(a.v)[i].bucket_len; + + if (bp_len > a.v->dirty_sectors) { + prt_printf(err, "too many backpointers"); + return -BCH_ERR_invalid_bkey; + } + } + + if (rw == WRITE) { + if (alloc_data_type(*a.v, a.v->data_type) != a.v->data_type) { + prt_printf(err, "invalid data type (got %u should be %u)", + a.v->data_type, alloc_data_type(*a.v, a.v->data_type)); + return -BCH_ERR_invalid_bkey; + } + + switch (a.v->data_type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + if (a.v->dirty_sectors || + a.v->cached_sectors || + a.v->stripe) { + prt_printf(err, "empty data type free but have data"); + return -BCH_ERR_invalid_bkey; + } + break; + case BCH_DATA_sb: + case BCH_DATA_journal: + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + if (!a.v->dirty_sectors) { + prt_printf(err, "data_type %s but dirty_sectors==0", + bch2_data_types[a.v->data_type]); + return -BCH_ERR_invalid_bkey; + } + break; + case BCH_DATA_cached: + if (!a.v->cached_sectors || + a.v->dirty_sectors || + a.v->stripe) { + prt_printf(err, "data type inconsistency"); + return -BCH_ERR_invalid_bkey; + } + + if (!a.v->io_time[READ] && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs) { + prt_printf(err, "cached bucket with read_time == 0"); + return -BCH_ERR_invalid_bkey; + } + break; + case BCH_DATA_stripe: + if (!a.v->stripe) { + prt_printf(err, "data_type %s but stripe==0", + bch2_data_types[a.v->data_type]); + return -BCH_ERR_invalid_bkey; + } + break; + } + } + + return 0; +} + +static inline u64 swab40(u64 x) +{ + return (((x & 0x00000000ffULL) << 32)| + ((x & 0x000000ff00ULL) << 16)| + ((x & 0x0000ff0000ULL) >> 0)| + ((x & 0x00ff000000ULL) >> 16)| + ((x & 0xff00000000ULL) >> 32)); +} + +void bch2_alloc_v4_swab(struct bkey_s k) +{ + struct bch_alloc_v4 *a = bkey_s_to_alloc_v4(k).v; + struct bch_backpointer *bp, *bps; + + a->journal_seq = swab64(a->journal_seq); + 
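swab40() above byte-reverses a 40-bit quantity kept in the low five bytes of a u64; the backpointer bucket_offset fields swabbed in the loop below are such values. A quick standalone check of the masks and shifts:

/* Standalone sanity check of the 40-bit byte swap defined above (illustrative). */
#include <assert.h>
#include <stdint.h>

static uint64_t swab40(uint64_t x)
{
	return (((x & 0x00000000ffULL) << 32)|
		((x & 0x000000ff00ULL) << 16)|
		((x & 0x0000ff0000ULL) >>  0)|
		((x & 0x00ff000000ULL) >> 16)|
		((x & 0xff00000000ULL) >> 32));
}

int main(void)
{
	/* bytes 01 02 03 04 05 come back as 05 04 03 02 01 */
	assert(swab40(0x0102030405ULL) == 0x0504030201ULL);

	/* byte reversal is an involution for any 40-bit value */
	assert(swab40(swab40(0xdeadbeef42ULL)) == 0xdeadbeef42ULL);
	return 0;
}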
a->flags = swab32(a->flags); + a->dirty_sectors = swab32(a->dirty_sectors); + a->cached_sectors = swab32(a->cached_sectors); + a->io_time[0] = swab64(a->io_time[0]); + a->io_time[1] = swab64(a->io_time[1]); + a->stripe = swab32(a->stripe); + a->nr_external_backpointers = swab32(a->nr_external_backpointers); + + bps = alloc_v4_backpointers(a); + for (bp = bps; bp < bps + BCH_ALLOC_V4_NR_BACKPOINTERS(a); bp++) { + bp->bucket_offset = swab40(bp->bucket_offset); + bp->bucket_len = swab32(bp->bucket_len); + bch2_bpos_swab(&bp->pos); + } +} + +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); + unsigned i; + + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "gen %u oldest_gen %u data_type %s", + a->gen, a->oldest_gen, + a->data_type < BCH_DATA_NR + ? bch2_data_types[a->data_type] + : "(invalid data type)"); + prt_newline(out); + prt_printf(out, "journal_seq %llu", a->journal_seq); + prt_newline(out); + prt_printf(out, "need_discard %llu", BCH_ALLOC_V4_NEED_DISCARD(a)); + prt_newline(out); + prt_printf(out, "need_inc_gen %llu", BCH_ALLOC_V4_NEED_INC_GEN(a)); + prt_newline(out); + prt_printf(out, "dirty_sectors %u", a->dirty_sectors); + prt_newline(out); + prt_printf(out, "cached_sectors %u", a->cached_sectors); + prt_newline(out); + prt_printf(out, "stripe %u", a->stripe); + prt_newline(out); + prt_printf(out, "stripe_redundancy %u", a->stripe_redundancy); + prt_newline(out); + prt_printf(out, "io_time[READ] %llu", a->io_time[READ]); + prt_newline(out); + prt_printf(out, "io_time[WRITE] %llu", a->io_time[WRITE]); + prt_newline(out); + prt_printf(out, "fragmentation %llu", a->fragmentation_lru); + prt_newline(out); + prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a)); + prt_newline(out); + + if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) { + struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k); + const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v); + + prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v)); + printbuf_indent_add(out, 2); + + for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) { + prt_newline(out); + bch2_backpointer_to_text(out, &bps[i]); + } + + printbuf_indent_sub(out, 2); + } + + printbuf_indent_sub(out, 2); +} + +void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) +{ + if (k.k->type == KEY_TYPE_alloc_v4) { + void *src, *dst; + + *out = *bkey_s_c_to_alloc_v4(k).v; + + src = alloc_v4_backpointers(out); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + dst = alloc_v4_backpointers(out); + + if (src < dst) + memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(out, 0); + } else { + struct bkey_alloc_unpacked u = bch2_alloc_unpack(k); + + *out = (struct bch_alloc_v4) { + .journal_seq = u.journal_seq, + .flags = u.need_discard, + .gen = u.gen, + .oldest_gen = u.oldest_gen, + .data_type = u.data_type, + .stripe_redundancy = u.stripe_redundancy, + .dirty_sectors = u.dirty_sectors, + .cached_sectors = u.cached_sectors, + .io_time[READ] = u.read_time, + .io_time[WRITE] = u.write_time, + .stripe = u.stripe, + }; + + SET_BCH_ALLOC_V4_BACKPOINTERS_START(out, BCH_ALLOC_V4_U64s); + } +} + +static noinline struct bkey_i_alloc_v4 * +__bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_i_alloc_v4 *ret; + + ret = bch2_trans_kmalloc(trans, max(bkey_bytes(k.k), sizeof(struct bkey_i_alloc_v4))); + if (IS_ERR(ret)) + return ret; 
+ + if (k.k->type == KEY_TYPE_alloc_v4) { + void *src, *dst; + + bkey_reassemble(&ret->k_i, k); + + src = alloc_v4_backpointers(&ret->v); + SET_BCH_ALLOC_V4_BACKPOINTERS_START(&ret->v, BCH_ALLOC_V4_U64s); + dst = alloc_v4_backpointers(&ret->v); + + if (src < dst) + memset(src, 0, dst - src); + + SET_BCH_ALLOC_V4_NR_BACKPOINTERS(&ret->v, 0); + set_alloc_v4_u64s(ret); + } else { + bkey_alloc_v4_init(&ret->k_i); + ret->k.p = k.k->p; + bch2_alloc_to_v4(k, &ret->v); + } + return ret; +} + +static inline struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut_inlined(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bkey_s_c_alloc_v4 a; + + if (likely(k.k->type == KEY_TYPE_alloc_v4) && + ((a = bkey_s_c_to_alloc_v4(k), true) && + BCH_ALLOC_V4_NR_BACKPOINTERS(a.v) == 0)) + return bch2_bkey_make_mut_noupdate_typed(trans, k, alloc_v4); + + return __bch2_alloc_to_v4_mut(trans, k); +} + +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *trans, struct bkey_s_c k) +{ + return bch2_alloc_to_v4_mut_inlined(trans, k); +} + +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) +{ + struct bkey_s_c k; + struct bkey_i_alloc_v4 *a; + int ret; + + k = bch2_bkey_get_iter(trans, iter, BTREE_ID_alloc, pos, + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bkey_err(k); + if (unlikely(ret)) + return ERR_PTR(ret); + + a = bch2_alloc_to_v4_mut_inlined(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (unlikely(ret)) + goto err; + return a; +err: + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); +} + +static struct bpos alloc_gens_pos(struct bpos pos, unsigned *offset) +{ + *offset = pos.offset & KEY_TYPE_BUCKET_GENS_MASK; + + pos.offset >>= KEY_TYPE_BUCKET_GENS_BITS; + return pos; +} + +static struct bpos bucket_gens_pos_to_alloc(struct bpos pos, unsigned offset) +{ + pos.offset <<= KEY_TYPE_BUCKET_GENS_BITS; + pos.offset += offset; + return pos; +} + +static unsigned alloc_gen(struct bkey_s_c k, unsigned offset) +{ + return k.k->type == KEY_TYPE_bucket_gens + ? 
bkey_s_c_to_bucket_gens(k).v->gens[offset] + : 0; +} + +int bch2_bucket_gens_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (bkey_val_bytes(k.k) != sizeof(struct bch_bucket_gens)) { + prt_printf(err, "bad val size (%lu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_bucket_gens)); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_bucket_gens g = bkey_s_c_to_bucket_gens(k); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(g.v->gens); i++) { + if (i) + prt_char(out, ' '); + prt_printf(out, "%u", g.v->gens[i]); + } +} + +int bch2_bucket_gens_init(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a; + struct bkey_i_bucket_gens g; + bool have_bucket_gens_key = false; + unsigned offset; + struct bpos pos; + u8 gen; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + continue; + + gen = bch2_alloc_to_v4(k, &a)->gen; + pos = alloc_gens_pos(iter.pos, &offset); + + if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) { + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + if (ret) + break; + have_bucket_gens_key = false; + } + + if (!have_bucket_gens_key) { + bkey_bucket_gens_init(&g.k_i); + g.k.p = pos; + have_bucket_gens_key = true; + } + + g.v.gens[offset] = gen; + } + bch2_trans_iter_exit(&trans, &iter); + + if (have_bucket_gens_key && !ret) + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(&trans, BTREE_ID_bucket_gens, &g.k_i, 0)); + + bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +int bch2_alloc_read(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + int ret; + + down_read(&c->gc_lock); + bch2_trans_init(&trans, c, 0, 0); + + if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) { + const struct bch_bucket_gens *g; + u64 b; + + for_each_btree_key(&trans, iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + + if (k.k->type != KEY_TYPE_bucket_gens) + continue; + + g = bkey_s_c_to_bucket_gens(k).v; + + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_exists2(c, k.k->p.inode)) + continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + for (b = max_t(u64, ca->mi.first_bucket, start); + b < min_t(u64, ca->mi.nbuckets, end); + b++) + *bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK]; + } + bch2_trans_iter_exit(&trans, &iter); + } else { + struct bch_alloc_v4 a; + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + /* + * Not a fsck error because this is checked/repaired by + * bch2_check_alloc_key() which runs later: + */ + if (!bch2_dev_bucket_exists(c, k.k->p)) + continue; + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + *bucket_gen(ca, 
k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen; + } + bch2_trans_iter_exit(&trans, &iter); + } + + bch2_trans_exit(&trans); + up_read(&c->gc_lock); + + if (ret) + bch_err_fn(c, ret); + + return ret; +} + +/* Free space/discard btree: */ + +static int bch2_bucket_do_index(struct btree_trans *trans, + struct bkey_s_c alloc_k, + const struct bch_alloc_v4 *a, + bool set) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); + struct btree_iter iter; + struct bkey_s_c old; + struct bkey_i *k; + enum btree_id btree; + enum bch_bkey_type old_type = !set ? KEY_TYPE_set : KEY_TYPE_deleted; + enum bch_bkey_type new_type = set ? KEY_TYPE_set : KEY_TYPE_deleted; + struct printbuf buf = PRINTBUF; + int ret; + + if (a->data_type != BCH_DATA_free && + a->data_type != BCH_DATA_need_discard) + return 0; + + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.type = new_type; + + switch (a->data_type) { + case BCH_DATA_free: + btree = BTREE_ID_freespace; + k->k.p = alloc_freespace_pos(alloc_k.k->p, *a); + bch2_key_resize(&k->k, 1); + break; + case BCH_DATA_need_discard: + btree = BTREE_ID_need_discard; + k->k.p = alloc_k.k->p; + break; + default: + return 0; + } + + old = bch2_bkey_get_iter(trans, &iter, btree, + bkey_start_pos(&k->k), + BTREE_ITER_INTENT); + ret = bkey_err(old); + if (ret) + return ret; + + if (ca->mi.freespace_initialized && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info && + bch2_trans_inconsistent_on(old.k->type != old_type, trans, + "incorrect key when %s %s:%llu:%llu:0 (got %s should be %s)\n" + " for %s", + set ? "setting" : "clearing", + bch2_btree_ids[btree], + iter.pos.inode, + iter.pos.offset, + bch2_bkey_types[old.k->type], + bch2_bkey_types[old_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = -EIO; + goto err; + } + + ret = bch2_trans_update(trans, &iter, k, 0); +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static noinline int bch2_bucket_gen_update(struct btree_trans *trans, + struct bpos bucket, u8 gen) +{ + struct btree_iter iter; + unsigned offset; + struct bpos pos = alloc_gens_pos(bucket, &offset); + struct bkey_i_bucket_gens *g; + struct bkey_s_c k; + int ret; + + g = bch2_trans_kmalloc(trans, sizeof(*g)); + ret = PTR_ERR_OR_ZERO(g); + if (ret) + return ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_bucket_gens, pos, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES); + ret = bkey_err(k); + if (ret) + return ret; + + if (k.k->type != KEY_TYPE_bucket_gens) { + bkey_bucket_gens_init(&g->k_i); + g->k.p = iter.pos; + } else { + bkey_reassemble(&g->k_i, k); + } + + g->v.gens[offset] = gen; + + ret = bch2_trans_update(trans, &iter, &g->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_trans_mark_alloc(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_alloc_v4 old_a_convert, *new_a; + const struct bch_alloc_v4 *old_a; + u64 old_lru, new_lru; + int ret = 0; + + /* + * Deletion only happens in the device removal path, with + * BTREE_TRIGGER_NORUN: + */ + BUG_ON(new->k.type != KEY_TYPE_alloc_v4); + + old_a = bch2_alloc_to_v4(old, &old_a_convert); + new_a = &bkey_i_to_alloc_v4(new)->v; + + new_a->data_type = alloc_data_type(*new_a, new_a->data_type); + + if (new_a->dirty_sectors > old_a->dirty_sectors || + new_a->cached_sectors > 
old_a->cached_sectors) { + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now)); + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true); + SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true); + } + + if (data_type_is_empty(new_a->data_type) && + BCH_ALLOC_V4_NEED_INC_GEN(new_a) && + !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) { + new_a->gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); + } + + if (old_a->data_type != new_a->data_type || + (new_a->data_type == BCH_DATA_free && + alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) { + ret = bch2_bucket_do_index(trans, old, old_a, false) ?: + bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true); + if (ret) + return ret; + } + + if (new_a->data_type == BCH_DATA_cached && + !new_a->io_time[READ]) + new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + + old_lru = alloc_lru_idx_read(*old_a); + new_lru = alloc_lru_idx_read(*new_a); + + if (old_lru != new_lru) { + ret = bch2_lru_change(trans, new->k.p.inode, + bucket_to_u64(new->k.p), + old_lru, new_lru); + if (ret) + return ret; + } + + new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a, + bch_dev_bkey_exists(c, new->k.p.inode)); + + if (old_a->fragmentation_lru != new_a->fragmentation_lru) { + ret = bch2_lru_change(trans, + BCH_LRU_FRAGMENTATION_START, + bucket_to_u64(new->k.p), + old_a->fragmentation_lru, new_a->fragmentation_lru); + if (ret) + return ret; + } + + if (old_a->gen != new_a->gen) { + ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen); + if (ret) + return ret; + } + + return 0; +} + +/* + * This synthesizes deleted extents for holes, similar to BTREE_ITER_SLOTS for + * extents style btrees, but works on non-extents btrees: + */ +static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) +{ + struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + + if (bkey_err(k)) + return k; + + if (k.k->type) { + return k; + } else { + struct btree_iter iter2; + struct bpos next; + + bch2_trans_copy_iter(&iter2, iter); + + if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX)) + end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p)); + + end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1)); + + /* + * btree node min/max is a closed interval, upto takes a half + * open interval: + */ + k = bch2_btree_iter_peek_upto(&iter2, end); + next = iter2.pos; + bch2_trans_iter_exit(iter->trans, &iter2); + + BUG_ON(next.offset >= iter->pos.offset + U32_MAX); + + if (bkey_err(k)) + return k; + + bkey_init(hole); + hole->p = iter->pos; + + bch2_key_resize(hole, next.offset - iter->pos.offset); + return (struct bkey_s_c) { hole, NULL }; + } +} + +static bool next_bucket(struct bch_fs *c, struct bpos *bucket) +{ + struct bch_dev *ca; + unsigned iter; + + if (bch2_dev_bucket_exists(c, *bucket)) + return true; + + if (bch2_dev_exists2(c, bucket->inode)) { + ca = bch_dev_bkey_exists(c, bucket->inode); + + if (bucket->offset < ca->mi.first_bucket) { + bucket->offset = ca->mi.first_bucket; + return true; + } + + bucket->inode++; + bucket->offset = 0; + } + + rcu_read_lock(); + iter = bucket->inode; + ca = __bch2_next_dev(c, &iter, NULL); + if (ca) + *bucket = POS(ca->dev_idx, ca->mi.first_bucket); + rcu_read_unlock(); + + return ca != NULL; +} + +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, struct bkey *hole) +{ + struct bch_fs *c 
= iter->trans->c; + struct bkey_s_c k; +again: + k = bch2_get_key_or_hole(iter, POS_MAX, hole); + if (bkey_err(k)) + return k; + + if (!k.k->type) { + struct bpos bucket = bkey_start_pos(k.k); + + if (!bch2_dev_bucket_exists(c, bucket)) { + if (!next_bucket(c, &bucket)) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bucket); + goto again; + } + + if (!bch2_dev_bucket_exists(c, k.k->p)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + + bch2_key_resize(hole, ca->mi.nbuckets - bucket.offset); + } + } + + return k; +} + +static noinline_for_stack +int bch2_check_alloc_key(struct btree_trans *trans, + struct bkey_s_c alloc_k, + struct btree_iter *alloc_iter, + struct btree_iter *discard_iter, + struct btree_iter *freespace_iter, + struct btree_iter *bucket_gens_iter) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + unsigned discard_key_type, freespace_key_type; + unsigned gens_offset; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_k.k->p), c, + "alloc key for invalid device:bucket %llu:%llu", + alloc_k.k->p.inode, alloc_k.k->p.offset)) + return bch2_btree_delete_at(trans, alloc_iter, 0); + + ca = bch_dev_bkey_exists(c, alloc_k.k->p.inode); + if (!ca->mi.freespace_initialized) + return 0; + + a = bch2_alloc_to_v4(alloc_k, &a_convert); + + discard_key_type = a->data_type == BCH_DATA_need_discard ? KEY_TYPE_set : 0; + bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); + k = bch2_btree_iter_peek_slot(discard_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != discard_key_type && + (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect key in need_discard btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[discard_key_type], + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = discard_key_type; + update->k.p = discard_iter->pos; + + ret = bch2_trans_update(trans, discard_iter, update, 0); + if (ret) + goto err; + } + + freespace_key_type = a->data_type == BCH_DATA_free ? 
KEY_TYPE_set : 0; + bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); + k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != freespace_key_type && + (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect key in freespace btree (got %s should be %s)\n" + " %s", + bch2_bkey_types[k.k->type], + bch2_bkey_types[freespace_key_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = freespace_key_type; + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, 1); + + ret = bch2_trans_update(trans, freespace_iter, update, 0); + if (ret) + goto err; + } + + bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); + k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (a->gen != alloc_gen(k, gens_offset) && + (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect gen in bucket_gens btree (got %u should be %u)\n" + " %s", + alloc_gen(k, gens_offset), a->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)))) { + struct bkey_i_bucket_gens *g = + bch2_trans_kmalloc(trans, sizeof(*g)); + + ret = PTR_ERR_OR_ZERO(g); + if (ret) + goto err; + + if (k.k->type == KEY_TYPE_bucket_gens) { + bkey_reassemble(&g->k_i, k); + } else { + bkey_bucket_gens_init(&g->k_i); + g->k.p = alloc_gens_pos(alloc_k.k->p, &gens_offset); + } + + g->v.gens[gens_offset] = a->gen; + + ret = bch2_trans_update(trans, bucket_gens_iter, &g->k_i, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static noinline_for_stack +int bch2_check_alloc_hole_freespace(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *freespace_iter) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + ca = bch_dev_bkey_exists(c, start.inode); + if (!ca->mi.freespace_initialized) + return 0; + + bch2_btree_iter_set_pos(freespace_iter, start); + + k = bch2_btree_iter_peek_slot(freespace_iter); + ret = bkey_err(k); + if (ret) + goto err; + + *end = bkey_min(k.k->p, *end); + + if (k.k->type != KEY_TYPE_set && + (c->opts.reconstruct_alloc || + fsck_err(c, "hole in alloc btree missing in freespace btree\n" + " device %llu buckets %llu-%llu", + freespace_iter->pos.inode, + freespace_iter->pos.offset, + end->offset))) { + struct bkey_i *update = + bch2_trans_kmalloc(trans, sizeof(*update)); + + ret = PTR_ERR_OR_ZERO(update); + if (ret) + goto err; + + bkey_init(&update->k); + update->k.type = KEY_TYPE_set; + update->k.p = freespace_iter->pos; + bch2_key_resize(&update->k, + min_t(u64, U32_MAX, end->offset - + freespace_iter->pos.offset)); + + ret = bch2_trans_update(trans, freespace_iter, update, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static noinline_for_stack +int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, + struct bpos start, + struct bpos *end, + struct btree_iter *bucket_gens_iter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + unsigned i, gens_offset, gens_end_offset; + int ret; + + if (c->sb.version < bcachefs_metadata_version_bucket_gens) + return 0; + + bch2_btree_iter_set_pos(bucket_gens_iter, 
alloc_gens_pos(start, &gens_offset)); + + k = bch2_btree_iter_peek_slot(bucket_gens_iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (bkey_cmp(alloc_gens_pos(start, &gens_offset), + alloc_gens_pos(*end, &gens_end_offset))) + gens_end_offset = KEY_TYPE_BUCKET_GENS_NR; + + if (k.k->type == KEY_TYPE_bucket_gens) { + struct bkey_i_bucket_gens g; + bool need_update = false; + + bkey_reassemble(&g.k_i, k); + + for (i = gens_offset; i < gens_end_offset; i++) { + if (fsck_err_on(g.v.gens[i], c, + "hole in alloc btree at %llu:%llu with nonzero gen in bucket_gens btree (%u)", + bucket_gens_pos_to_alloc(k.k->p, i).inode, + bucket_gens_pos_to_alloc(k.k->p, i).offset, + g.v.gens[i])) { + g.v.gens[i] = 0; + need_update = true; + } + } + + if (need_update) { + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(g)); + + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + memcpy(k, &g, sizeof(g)); + + ret = bch2_trans_update(trans, bucket_gens_iter, k, 0); + if (ret) + goto err; + } + } + + *end = bkey_min(*end, bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0)); +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter; + struct bkey_s_c alloc_k; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + u64 genbits; + struct bpos pos; + enum bch_data_type state = iter->btree_id == BTREE_ID_need_discard + ? BCH_DATA_need_discard + : BCH_DATA_free; + struct printbuf buf = PRINTBUF; + int ret; + + pos = iter->pos; + pos.offset &= ~(~0ULL << 56); + genbits = iter->pos.offset & (~0ULL << 56); + + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, pos, 0); + ret = bkey_err(alloc_k); + if (ret) + return ret; + + if (fsck_err_on(!bch2_dev_bucket_exists(c, pos), c, + "entry in %s btree for nonexistant dev:bucket %llu:%llu", + bch2_btree_ids[iter->btree_id], pos.inode, pos.offset)) + goto delete; + + a = bch2_alloc_to_v4(alloc_k, &a_convert); + + if (fsck_err_on(a->data_type != state || + (state == BCH_DATA_free && + genbits != alloc_freespace_genbits(*a)), c, + "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), + bch2_btree_ids[iter->btree_id], + iter->pos.inode, + iter->pos.offset, + a->data_type == state, + genbits >> 56, alloc_freespace_genbits(*a) >> 56)) + goto delete; +out: +fsck_err: + set_btree_iter_dontneed(&alloc_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +delete: + ret = bch2_btree_delete_extent_at(trans, iter, + iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW); + goto out; +} + +static int bch2_check_discard_freespace_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end) +{ + if (!btree_node_type_is_extents(iter->btree_id)) { + return __bch2_check_discard_freespace_key(trans, iter); + } else { + int ret; + + while (!bkey_eq(iter->pos, end) && + !(ret = btree_trans_too_many_iters(trans) ?: + __bch2_check_discard_freespace_key(trans, iter))) + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + + return ret; + } +} + +/* + * We've already checked that generation numbers in the bucket_gens btree are + * valid for buckets that exist; this just checks for keys for nonexistent + * buckets. 
+ */ +static noinline_for_stack +int bch2_check_bucket_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_i_bucket_gens g; + struct bch_dev *ca; + u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset; + u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset; + u64 b; + bool need_update = false, dev_exists; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(k.k->type != KEY_TYPE_bucket_gens); + bkey_reassemble(&g.k_i, k); + + /* if no bch_dev, skip out whether we repair or not */ + dev_exists = bch2_dev_exists2(c, k.k->p.inode); + if (!dev_exists) { + if (fsck_err_on(!dev_exists, c, + "bucket_gens key for invalid device:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + } + goto out; + } + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + if (fsck_err_on(end <= ca->mi.first_bucket || + start >= ca->mi.nbuckets, c, + "bucket_gens key for invalid buckets:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } + + for (b = start; b < ca->mi.first_bucket; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; + } + + for (b = ca->mi.nbuckets; b < end; b++) + if (fsck_err_on(g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK], c, + "bucket_gens key has nonzero gen for invalid bucket")) { + g.v.gens[b & KEY_TYPE_BUCKET_GENS_MASK] = 0; + need_update = true; + } + + if (need_update) { + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(g)); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto out; + + memcpy(k, &g, sizeof(g)); + ret = bch2_trans_update(trans, iter, k, 0); + } +out: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_alloc_info(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter, discard_iter, freespace_iter, bucket_gens_iter; + struct bkey hole; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &discard_iter, BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &freespace_iter, BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH); + bch2_trans_iter_init(&trans, &bucket_gens_iter, BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH); + + while (1) { + struct bpos next; + + bch2_trans_begin(&trans); + + k = bch2_get_key_or_real_bucket_hole(&iter, &hole); + ret = bkey_err(k); + if (ret) + goto bkey_err; + + if (!k.k) + break; + + if (k.k->type) { + next = bpos_nosnap_successor(k.k->p); + + ret = bch2_check_alloc_key(&trans, + k, &iter, + &discard_iter, + &freespace_iter, + &bucket_gens_iter); + if (ret) + goto bkey_err; + } else { + next = k.k->p; + + ret = bch2_check_alloc_hole_freespace(&trans, + bkey_start_pos(k.k), + &next, + &freespace_iter) ?: + bch2_check_alloc_hole_bucket_gens(&trans, + bkey_start_pos(k.k), + &next, + &bucket_gens_iter); + if (ret) + goto bkey_err; + } + + ret = bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_pos(&iter, next); +bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &bucket_gens_iter); + 
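__bch2_check_discard_freespace_key() above recovers the bucket position from the low 56 bits of the key's offset and, for the freespace btree, the generation bits from the top bits, undoing the packing done by alloc_freespace_pos()/alloc_freespace_genbits() in alloc_background.h. A minimal sketch of that round trip, using the same masks but simplified stand-in names:

/*
 * Illustrative round trip of the freespace key packing checked above:
 * offset = bucket | genbits, genbits = ((gen - oldest_gen) >> 4) << 56.
 * Names are simplified stand-ins for the helpers in alloc_background.h.
 */
#include <assert.h>
#include <stdint.h>

static uint64_t demo_genbits(uint8_t gen, uint8_t oldest_gen)
{
	return ((uint64_t)(uint8_t)(gen - oldest_gen) >> 4) << 56;
}

static uint64_t demo_freespace_offset(uint64_t bucket, uint8_t gen, uint8_t oldest_gen)
{
	return bucket | demo_genbits(gen, oldest_gen);
}

int main(void)
{
	uint64_t off = demo_freespace_offset(1234, 200, 130);

	/* low 56 bits give back the bucket, the top bits give back genbits */
	assert((off & ~(~0ULL << 56)) == 1234);
	assert((off &  (~0ULL << 56)) == demo_genbits(200, 130));
	return 0;
}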
bch2_trans_iter_exit(&trans, &freespace_iter); + bch2_trans_iter_exit(&trans, &discard_iter); + bch2_trans_iter_exit(&trans, &iter); + + if (ret < 0) + goto err; + + ret = for_each_btree_key2(&trans, iter, + BTREE_ID_need_discard, POS_MIN, + BTREE_ITER_PREFETCH, k, + bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: + for_each_btree_key2(&trans, iter, + BTREE_ID_freespace, POS_MIN, + BTREE_ITER_PREFETCH, k, + bch2_check_discard_freespace_key(&trans, &iter, k.k->p)) ?: + for_each_btree_key_commit(&trans, iter, + BTREE_ID_bucket_gens, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_bucket_gens_key(&trans, &iter, k)); +err: + bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, + struct btree_iter *alloc_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter lru_iter; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct bkey_s_c alloc_k, lru_k; + struct printbuf buf = PRINTBUF; + int ret; + + alloc_k = bch2_btree_iter_peek(alloc_iter); + if (!alloc_k.k) + return 0; + + ret = bkey_err(alloc_k); + if (ret) + return ret; + + a = bch2_alloc_to_v4(alloc_k, &a_convert); + + if (a->data_type != BCH_DATA_cached) + return 0; + + lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, + lru_pos(alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ]), 0); + ret = bkey_err(lru_k); + if (ret) + return ret; + + if (fsck_err_on(!a->io_time[READ], c, + "cached bucket with read_time 0\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) || + fsck_err_on(lru_k.k->type != KEY_TYPE_set, c, + "missing lru entry\n" + " %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + u64 read_time = a->io_time[READ] ?: + atomic64_read(&c->io_clock[READ].now); + + ret = bch2_lru_set(trans, + alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + read_time); + if (ret) + goto err; + + if (a->io_time[READ] != read_time) { + struct bkey_i_alloc_v4 *a_mut = + bch2_alloc_to_v4_mut(trans, alloc_k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + goto err; + + a_mut->v.io_time[READ] = read_time; + ret = bch2_trans_update(trans, alloc_iter, + &a_mut->k_i, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + } + } +err: +fsck_err: + bch2_trans_iter_exit(trans, &lru_iter); + printbuf_exit(&buf); + return ret; +} + +int bch2_check_alloc_to_lru_refs(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_alloc_to_lru_ref(&trans, &iter))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int bch2_discard_one_bucket(struct btree_trans *trans, + struct btree_iter *need_discard_iter, + struct bpos *discard_pos_done, + u64 *seen, + u64 *open, + u64 *need_journal_commit, + u64 *discarded) +{ + struct bch_fs *c = trans->c; + struct bpos pos = need_discard_iter->pos; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + struct bch_dev *ca; + struct bkey_i_alloc_v4 *a; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ca = bch_dev_bkey_exists(c, pos.inode); + if (!percpu_ref_tryget(&ca->io_ref)) { + bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0)); + return 0; + } + + if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) { + (*open)++; + 
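bch2_check_alloc_to_lru_ref() above keys the LRU btree by packing the bucket's device index and offset into a single 64-bit value via bucket_to_u64() (inode in bits 48-63, offset in bits 0-47); invalidate_one_bucket() below unpacks it again with u64_to_bucket(). A minimal standalone sketch of that round trip, with a simplified struct standing in for struct bpos:

/*
 * Illustrative bucket <-> u64 packing matching bucket_to_u64() and
 * u64_to_bucket(): device index in bits 48-63, bucket offset in bits 0-47.
 * struct demo_bpos is a simplified stand-in for struct bpos.
 */
#include <assert.h>
#include <stdint.h>

struct demo_bpos {
	uint64_t inode;		/* device index */
	uint64_t offset;	/* bucket number within the device */
};

static uint64_t demo_bucket_to_u64(struct demo_bpos b)
{
	return (b.inode << 48) | b.offset;
}

static struct demo_bpos demo_u64_to_bucket(uint64_t v)
{
	return (struct demo_bpos) {
		.inode	= v >> 48,
		.offset	= v & ~(~0ULL << 48),
	};
}

int main(void)
{
	struct demo_bpos b = { .inode = 3, .offset = 123456 };
	struct demo_bpos r = demo_u64_to_bucket(demo_bucket_to_u64(b));

	assert(r.inode == b.inode && r.offset == b.offset);
	return 0;
}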
goto out; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + pos.inode, pos.offset)) { + (*need_journal_commit)++; + goto out; + } + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + need_discard_iter->pos, + BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + goto out; + + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + if (BCH_ALLOC_V4_NEED_INC_GEN(&a->v)) { + a->v.gen++; + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + goto write; + } + + if (a->v.journal_seq > c->journal.flushed_seq_ondisk) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + bch2_trans_inconsistent(trans, + "clearing need_discard but journal_seq %llu > flushed_seq %llu\n" + "%s", + a->v.journal_seq, + c->journal.flushed_seq_ondisk, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + } + goto out; + } + + if (a->v.data_type != BCH_DATA_need_discard) { + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + bch2_trans_inconsistent(trans, + "bucket incorrectly set in need_discard btree\n" + "%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + } + + goto out; + } + + if (!bkey_eq(*discard_pos_done, iter.pos) && + ca->mi.discard && !c->opts.nochanges) { + /* + * This works without any other locks because this is the only + * thread that removes items from the need_discard tree + */ + bch2_trans_unlock(trans); + blkdev_issue_discard(ca->disk_sb.bdev, + k.k->p.offset * ca->mi.bucket_size, + ca->mi.bucket_size, + GFP_KERNEL); + *discard_pos_done = iter.pos; + + ret = bch2_trans_relock_notrace(trans); + if (ret) + goto out; + } + + SET_BCH_ALLOC_V4_NEED_DISCARD(&a->v, false); + a->v.data_type = alloc_data_type(a->v, a->v.data_type); +write: + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); + if (ret) + goto out; + + this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]); + (*discarded)++; +out: + (*seen)++; + bch2_trans_iter_exit(trans, &iter); + percpu_ref_put(&ca->io_ref); + printbuf_exit(&buf); + return ret; +} + +static void bch2_do_discards_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, discard_work); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0; + struct bpos discard_pos_done = POS_MAX; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + /* + * We're doing the commit in bch2_discard_one_bucket instead of using + * for_each_btree_key_commit() so that we can increment counters after + * successful commit: + */ + ret = for_each_btree_key2(&trans, iter, + BTREE_ID_need_discard, POS_MIN, 0, k, + bch2_discard_one_bucket(&trans, &iter, &discard_pos_done, + &seen, + &open, + &need_journal_commit, + &discarded)); + + bch2_trans_exit(&trans); + + if (need_journal_commit * 2 > seen) + bch2_journal_flush_async(&c->journal, NULL); + + bch2_write_ref_put(c, BCH_WRITE_REF_discard); + + trace_discard_buckets(c, seen, open, need_journal_commit, discarded, + bch2_err_str(ret)); +} + +void bch2_do_discards(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_discard) && + !queue_work(c->write_ref_wq, &c->discard_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_discard); +} + +static int invalidate_one_bucket(struct btree_trans *trans, + struct btree_iter *lru_iter, + struct bkey_s_c lru_k, + s64 *nr_to_invalidate) +{ + struct bch_fs 
*c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bkey_i_alloc_v4 *a = NULL; + struct printbuf buf = PRINTBUF; + struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); + unsigned cached_sectors; + int ret = 0; + + if (*nr_to_invalidate <= 0) + return 1; + + if (!bch2_dev_bucket_exists(c, bucket)) { + prt_str(&buf, "lru entry points to invalid bucket"); + goto err; + } + + if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) + return 0; + + a = bch2_trans_start_alloc_update(trans, &alloc_iter, bucket); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + goto out; + + /* We expect harmless races here due to the btree write buffer: */ + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) + goto out; + + BUG_ON(a->v.data_type != BCH_DATA_cached); + + if (!a->v.cached_sectors) + bch_err(c, "invalidating empty bucket, confused"); + + cached_sectors = a->v.cached_sectors; + + SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); + a->v.gen++; + a->v.data_type = 0; + a->v.dirty_sectors = 0; + a->v.cached_sectors = 0; + a->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now); + a->v.io_time[WRITE] = atomic64_read(&c->io_clock[WRITE].now); + + ret = bch2_trans_update(trans, &alloc_iter, &a->k_i, + BTREE_TRIGGER_BUCKET_INVALIDATE) ?: + bch2_trans_commit(trans, NULL, NULL, + BCH_WATERMARK_btree| + BTREE_INSERT_NOFAIL); + if (ret) + goto out; + + trace_and_count(c, bucket_invalidate, c, bucket.inode, bucket.offset, cached_sectors); + --*nr_to_invalidate; +out: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +err: + prt_str(&buf, "\n lru key: "); + bch2_bkey_val_to_text(&buf, c, lru_k); + + prt_str(&buf, "\n lru entry: "); + bch2_lru_pos_to_text(&buf, lru_iter->pos); + + prt_str(&buf, "\n alloc key: "); + if (!a) + bch2_bpos_to_text(&buf, bucket); + else + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&a->k_i)); + + bch_err(c, "%s", buf.buf); + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_lrus) { + bch2_inconsistent_error(c); + ret = -EINVAL; + } + + goto out; +} + +static void bch2_do_invalidates_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work); + struct bch_dev *ca; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + ret = bch2_btree_write_buffer_flush(&trans); + if (ret) + goto err; + + for_each_member_device(ca, c, i) { + s64 nr_to_invalidate = + should_invalidate_buckets(ca, bch2_dev_usage_read(ca)); + + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_lru, + lru_pos(ca->dev_idx, 0, 0), + lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX), + BTREE_ITER_INTENT, k, + invalidate_one_bucket(&trans, &iter, k, &nr_to_invalidate)); + + if (ret < 0) { + percpu_ref_put(&ca->ref); + break; + } + } +err: + bch2_trans_exit(&trans); + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +} + +void bch2_do_invalidates(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate) && + !queue_work(c->write_ref_wq, &c->invalidate_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); +} + +static int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, + unsigned long *last_updated) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey hole; + struct bpos end = POS(ca->dev_idx, ca->mi.nbuckets); + struct bch_member *m; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_alloc, + POS(ca->dev_idx, 
ca->mi.first_bucket), + BTREE_ITER_PREFETCH); + /* + * Scan the alloc btree for every bucket on @ca, and add buckets to the + * freespace/need_discard/need_gc_gens btrees as needed: + */ + while (1) { + if (*last_updated + HZ * 10 < jiffies) { + bch_info(ca, "%s: currently at %llu/%llu", + __func__, iter.pos.offset, ca->mi.nbuckets); + *last_updated = jiffies; + } + + bch2_trans_begin(&trans); + + if (bkey_ge(iter.pos, end)) { + ret = 0; + break; + } + + k = bch2_get_key_or_hole(&iter, end, &hole); + ret = bkey_err(k); + if (ret) + goto bkey_err; + + if (k.k->type) { + /* + * We process live keys in the alloc btree one at a + * time: + */ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + + ret = bch2_bucket_do_index(&trans, k, a, true) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL); + if (ret) + goto bkey_err; + + bch2_btree_iter_advance(&iter); + } else { + struct bkey_i *freespace; + + freespace = bch2_trans_kmalloc(&trans, sizeof(*freespace)); + ret = PTR_ERR_OR_ZERO(freespace); + if (ret) + goto bkey_err; + + bkey_init(&freespace->k); + freespace->k.type = KEY_TYPE_set; + freespace->k.p = k.k->p; + freespace->k.size = k.k->size; + + ret = __bch2_btree_insert(&trans, BTREE_ID_freespace, freespace, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_pos(&iter, k.k->p); + } +bkey_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + } + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (ret < 0) { + bch_err(ca, "error initializing free space: %s", bch2_err_str(ret)); + return ret; + } + + mutex_lock(&c->sb_lock); + m = bch2_sb_get_members(c->disk_sb.sb)->members + ca->dev_idx; + SET_BCH_MEMBER_FREESPACE_INITIALIZED(m, true); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_fs_freespace_init(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; + bool doing_init = false; + unsigned long last_updated = jiffies; + + /* + * We can crash during the device add path, so we need to check this on + * every mount: + */ + + for_each_member_device(ca, c, i) { + if (ca->mi.freespace_initialized) + continue; + + if (!doing_init) { + bch_info(c, "initializing freespace"); + doing_init = true; + } + + ret = bch2_dev_freespace_init(c, ca, &last_updated); + if (ret) { + percpu_ref_put(&ca->ref); + bch_err_fn(c, ret); + return ret; + } + } + + if (doing_init) { + mutex_lock(&c->sb_lock); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + bch_verbose(c, "done initializing freespace"); + } + + return 0; +} + +/* Bucket IO clocks: */ + +int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, + size_t bucket_nr, int rw) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + u64 now; + int ret = 0; + + a = bch2_trans_start_alloc_update(trans, &iter, POS(dev, bucket_nr)); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; + + now = atomic64_read(&c->io_clock[rw].now); + if (a->v.io_time[rw] == now) + goto out; + + a->v.io_time[rw] = now; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* Startup/shutdown (ro/rw): */ + +void bch2_recalc_capacity(struct bch_fs *c) +{ + struct bch_dev *ca; + u64 capacity = 0, reserved_sectors = 0, gc_reserve; + unsigned bucket_size_max = 
0; + unsigned long ra_pages = 0; + unsigned i; + + lockdep_assert_held(&c->state_lock); + + for_each_online_member(ca, c, i) { + struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; + + ra_pages += bdi->ra_pages; + } + + bch2_set_ra_pages(c, ra_pages); + + for_each_rw_member(ca, c, i) { + u64 dev_reserve = 0; + + /* + * We need to reserve buckets (from the number + * of currently available buckets) against + * foreground writes so that mainly copygc can + * make forward progress. + * + * We need enough to refill the various reserves + * from scratch - copygc will use its entire + * reserve all at once, then run against when + * its reserve is refilled (from the formerly + * available buckets). + * + * This reserve is just used when considering if + * allocations for foreground writes must wait - + * not -ENOSPC calculations. + */ + + dev_reserve += ca->nr_btree_reserve * 2; + dev_reserve += ca->mi.nbuckets >> 6; /* copygc reserve */ + + dev_reserve += 1; /* btree write point */ + dev_reserve += 1; /* copygc write point */ + dev_reserve += 1; /* rebalance write point */ + + dev_reserve *= ca->mi.bucket_size; + + capacity += bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket); + + reserved_sectors += dev_reserve * 2; + + bucket_size_max = max_t(unsigned, bucket_size_max, + ca->mi.bucket_size); + } + + gc_reserve = c->opts.gc_reserve_bytes + ? c->opts.gc_reserve_bytes >> 9 + : div64_u64(capacity * c->opts.gc_reserve_percent, 100); + + reserved_sectors = max(gc_reserve, reserved_sectors); + + reserved_sectors = min(reserved_sectors, capacity); + + c->capacity = capacity - reserved_sectors; + + c->bucket_size_max = bucket_size_max; + + /* Wake up case someone was waiting for buckets */ + closure_wake_up(&c->freelist_wait); +} + +static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) +{ + struct open_bucket *ob; + bool ret = false; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && + ob->dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } + + return ret; +} + +/* device goes ro: */ +void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + /* First, remove device from allocation groups: */ + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + clear_bit(ca->dev_idx, c->rw_devs[i].d); + + /* + * Capacity is calculated based off of devices in allocation groups: + */ + bch2_recalc_capacity(c); + + bch2_open_buckets_stop(c, ca, false); + + /* + * Wake up threads that were blocked on allocation, so they can notice + * the device can no longer be removed and the capacity has changed: + */ + closure_wake_up(&c->freelist_wait); + + /* + * journal_res_get() can block waiting for free space in the journal - + * it needs to notice there may not be devices to allocate from anymore: + */ + wake_up(&c->journal.wait); + + /* Now wait for any in flight writes: */ + + closure_wait_event(&c->open_buckets_wait, + !bch2_dev_has_open_write_point(c, ca)); +} + +/* device goes rw: */ +void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + if (ca->mi.data_allowed & (1 << i)) + set_bit(ca->dev_idx, c->rw_devs[i].d); +} + +void bch2_fs_allocator_background_init(struct bch_fs *c) +{ + spin_lock_init(&c->freelist_lock); + INIT_WORK(&c->discard_work, bch2_do_discards_work); + INIT_WORK(&c->invalidate_work, bch2_do_invalidates_work); +} diff --git 
a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h new file mode 100644 index 000000000..c0914feb5 --- /dev/null +++ b/fs/bcachefs/alloc_background.h @@ -0,0 +1,257 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_BACKGROUND_H +#define _BCACHEFS_ALLOC_BACKGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" +#include "buckets.h" +#include "debug.h" +#include "super.h" + +enum bkey_invalid_flags; + +/* How out of date a pointer gen is allowed to be: */ +#define BUCKET_GC_GEN_MAX 96U + +static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) +{ + struct bch_dev *ca; + + if (!bch2_dev_exists2(c, pos.inode)) + return false; + + ca = bch_dev_bkey_exists(c, pos.inode); + return pos.offset >= ca->mi.first_bucket && + pos.offset < ca->mi.nbuckets; +} + +static inline u64 bucket_to_u64(struct bpos bucket) +{ + return (bucket.inode << 48) | bucket.offset; +} + +static inline struct bpos u64_to_bucket(u64 bucket) +{ + return POS(bucket >> 48, bucket & ~(~0ULL << 48)); +} + +static inline u8 alloc_gc_gen(struct bch_alloc_v4 a) +{ + return a.gen - a.oldest_gen; +} + +static inline enum bch_data_type __alloc_data_type(u32 dirty_sectors, + u32 cached_sectors, + u32 stripe, + struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + if (stripe) + return data_type == BCH_DATA_parity ? data_type : BCH_DATA_stripe; + if (dirty_sectors) + return data_type; + if (cached_sectors) + return BCH_DATA_cached; + if (BCH_ALLOC_V4_NEED_DISCARD(&a)) + return BCH_DATA_need_discard; + if (alloc_gc_gen(a) >= BUCKET_GC_GEN_MAX) + return BCH_DATA_need_gc_gens; + return BCH_DATA_free; +} + +static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, + enum bch_data_type data_type) +{ + return __alloc_data_type(a.dirty_sectors, a.cached_sectors, + a.stripe, a, data_type); +} + +static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type) +{ + return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type; +} + +static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a) +{ + return a.data_type == BCH_DATA_cached ? 
a.io_time[READ] : 0; +} + +#define DATA_TYPES_MOVABLE \ + ((1U << BCH_DATA_btree)| \ + (1U << BCH_DATA_user)| \ + (1U << BCH_DATA_stripe)) + +static inline bool data_type_movable(enum bch_data_type type) +{ + return (1U << type) & DATA_TYPES_MOVABLE; +} + +static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a, + struct bch_dev *ca) +{ + if (!data_type_movable(a.data_type) || + a.dirty_sectors >= ca->mi.bucket_size) + return 0; + + return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size); +} + +static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a) +{ + return ((u64) alloc_gc_gen(a) >> 4) << 56; +} + +static inline struct bpos alloc_freespace_pos(struct bpos pos, struct bch_alloc_v4 a) +{ + pos.offset |= alloc_freespace_genbits(a); + return pos; +} + +static inline unsigned alloc_v4_u64s(const struct bch_alloc_v4 *a) +{ + unsigned ret = (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + BCH_ALLOC_V4_U64s_V0) + + BCH_ALLOC_V4_NR_BACKPOINTERS(a) * + (sizeof(struct bch_backpointer) / sizeof(u64)); + + BUG_ON(ret > U8_MAX - BKEY_U64s); + return ret; +} + +static inline void set_alloc_v4_u64s(struct bkey_i_alloc_v4 *a) +{ + set_bkey_val_u64s(&a->k, alloc_v4_u64s(&a->v)); +} + +struct bkey_i_alloc_v4 * +bch2_trans_start_alloc_update(struct btree_trans *, struct btree_iter *, struct bpos); + +void __bch2_alloc_to_v4(struct bkey_s_c, struct bch_alloc_v4 *); + +static inline const struct bch_alloc_v4 *bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *convert) +{ + const struct bch_alloc_v4 *ret; + + if (unlikely(k.k->type != KEY_TYPE_alloc_v4)) + goto slowpath; + + ret = bkey_s_c_to_alloc_v4(k).v; + if (BCH_ALLOC_V4_BACKPOINTERS_START(ret) != BCH_ALLOC_V4_U64s) + goto slowpath; + + return ret; +slowpath: + __bch2_alloc_to_v4(k, convert); + return convert; +} + +struct bkey_i_alloc_v4 *bch2_alloc_to_v4_mut(struct btree_trans *, struct bkey_s_c); + +int bch2_bucket_io_time_reset(struct btree_trans *, unsigned, size_t, int); + +int bch2_alloc_v1_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v2_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v3_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_alloc_v4_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_alloc_v4_swab(struct bkey_s); +void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_alloc ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v1_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 8, \ +}) + +#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v2_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 8, \ +}) + +#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v3_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, \ + .min_val_size = 16, \ +}) + +#define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ + .key_invalid = bch2_alloc_v4_invalid, \ + .val_to_text = bch2_alloc_to_text, \ + .swab = bch2_alloc_v4_swab, \ + .trans_trigger = bch2_trans_mark_alloc, \ + .atomic_trigger = bch2_mark_alloc, 
\ + .min_val_size = 48, \ +}) + +int bch2_bucket_gens_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_bucket_gens_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_bucket_gens ((struct bkey_ops) { \ + .key_invalid = bch2_bucket_gens_invalid, \ + .val_to_text = bch2_bucket_gens_to_text, \ +}) + +int bch2_bucket_gens_init(struct bch_fs *); + +static inline bool bkey_is_alloc(const struct bkey *k) +{ + return k->type == KEY_TYPE_alloc || + k->type == KEY_TYPE_alloc_v2 || + k->type == KEY_TYPE_alloc_v3; +} + +int bch2_alloc_read(struct bch_fs *); + +int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_check_alloc_info(struct bch_fs *); +int bch2_check_alloc_to_lru_refs(struct bch_fs *); +void bch2_do_discards(struct bch_fs *); + +static inline u64 should_invalidate_buckets(struct bch_dev *ca, + struct bch_dev_usage u) +{ + u64 want_free = ca->mi.nbuckets >> 7; + u64 free = max_t(s64, 0, + u.d[BCH_DATA_free].buckets + + u.d[BCH_DATA_need_discard].buckets + - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); + + return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); +} + +void bch2_do_invalidates(struct bch_fs *); + +static inline struct bch_backpointer *alloc_v4_backpointers(struct bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + + (BCH_ALLOC_V4_BACKPOINTERS_START(a) ?: + BCH_ALLOC_V4_U64s_V0)); +} + +static inline const struct bch_backpointer *alloc_v4_backpointers_c(const struct bch_alloc_v4 *a) +{ + return (void *) ((u64 *) &a->v + BCH_ALLOC_V4_BACKPOINTERS_START(a)); +} + +int bch2_fs_freespace_init(struct bch_fs *); + +void bch2_recalc_capacity(struct bch_fs *); + +void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); +void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); + +void bch2_fs_allocator_background_init(struct bch_fs *); + +#endif /* _BCACHEFS_ALLOC_BACKGROUND_H */ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c new file mode 100644 index 000000000..fcb7311b1 --- /dev/null +++ b/fs/bcachefs/alloc_foreground.c @@ -0,0 +1,1536 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2012 Google, Inc. + * + * Foreground allocator code: allocate buckets from freelist, and allocate in + * sector granularity from writepoints. + * + * bch2_bucket_alloc() allocates a single bucket from a specific device. + * + * bch2_bucket_alloc_set() allocates one or more buckets from different devices + * in a given filesystem. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "backpointers.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_gc.h" +#include "buckets.h" +#include "buckets_waiting_for_journal.h" +#include "clock.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io.h" +#include "journal.h" +#include "movinggc.h" +#include "nocow_locking.h" +#include "trace.h" + +#include +#include +#include + +static void bch2_trans_mutex_lock_norelock(struct btree_trans *trans, + struct mutex *lock) +{ + if (!mutex_trylock(lock)) { + bch2_trans_unlock(trans); + mutex_lock(lock); + } +} + +const char * const bch2_watermarks[] = { +#define x(t) #t, + BCH_WATERMARKS() +#undef x + NULL +}; + +/* + * Open buckets represent a bucket that's currently being allocated from. 
They + * serve two purposes: + * + * - They track buckets that have been partially allocated, allowing for + * sub-bucket sized allocations - they're used by the sector allocator below + * + * - They provide a reference to the buckets they own that mark and sweep GC + * can find, until the new allocation has a pointer to it inserted into the + * btree + * + * When allocating some space with the sector allocator, the allocation comes + * with a reference to an open bucket - the caller is required to put that + * reference _after_ doing the index update that makes its allocation reachable. + */ + +void bch2_reset_alloc_cursors(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + ca->alloc_cursor = 0; + rcu_read_unlock(); +} + +static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + ob->hash = *slot; + *slot = idx; +} + +static void bch2_open_bucket_hash_remove(struct bch_fs *c, struct open_bucket *ob) +{ + open_bucket_idx_t idx = ob - c->open_buckets; + open_bucket_idx_t *slot = open_bucket_hashslot(c, ob->dev, ob->bucket); + + while (*slot != idx) { + BUG_ON(!*slot); + slot = &c->open_buckets[*slot].hash; + } + + *slot = ob->hash; + ob->hash = 0; +} + +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (ob->ec) { + ec_stripe_new_put(c, ob->ec, STRIPE_REF_io); + return; + } + + percpu_down_read(&c->mark_lock); + spin_lock(&ob->lock); + + ob->valid = false; + ob->data_type = 0; + + spin_unlock(&ob->lock); + percpu_up_read(&c->mark_lock); + + spin_lock(&c->freelist_lock); + bch2_open_bucket_hash_remove(c, ob); + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + + c->open_buckets_nr_free++; + ca->nr_open_buckets--; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); +} + +void bch2_open_bucket_write_error(struct bch_fs *c, + struct open_buckets *obs, + unsigned dev) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->dev == dev && ob->ec) + bch2_ec_bucket_cancel(c, ob); +} + +static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) +{ + struct open_bucket *ob; + + BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); + + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); + ob->data_type = 0; + + c->open_buckets_nr_free--; + return ob; +} + +static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) +{ + BUG_ON(c->open_buckets_partial_nr >= + ARRAY_SIZE(c->open_buckets_partial)); + + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + c->open_buckets_partial[c->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); +} + +/* _only_ for allocating the journal on a new device: */ +long bch2_bucket_alloc_new_fs(struct bch_dev *ca) +{ + while (ca->new_fs_bucket_idx < ca->mi.nbuckets) { + u64 b = ca->new_fs_bucket_idx++; + + if (!is_superblock_bucket(ca, b) && + (!ca->buckets_nouse || !test_bit(b, ca->buckets_nouse))) + return b; + } + + return -1; +} + +static inline unsigned open_buckets_reserved(enum bch_watermark watermark) +{ + switch (watermark) { + case 
BCH_WATERMARK_reclaim: + return 0; + case BCH_WATERMARK_btree: + case BCH_WATERMARK_btree_copygc: + return OPEN_BUCKETS_COUNT / 4; + case BCH_WATERMARK_copygc: + return OPEN_BUCKETS_COUNT / 3; + default: + return OPEN_BUCKETS_COUNT / 2; + } +} + +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, + u64 bucket, + enum bch_watermark watermark, + const struct bch_alloc_v4 *a, + struct bucket_alloc_state *s, + struct closure *cl) +{ + struct open_bucket *ob; + + if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { + s->skipped_nouse++; + return NULL; + } + + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + s->skipped_open++; + return NULL; + } + + if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, ca->dev_idx, bucket)) { + s->skipped_need_journal_commit++; + return NULL; + } + + if (bch2_bucket_nocow_is_locked(&c->nocow_locks, POS(ca->dev_idx, bucket))) { + s->skipped_nocow++; + return NULL; + } + + spin_lock(&c->freelist_lock); + + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + + if (!c->blocked_allocate_open_bucket) + c->blocked_allocate_open_bucket = local_clock(); + + spin_unlock(&c->freelist_lock); + return ERR_PTR(-BCH_ERR_open_buckets_empty); + } + + /* Recheck under lock: */ + if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { + spin_unlock(&c->freelist_lock); + s->skipped_open++; + return NULL; + } + + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); + + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->dev = ca->dev_idx; + ob->gen = a->gen; + ob->bucket = bucket; + spin_unlock(&ob->lock); + + ca->nr_open_buckets++; + bch2_open_bucket_hash_add(c, ob); + + if (c->blocked_allocate_open_bucket) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate_open_bucket], + c->blocked_allocate_open_bucket); + c->blocked_allocate_open_bucket = 0; + } + + if (c->blocked_allocate) { + bch2_time_stats_update( + &c->times[BCH_TIME_blocked_allocate], + c->blocked_allocate); + c->blocked_allocate = 0; + } + + spin_unlock(&c->freelist_lock); + return ob; +} + +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, + enum bch_watermark watermark, u64 free_entry, + struct bucket_alloc_state *s, + struct bkey_s_c freespace_k, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + struct open_bucket *ob; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + u64 b = free_entry & ~(~0ULL << 56); + unsigned genbits = free_entry >> 56; + struct printbuf buf = PRINTBUF; + int ret; + + if (b < ca->mi.first_bucket || b >= ca->mi.nbuckets) { + prt_printf(&buf, "freespace btree has bucket outside allowed range %u-%llu\n" + " freespace key ", + ca->mi.first_bucket, ca->mi.nbuckets); + bch2_bkey_val_to_text(&buf, c, freespace_k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_alloc, POS(ca->dev_idx, b), + BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) { + ob = ERR_PTR(ret); + goto err; + } + + a = bch2_alloc_to_v4(k, &a_convert); + + if (a->data_type != BCH_DATA_free) { + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + ob = NULL; + goto err; + } + + prt_printf(&buf, "non free bucket in freespace btree\n" + " freespace key "); + bch2_bkey_val_to_text(&buf, c, 
freespace_k); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + if (genbits != (alloc_freespace_genbits(*a) >> 56) && + c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_info) { + prt_printf(&buf, "bucket in freespace btree with wrong genbits (got %u should be %llu)\n" + " freespace key ", + genbits, alloc_freespace_genbits(*a) >> 56); + bch2_bkey_val_to_text(&buf, c, freespace_k); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + bch2_trans_inconsistent(trans, "%s", buf.buf); + ob = ERR_PTR(-EIO); + goto err; + } + + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_extents_to_backpointers) { + struct bch_backpointer bp; + struct bpos bp_pos = POS_MIN; + + ret = bch2_get_next_backpointer(trans, POS(ca->dev_idx, b), -1, + &bp_pos, &bp, + BTREE_ITER_NOPRESERVE); + if (ret) { + ob = ERR_PTR(ret); + goto err; + } + + if (!bkey_eq(bp_pos, POS_MAX)) { + /* + * Bucket may have data in it - we don't call + * bch2_trans_inconsistent() because fsck hasn't + * finished yet + */ + ob = NULL; + goto err; + } + } + + ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl); + if (!ob) + iter.path->preserve = false; +err: + if (iter.trans && iter.path) + set_btree_iter_dontneed(&iter); + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ob; +} + +/* + * This path is for before the freespace btree is initialized: + * + * If ca->new_fs_bucket_idx is nonzero, we haven't yet marked superblock & + * journal buckets - journal buckets will be < ca->new_fs_bucket_idx + */ +static noinline struct open_bucket * +bch2_bucket_alloc_early(struct btree_trans *trans, + struct bch_dev *ca, + enum bch_watermark watermark, + struct bucket_alloc_state *s, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, ca->new_fs_bucket_idx); + u64 alloc_cursor = max(alloc_start, READ_ONCE(ca->alloc_cursor)); + int ret; +again: + for_each_btree_key_norestart(trans, iter, BTREE_ID_alloc, POS(ca->dev_idx, alloc_cursor), + BTREE_ITER_SLOTS, k, ret) { + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + + if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) + break; + + if (ca->new_fs_bucket_idx && + is_superblock_bucket(ca, k.k->p.offset)) + continue; + + a = bch2_alloc_to_v4(k, &a_convert); + + if (a->data_type != BCH_DATA_free) + continue; + + s->buckets_seen++; + + ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl); + if (ob) + break; + } + bch2_trans_iter_exit(trans, &iter); + + ca->alloc_cursor = alloc_cursor; + + if (!ob && ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_cursor > alloc_start) { + alloc_cursor = alloc_start; + goto again; + } + + return ob; +} + +static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, + struct bch_dev *ca, + enum bch_watermark watermark, + struct bucket_alloc_state *s, + struct closure *cl) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct open_bucket *ob = NULL; + u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(ca->alloc_cursor)); + u64 alloc_cursor = alloc_start; + int ret; + + BUG_ON(ca->new_fs_bucket_idx); +again: + for_each_btree_key_norestart(trans, iter, BTREE_ID_freespace, + POS(ca->dev_idx, alloc_cursor), 0, k, ret) { + if (k.k->p.inode != ca->dev_idx) + break; + + for (alloc_cursor = max(alloc_cursor, bkey_start_offset(k.k)); + alloc_cursor < k.k->p.offset; +
alloc_cursor++) { + ret = btree_trans_too_many_iters(trans); + if (ret) { + ob = ERR_PTR(ret); + break; + } + + s->buckets_seen++; + + ob = try_alloc_bucket(trans, ca, watermark, + alloc_cursor, s, k, cl); + if (ob) { + iter.path->preserve = false; + break; + } + } + + if (ob || ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + ca->alloc_cursor = alloc_cursor; + + if (!ob && ret) + ob = ERR_PTR(ret); + + if (!ob && alloc_start > ca->mi.first_bucket) { + alloc_cursor = alloc_start = ca->mi.first_bucket; + goto again; + } + + return ob; +} + +/** + * bch2_bucket_alloc_trans - allocate a single bucket from a specific device + * + * Returns an open_bucket on success, or an ERR_PTR() on failure + */ +static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, + struct bch_dev *ca, + enum bch_watermark watermark, + struct closure *cl, + struct bch_dev_usage *usage) +{ + struct bch_fs *c = trans->c; + struct open_bucket *ob = NULL; + bool freespace = READ_ONCE(ca->mi.freespace_initialized); + u64 avail; + struct bucket_alloc_state s = { 0 }; + bool waiting = false; +again: + bch2_dev_usage_read_fast(ca, usage); + avail = dev_buckets_free(ca, *usage, watermark); + + if (usage->d[BCH_DATA_need_discard].buckets > avail) + bch2_do_discards(c); + + if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) + bch2_do_gc_gens(c); + + if (should_invalidate_buckets(ca, *usage)) + bch2_do_invalidates(c); + + if (!avail) { + if (cl && !waiting) { + closure_wait(&c->freelist_wait, cl); + waiting = true; + goto again; + } + + if (!c->blocked_allocate) + c->blocked_allocate = local_clock(); + + ob = ERR_PTR(-BCH_ERR_freelist_empty); + goto err; + } + + if (waiting) + closure_wake_up(&c->freelist_wait); +alloc: + ob = likely(freespace) + ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) + : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); + + if (s.skipped_need_journal_commit * 2 > avail) + bch2_journal_flush_async(&c->journal, NULL); + + if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + freespace = false; + goto alloc; + } +err: + if (!ob) + ob = ERR_PTR(-BCH_ERR_no_buckets_found); + + if (!IS_ERR(ob)) + trace_and_count(c, bucket_alloc, ca, + bch2_watermarks[watermark], + ob->bucket, + usage->d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + &s, + cl == NULL, + ""); + else if (!bch2_err_matches(PTR_ERR(ob), BCH_ERR_transaction_restart)) + trace_and_count(c, bucket_alloc_fail, ca, + bch2_watermarks[watermark], + 0, + usage->d[BCH_DATA_free].buckets, + avail, + bch2_copygc_wait_amount(c), + c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now), + &s, + cl == NULL, + bch2_err_str(PTR_ERR(ob))); + + return ob; +} + +struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, + enum bch_watermark watermark, + struct closure *cl) +{ + struct bch_dev_usage usage; + struct open_bucket *ob; + + bch2_trans_do(c, NULL, NULL, 0, + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(&trans, ca, watermark, + cl, &usage))); + return ob; +} + +static int __dev_stripe_cmp(struct dev_stripe_state *stripe, + unsigned l, unsigned r) +{ + return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - + (stripe->next_alloc[l] < stripe->next_alloc[r])); +} + +#define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) + +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs) +{ + struct dev_alloc_list ret = { .nr = 0 }; +
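/* + * Collect the candidate devices from @devs, then sort them by each + * device's dev_stripe_state weight (lowest next_alloc first) so that + * writes are spread across devices roughly in proportion to free space: + */ +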
unsigned i; + + for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) + ret.devs[ret.nr++] = i; + + bubble_sort(ret.devs, ret.nr, dev_stripe_cmp); + return ret; +} + +static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, + struct dev_stripe_state *stripe, + struct bch_dev_usage *usage) +{ + u64 *v = stripe->next_alloc + ca->dev_idx; + u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); + u64 free_space_inv = free_space + ? div64_u64(1ULL << 48, free_space) + : 1ULL << 48; + u64 scale = *v / 4; + + if (*v + free_space_inv >= *v) + *v += free_space_inv; + else + *v = U64_MAX; + + for (v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) + *v = *v < scale ? 0 : *v - scale; +} + +void bch2_dev_stripe_increment(struct bch_dev *ca, + struct dev_stripe_state *stripe) +{ + struct bch_dev_usage usage; + + bch2_dev_usage_read_fast(ca, &usage); + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); +} + +static int add_new_bucket(struct bch_fs *c, + struct open_buckets *ptrs, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + struct open_bucket *ob) +{ + unsigned durability = + bch_dev_bkey_exists(c, ob->dev)->mi.durability; + + BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); + + __clear_bit(ob->dev, devs_may_alloc->d); + *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) + ? durability : 1; + *have_cache |= !durability; + + ob_push(c, ptrs, ob); + + if (*nr_effective >= nr_replicas) + return 1; + if (ob->ec) + return 1; + return 0; +} + +int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + struct open_buckets *ptrs, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + unsigned flags, + enum bch_data_type data_type, + enum bch_watermark watermark, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, stripe, devs_may_alloc); + unsigned dev; + struct bch_dev *ca; + int ret = -BCH_ERR_insufficient_devices; + unsigned i; + + BUG_ON(*nr_effective >= nr_replicas); + + for (i = 0; i < devs_sorted.nr; i++) { + struct bch_dev_usage usage; + struct open_bucket *ob; + + dev = devs_sorted.devs[i]; + + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + if (!ca) + continue; + + if (!ca->mi.durability && *have_cache) { + percpu_ref_put(&ca->ref); + continue; + } + + ob = bch2_bucket_alloc_trans(trans, ca, watermark, cl, &usage); + if (!IS_ERR(ob)) + bch2_dev_stripe_increment_inlined(ca, stripe, &usage); + percpu_ref_put(&ca->ref); + + if (IS_ERR(ob)) { + ret = PTR_ERR(ob); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || cl) + break; + continue; + } + + ob->data_type = data_type; + + if (add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob)) { + ret = 0; + break; + } + } + + return ret; +} + +/* Allocate from stripes: */ + +/* + * if we can't allocate a new stripe because there are already too many + * partially filled stripes, force allocating from an existing stripe even when + * it's to a device we don't want: + */ + +static int bucket_alloc_from_stripe(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + u16 target, + unsigned nr_replicas, + unsigned *nr_effective, + bool 
*have_cache, + enum bch_watermark watermark, + unsigned flags, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct dev_alloc_list devs_sorted; + struct ec_stripe_head *h; + struct open_bucket *ob; + struct bch_dev *ca; + unsigned i, ec_idx; + int ret = 0; + + if (nr_replicas < 2) + return 0; + + if (ec_open_bucket(c, ptrs)) + return 0; + + h = bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + if (IS_ERR(h)) + return PTR_ERR(h); + if (!h) + return 0; + + devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + + for (i = 0; i < devs_sorted.nr; i++) + for (ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { + if (!h->s->blocks[ec_idx]) + continue; + + ob = c->open_buckets + h->s->blocks[ec_idx]; + if (ob->dev == devs_sorted.devs[i] && + !test_and_set_bit(ec_idx, h->s->blocks_allocated)) + goto got_bucket; + } + goto out_put_head; +got_bucket: + ca = bch_dev_bkey_exists(c, ob->dev); + + ob->ec_idx = ec_idx; + ob->ec = h->s; + ec_stripe_new_get(h->s, STRIPE_REF_io); + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); +out_put_head: + bch2_ec_stripe_head_put(c, h); + return ret; +} + +/* Sector allocator */ + +static bool want_bucket(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + bool *have_cache, bool ec, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + if (!test_bit(ob->dev, devs_may_alloc->d)) + return false; + + if (ob->data_type != wp->data_type) + return false; + + if (!ca->mi.durability && + (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + return false; + + if (ec != (ob->ec != NULL)) + return false; + + return true; +} + +static int bucket_alloc_set_writepoint(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + bool ec, unsigned flags) +{ + struct open_buckets ptrs_skip = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + int ret = 0; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + if (!ret && want_bucket(c, wp, devs_may_alloc, + have_cache, ec, ob)) + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + else + ob_push(c, &ptrs_skip, ob); + } + wp->ptrs = ptrs_skip; + + return ret; +} + +static int bucket_alloc_set_partial(struct bch_fs *c, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_mask *devs_may_alloc, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, bool ec, + enum bch_watermark watermark, + unsigned flags) +{ + int i, ret = 0; + + if (!c->open_buckets_partial_nr) + return 0; + + spin_lock(&c->freelist_lock); + + if (!c->open_buckets_partial_nr) + goto unlock; + + for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { + struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; + + if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + struct bch_dev_usage usage; + u64 avail; + + bch2_dev_usage_read_fast(ca, &usage); + avail = dev_buckets_free(ca, usage, watermark); + if (!avail) + continue; + + array_remove_item(c->open_buckets_partial, + c->open_buckets_partial_nr, + i); + ob->on_partial_list = false; + + ret = add_new_bucket(c, ptrs, devs_may_alloc, + nr_replicas, nr_effective, + have_cache, flags, ob); + if (ret) + break; + } + } +unlock: + spin_unlock(&c->freelist_lock); + return 
ret; +} + +static int __open_bucket_add_buckets(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + bool erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum bch_watermark watermark, + unsigned flags, + struct closure *_cl) +{ + struct bch_fs *c = trans->c; + struct bch_devs_mask devs; + struct open_bucket *ob; + struct closure *cl = NULL; + unsigned i; + int ret; + + devs = target_rw_devs(c, wp->data_type, target); + + /* Don't allocate from devices we already have pointers to: */ + for (i = 0; i < devs_have->nr; i++) + __clear_bit(devs_have->devs[i], devs.d); + + open_bucket_for_each(c, ptrs, ob, i) + __clear_bit(ob->dev, devs.d); + + if (erasure_code && ec_open_bucket(c, ptrs)) + return 0; + + ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, flags); + if (ret) + return ret; + + ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, + nr_replicas, nr_effective, + have_cache, erasure_code, watermark, flags); + if (ret) + return ret; + + if (erasure_code) { + ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, + target, + nr_replicas, nr_effective, + have_cache, + watermark, flags, _cl); + } else { +retry_blocking: + /* + * Try nonblocking first, so that if one device is full we'll try from + * other devices: + */ + ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, + nr_replicas, nr_effective, have_cache, + flags, wp->data_type, watermark, cl); + if (ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && + !cl && _cl) { + cl = _cl; + goto retry_blocking; + } + + } + + return ret; +} + +static int open_bucket_add_buckets(struct btree_trans *trans, + struct open_buckets *ptrs, + struct write_point *wp, + struct bch_devs_list *devs_have, + u16 target, + unsigned erasure_code, + unsigned nr_replicas, + unsigned *nr_effective, + bool *have_cache, + enum bch_watermark watermark, + unsigned flags, + struct closure *cl) +{ + int ret; + + if (erasure_code) { + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, erasure_code, + nr_replicas, nr_effective, have_cache, + watermark, flags, cl); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_operation_blocked) || + bch2_err_matches(ret, BCH_ERR_freelist_empty) || + bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) + return ret; + if (*nr_effective >= nr_replicas) + return 0; + } + + ret = __open_bucket_add_buckets(trans, ptrs, wp, + devs_have, target, false, + nr_replicas, nr_effective, have_cache, + watermark, flags, cl); + return ret < 0 ? 
ret : 0; +} + +static bool should_drop_bucket(struct open_bucket *ob, struct bch_fs *c, + struct bch_dev *ca, bool ec) +{ + if (ec) { + return ob->ec != NULL; + } else if (ca) { + bool drop = ob->dev == ca->dev_idx; + struct open_bucket *ob2; + unsigned i; + + if (!drop && ob->ec) { + mutex_lock(&ob->ec->lock); + for (i = 0; i < ob->ec->new_stripe.key.v.nr_blocks; i++) { + if (!ob->ec->blocks[i]) + continue; + + ob2 = c->open_buckets + ob->ec->blocks[i]; + drop |= ob2->dev == ca->dev_idx; + } + mutex_unlock(&ob->ec->lock); + } + + return drop; + } else { + return true; + } +} + +static void bch2_writepoint_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec, struct write_point *wp) +{ + struct open_buckets ptrs = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + + mutex_lock(&wp->lock); + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (should_drop_bucket(ob, c, ca, ec)) + bch2_open_bucket_put(c, ob); + else + ob_push(c, &ptrs, ob); + wp->ptrs = ptrs; + mutex_unlock(&wp->lock); +} + +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, + bool ec) +{ + unsigned i; + + /* Next, close write points that point to this device... */ + for (i = 0; i < ARRAY_SIZE(c->write_points); i++) + bch2_writepoint_stop(c, ca, ec, &c->write_points[i]); + + bch2_writepoint_stop(c, ca, ec, &c->copygc_write_point); + bch2_writepoint_stop(c, ca, ec, &c->rebalance_write_point); + bch2_writepoint_stop(c, ca, ec, &c->btree_write_point); + + mutex_lock(&c->btree_reserve_cache_lock); + while (c->btree_reserve_cache_nr) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + bch2_open_buckets_put(c, &a->ob); + } + mutex_unlock(&c->btree_reserve_cache_lock); + + spin_lock(&c->freelist_lock); + i = 0; + while (i < c->open_buckets_partial_nr) { + struct open_bucket *ob = + c->open_buckets + c->open_buckets_partial[i]; + + if (should_drop_bucket(ob, c, ca, ec)) { + --c->open_buckets_partial_nr; + swap(c->open_buckets_partial[i], + c->open_buckets_partial[c->open_buckets_partial_nr]); + ob->on_partial_list = false; + spin_unlock(&c->freelist_lock); + bch2_open_bucket_put(c, ob); + spin_lock(&c->freelist_lock); + } else { + i++; + } + } + spin_unlock(&c->freelist_lock); + + bch2_ec_stop_dev(c, ca); +} + +static inline struct hlist_head *writepoint_hash(struct bch_fs *c, + unsigned long write_point) +{ + unsigned hash = + hash_long(write_point, ilog2(ARRAY_SIZE(c->write_points_hash))); + + return &c->write_points_hash[hash]; +} + +static struct write_point *__writepoint_find(struct hlist_head *head, + unsigned long write_point) +{ + struct write_point *wp; + + rcu_read_lock(); + hlist_for_each_entry_rcu(wp, head, node) + if (wp->write_point == write_point) + goto out; + wp = NULL; +out: + rcu_read_unlock(); + return wp; +} + +static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) +{ + u64 stranded = c->write_points_nr * c->bucket_size_max; + u64 free = bch2_fs_usage_read_short(c).free; + + return stranded * factor > free; +} + +static bool try_increase_writepoints(struct bch_fs *c) +{ + struct write_point *wp; + + if (c->write_points_nr == ARRAY_SIZE(c->write_points) || + too_many_writepoints(c, 32)) + return false; + + wp = c->write_points + c->write_points_nr++; + hlist_add_head_rcu(&wp->node, writepoint_hash(c, wp->write_point)); + return true; +} + +static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) +{ + struct bch_fs *c = trans->c; + struct write_point *wp; + struct open_bucket *ob; + unsigned i; + + 
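/* + * Retire the last write point in the array if there are still too many + * for the amount of free space; its open buckets are handed back to the + * partial list by open_bucket_free_unused(): + */ +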
mutex_lock(&c->write_points_hash_lock); + if (c->write_points_nr < old_nr) { + mutex_unlock(&c->write_points_hash_lock); + return true; + } + + if (c->write_points_nr == 1 || + !too_many_writepoints(c, 8)) { + mutex_unlock(&c->write_points_hash_lock); + return false; + } + + wp = c->write_points + --c->write_points_nr; + + hlist_del_rcu(&wp->node); + mutex_unlock(&c->write_points_hash_lock); + + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_free_unused(c, ob); + wp->ptrs.nr = 0; + mutex_unlock(&wp->lock); + return true; +} + +static struct write_point *writepoint_find(struct btree_trans *trans, + unsigned long write_point) +{ + struct bch_fs *c = trans->c; + struct write_point *wp, *oldest; + struct hlist_head *head; + + if (!(write_point & 1UL)) { + wp = (struct write_point *) write_point; + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + return wp; + } + + head = writepoint_hash(c, write_point); +restart_find: + wp = __writepoint_find(head, write_point); + if (wp) { +lock_wp: + bch2_trans_mutex_lock_norelock(trans, &wp->lock); + if (wp->write_point == write_point) + goto out; + mutex_unlock(&wp->lock); + goto restart_find; + } +restart_find_oldest: + oldest = NULL; + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; wp++) + if (!oldest || time_before64(wp->last_used, oldest->last_used)) + oldest = wp; + + bch2_trans_mutex_lock_norelock(trans, &oldest->lock); + bch2_trans_mutex_lock_norelock(trans, &c->write_points_hash_lock); + if (oldest >= c->write_points + c->write_points_nr || + try_increase_writepoints(c)) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto restart_find_oldest; + } + + wp = __writepoint_find(head, write_point); + if (wp && wp != oldest) { + mutex_unlock(&c->write_points_hash_lock); + mutex_unlock(&oldest->lock); + goto lock_wp; + } + + wp = oldest; + hlist_del_rcu(&wp->node); + wp->write_point = write_point; + hlist_add_head_rcu(&wp->node, head); + mutex_unlock(&c->write_points_hash_lock); +out: + wp->last_used = local_clock(); + return wp; +} + +/* + * Get us an open_bucket we can allocate from, return with it locked: + */ +int bch2_alloc_sectors_start_trans(struct btree_trans *trans, + unsigned target, + unsigned erasure_code, + struct write_point_specifier write_point, + struct bch_devs_list *devs_have, + unsigned nr_replicas, + unsigned nr_replicas_required, + enum bch_watermark watermark, + unsigned flags, + struct closure *cl, + struct write_point **wp_ret) +{ + struct bch_fs *c = trans->c; + struct write_point *wp; + struct open_bucket *ob; + struct open_buckets ptrs; + unsigned nr_effective, write_points_nr; + bool have_cache; + int ret; + int i; + + BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS); + + BUG_ON(!nr_replicas || !nr_replicas_required); +retry: + ptrs.nr = 0; + nr_effective = 0; + write_points_nr = c->write_points_nr; + have_cache = false; + + *wp_ret = wp = writepoint_find(trans, write_point.v); + + /* metadata may not allocate on cache devices: */ + if (wp->data_type != BCH_DATA_user) + have_cache = true; + + if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, NULL); + if (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto alloc_done; + + /* Don't retry from all devices if we're out of open buckets: */ + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) 
+ goto allocate_blocking; + + /* + * Only try to allocate cache (durability = 0 devices) from the + * specified target: + */ + have_cache = true; + + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + 0, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, cl); + } else { +allocate_blocking: + ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, + target, erasure_code, + nr_replicas, &nr_effective, + &have_cache, watermark, + flags, cl); + } +alloc_done: + BUG_ON(!ret && nr_effective < nr_replicas); + + if (erasure_code && !ec_open_bucket(c, &ptrs)) + pr_debug("failed to get ec bucket: ret %u", ret); + + if (ret == -BCH_ERR_insufficient_devices && + nr_effective >= nr_replicas_required) + ret = 0; + + if (ret) + goto err; + + /* Free buckets we didn't use: */ + open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_free_unused(c, ob); + + wp->ptrs = ptrs; + + wp->sectors_free = UINT_MAX; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + + return 0; +err: + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (ptrs.nr < ARRAY_SIZE(ptrs.v)) + ob_push(c, &ptrs, ob); + else + open_bucket_free_unused(c, ob); + wp->ptrs = ptrs; + + mutex_unlock(&wp->lock); + + if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && + try_decrease_writepoints(trans, write_points_nr)) + goto retry; + + if (bch2_err_matches(ret, BCH_ERR_open_buckets_empty) || + bch2_err_matches(ret, BCH_ERR_freelist_empty)) + return cl + ? -BCH_ERR_bucket_alloc_blocked + : -BCH_ERR_ENOSPC_bucket_alloc; + + return ret; +} + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} + +void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, + struct bkey_i *k, unsigned sectors, + bool cached) +{ + bch2_alloc_sectors_append_ptrs_inlined(c, wp, k, sectors, cached); +} + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) +{ + bch2_alloc_sectors_done_inlined(c, wp); +} + +static inline void writepoint_init(struct write_point *wp, + enum bch_data_type type) +{ + mutex_init(&wp->lock); + wp->data_type = type; + + INIT_WORK(&wp->index_update_work, bch2_write_point_do_index_updates); + INIT_LIST_HEAD(&wp->writes); + spin_lock_init(&wp->writes_lock); +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *c) +{ + struct open_bucket *ob; + struct write_point *wp; + + mutex_init(&c->write_points_hash_lock); + c->write_points_nr = ARRAY_SIZE(c->write_points); + + /* open bucket 0 is a sentinal NULL: */ + spin_lock_init(&c->open_buckets[0].lock); + + for (ob = c->open_buckets + 1; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { + spin_lock_init(&ob->lock); + c->open_buckets_nr_free++; + + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + } + + writepoint_init(&c->btree_write_point, BCH_DATA_btree); + writepoint_init(&c->rebalance_write_point, BCH_DATA_user); + writepoint_init(&c->copygc_write_point, BCH_DATA_user); + + for (wp = c->write_points; + wp < c->write_points + c->write_points_nr; 
wp++) { + writepoint_init(wp, BCH_DATA_user); + + wp->last_used = local_clock(); + wp->write_point = (unsigned long) wp; + hlist_add_head_rcu(&wp->node, + writepoint_hash(c, wp->write_point)); + } +} + +static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + unsigned data_type = ob->data_type; + barrier(); /* READ_ONCE() doesn't work on bitfields */ + + prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u", + ob - c->open_buckets, + atomic_read(&ob->pin), + data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type", + ob->dev, ob->bucket, ob->gen, + ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size); + if (ob->ec) + prt_printf(out, " ec idx %llu", ob->ec->idx); + if (ob->on_partial_list) + prt_str(out, " partial"); + prt_newline(out); +} + +void bch2_open_buckets_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct open_bucket *ob; + + out->atomic++; + + for (ob = c->open_buckets; + ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list) + bch2_open_bucket_to_text(out, c, ob); + spin_unlock(&ob->lock); + } + + --out->atomic; +} + +void bch2_open_buckets_partial_to_text(struct printbuf *out, struct bch_fs *c) +{ + unsigned i; + + out->atomic++; + spin_lock(&c->freelist_lock); + + for (i = 0; i < c->open_buckets_partial_nr; i++) + bch2_open_bucket_to_text(out, c, + c->open_buckets + c->open_buckets_partial[i]); + + spin_unlock(&c->freelist_lock); + --out->atomic; +} + +static const char * const bch2_write_point_states[] = { +#define x(n) #n, + WRITE_POINT_STATES() +#undef x + NULL +}; + +void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct write_point *wp; + unsigned i; + + for (wp = c->write_points; + wp < c->write_points + ARRAY_SIZE(c->write_points); + wp++) { + prt_printf(out, "%lu: ", wp->write_point); + prt_human_readable_u64(out, wp->sectors_allocated); + + prt_printf(out, " last wrote: "); + bch2_pr_time_units(out, sched_clock() - wp->last_used); + + for (i = 0; i < WRITE_POINT_STATE_NR; i++) { + prt_printf(out, " %s: ", bch2_write_point_states[i]); + bch2_pr_time_units(out, wp->time[i]); + } + + prt_newline(out); + } +} diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h new file mode 100644 index 000000000..fee195f7e --- /dev/null +++ b/fs/bcachefs/alloc_foreground.h @@ -0,0 +1,224 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_FOREGROUND_H +#define _BCACHEFS_ALLOC_FOREGROUND_H + +#include "bcachefs.h" +#include "alloc_types.h" +#include "extents.h" +#include "super.h" + +#include + +struct bkey; +struct bch_dev; +struct bch_fs; +struct bch_devs_List; + +extern const char * const bch2_watermarks[]; + +void bch2_reset_alloc_cursors(struct bch_fs *); + +struct dev_alloc_list { + unsigned nr; + u8 devs[BCH_SB_MEMBERS_MAX]; +}; + +struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *); +void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); + +long bch2_bucket_alloc_new_fs(struct bch_dev *); + +struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, + enum bch_watermark, struct closure *); + +static inline void ob_push(struct bch_fs *c, struct open_buckets *obs, + struct open_bucket *ob) +{ + BUG_ON(obs->nr >= ARRAY_SIZE(obs->v)); + + obs->v[obs->nr++] = ob - c->open_buckets; +} + +#define 
open_bucket_for_each(_c, _obs, _ob, _i) \ + for ((_i) = 0; \ + (_i) < (_obs)->nr && \ + ((_ob) = (_c)->open_buckets + (_obs)->v[_i], true); \ + (_i)++) + +static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, + struct open_buckets *obs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, obs, ob, i) + if (ob->ec) + return ob; + + return NULL; +} + +void bch2_open_bucket_write_error(struct bch_fs *, + struct open_buckets *, unsigned); + +void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + +static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + if (atomic_dec_and_test(&ob->pin)) + __bch2_open_bucket_put(c, ob); +} + +static inline void bch2_open_buckets_put(struct bch_fs *c, + struct open_buckets *ptrs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, ptrs, ob, i) + bch2_open_bucket_put(c, ob); + ptrs->nr = 0; +} + +static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct write_point *wp) +{ + struct open_buckets ptrs = { .nr = 0 }, keep = { .nr = 0 }; + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); + wp->ptrs = keep; + + mutex_unlock(&wp->lock); + + bch2_open_buckets_put(c, &ptrs); +} + +static inline void bch2_open_bucket_get(struct bch_fs *c, + struct write_point *wp, + struct open_buckets *ptrs) +{ + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + ob->data_type = wp->data_type; + atomic_inc(&ob->pin); + ob_push(c, ptrs, ob); + } +} + +static inline open_bucket_idx_t *open_bucket_hashslot(struct bch_fs *c, + unsigned dev, u64 bucket) +{ + return c->open_buckets_hash + + (jhash_3words(dev, bucket, bucket >> 32, 0) & + (OPEN_BUCKETS_COUNT - 1)); +} + +static inline bool bch2_bucket_is_open(struct bch_fs *c, unsigned dev, u64 bucket) +{ + open_bucket_idx_t slot = *open_bucket_hashslot(c, dev, bucket); + + while (slot) { + struct open_bucket *ob = &c->open_buckets[slot]; + + if (ob->dev == dev && ob->bucket == bucket) + return true; + + slot = ob->hash; + } + + return false; +} + +static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 bucket) +{ + bool ret; + + if (bch2_bucket_is_open(c, dev, bucket)) + return true; + + spin_lock(&c->freelist_lock); + ret = bch2_bucket_is_open(c, dev, bucket); + spin_unlock(&c->freelist_lock); + + return ret; +} + +int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, + struct dev_stripe_state *, struct bch_devs_mask *, + unsigned, unsigned *, bool *, unsigned, + enum bch_data_type, enum bch_watermark, + struct closure *); + +int bch2_alloc_sectors_start_trans(struct btree_trans *, + unsigned, unsigned, + struct write_point_specifier, + struct bch_devs_list *, + unsigned, unsigned, + enum bch_watermark, + unsigned, + struct closure *, + struct write_point **); + +struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); + +/* + * Append pointers to the space we just allocated to @k, and mark @sectors space + * as allocated out of @ob + */ +static inline void +bch2_alloc_sectors_append_ptrs_inlined(struct bch_fs *c, struct write_point *wp, + struct bkey_i *k, unsigned sectors, + bool cached) +{ + struct open_bucket *ob; + unsigned i; + + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; + wp->sectors_allocated += sectors; + + open_bucket_for_each(c, &wp->ptrs, ob, i) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + 
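/* + * bch2_ob_ptr() points at the first unwritten sector of the open + * bucket; pointers to zero durability (cache) devices, or writes the + * caller asked to be cached, are flagged as cached: + */ +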
struct bch_extent_ptr ptr = bch2_ob_ptr(c, ob); + + ptr.cached = cached || + (!ca->mi.durability && + wp->data_type == BCH_DATA_user); + + bch2_bkey_append_ptr(k, ptr); + + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; + } +} + +void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, + struct bkey_i *, unsigned, bool); +void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); + +void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *, bool); + +static inline struct write_point_specifier writepoint_hashed(unsigned long v) +{ + return (struct write_point_specifier) { .v = v | 1 }; +} + +static inline struct write_point_specifier writepoint_ptr(struct write_point *wp) +{ + return (struct write_point_specifier) { .v = (unsigned long) wp }; +} + +void bch2_fs_allocator_foreground_init(struct bch_fs *); + +void bch2_open_buckets_to_text(struct printbuf *, struct bch_fs *); +void bch2_open_buckets_partial_to_text(struct printbuf *, struct bch_fs *); + +void bch2_write_points_to_text(struct printbuf *, struct bch_fs *); + +#endif /* _BCACHEFS_ALLOC_FOREGROUND_H */ diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h new file mode 100644 index 000000000..c33a29954 --- /dev/null +++ b/fs/bcachefs/alloc_types.h @@ -0,0 +1,126 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ALLOC_TYPES_H +#define _BCACHEFS_ALLOC_TYPES_H + +#include +#include + +#include "clock_types.h" +#include "fifo.h" + +struct bucket_alloc_state { + u64 buckets_seen; + u64 skipped_open; + u64 skipped_need_journal_commit; + u64 skipped_nocow; + u64 skipped_nouse; +}; + +#define BCH_WATERMARKS() \ + x(stripe) \ + x(normal) \ + x(copygc) \ + x(btree) \ + x(btree_copygc) \ + x(reclaim) + +enum bch_watermark { +#define x(name) BCH_WATERMARK_##name, + BCH_WATERMARKS() +#undef x + BCH_WATERMARK_NR, +}; + +#define BCH_WATERMARK_BITS 3 +#define BCH_WATERMARK_MASK ~(~0 << BCH_WATERMARK_BITS) + +#define OPEN_BUCKETS_COUNT 1024 + +#define WRITE_POINT_HASH_NR 32 +#define WRITE_POINT_MAX 32 + +/* + * 0 is never a valid open_bucket_idx_t: + */ +typedef u16 open_bucket_idx_t; + +struct open_bucket { + spinlock_t lock; + atomic_t pin; + open_bucket_idx_t freelist; + open_bucket_idx_t hash; + + /* + * When an open bucket has an ec_stripe attached, this is the index of + * the block in the stripe this open_bucket corresponds to: + */ + u8 ec_idx; + enum bch_data_type data_type:6; + unsigned valid:1; + unsigned on_partial_list:1; + + u8 dev; + u8 gen; + u32 sectors_free; + u64 bucket; + struct ec_stripe_new *ec; +}; + +#define OPEN_BUCKET_LIST_MAX 15 + +struct open_buckets { + open_bucket_idx_t nr; + open_bucket_idx_t v[OPEN_BUCKET_LIST_MAX]; +}; + +struct dev_stripe_state { + u64 next_alloc[BCH_SB_MEMBERS_MAX]; +}; + +#define WRITE_POINT_STATES() \ + x(stopped) \ + x(waiting_io) \ + x(waiting_work) \ + x(running) + +enum write_point_state { +#define x(n) WRITE_POINT_##n, + WRITE_POINT_STATES() +#undef x + WRITE_POINT_STATE_NR +}; + +struct write_point { + struct { + struct hlist_node node; + struct mutex lock; + u64 last_used; + unsigned long write_point; + enum bch_data_type data_type; + + /* calculated based on how many pointers we're actually going to use: */ + unsigned sectors_free; + + struct open_buckets ptrs; + struct dev_stripe_state stripe; + + u64 sectors_allocated; + } __attribute__((__aligned__(SMP_CACHE_BYTES))); + + struct { + struct work_struct index_update_work; + + struct list_head writes; + spinlock_t writes_lock; + + enum write_point_state 
state; + u64 last_state_change; + u64 time[WRITE_POINT_STATE_NR]; + } __attribute__((__aligned__(SMP_CACHE_BYTES))); +}; + +struct write_point_specifier { + unsigned long v; +}; + +#endif /* _BCACHEFS_ALLOC_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c new file mode 100644 index 000000000..d412bae55 --- /dev/null +++ b/fs/bcachefs/backpointers.c @@ -0,0 +1,889 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bbpos.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "btree_cache.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "error.h" + +#include + +static bool extent_matches_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct bpos bucket, + struct bch_backpointer bp) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket2; + struct bch_backpointer bp2; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, btree_id, level, k, p, + &bucket2, &bp2); + if (bpos_eq(bucket, bucket2) && + !memcmp(&bp, &bp2, sizeof(bp))) + return true; + } + + return false; +} + +int bch2_backpointer_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); + struct bpos bucket = bp_pos_to_bucket(c, bp.k->p); + + if (!bpos_eq(bp.k->p, bucket_pos_to_bp(c, bucket, bp.v->bucket_offset))) { + prt_str(err, "backpointer at wrong pos"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer *bp) +{ + prt_printf(out, "btree=%s l=%u offset=%llu:%u len=%u pos=", + bch2_btree_ids[bp->btree_id], + bp->level, + (u64) (bp->bucket_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT), + (u32) bp->bucket_offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), + bp->bucket_len); + bch2_bpos_to_text(out, bp->pos); +} + +void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + prt_str(out, "bucket="); + bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p)); + prt_str(out, " "); + + bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v); +} + +void bch2_backpointer_swab(struct bkey_s k) +{ + struct bkey_s_backpointer bp = bkey_s_to_backpointer(k); + + bp.v->bucket_offset = swab32(bp.v->bucket_offset); + bp.v->bucket_len = swab32(bp.v->bucket_len); + bch2_bpos_swab(&bp.v->pos); +} + +static noinline int backpointer_mod_err(struct btree_trans *trans, + struct bch_backpointer bp, + struct bkey_s_c bp_k, + struct bkey_s_c orig_k, + bool insert) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + if (insert) { + prt_printf(&buf, "existing backpointer found when inserting "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "found "); + bch2_bkey_val_to_text(&buf, c, bp_k); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + bch_err(c, "%s", buf.buf); + } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + prt_printf(&buf, "backpointer not found when deleting"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + prt_printf(&buf, "searching for "); + bch2_backpointer_to_text(&buf, &bp); + prt_newline(&buf); + + prt_printf(&buf, "got "); + bch2_bkey_val_to_text(&buf, 
c, bp_k); + prt_newline(&buf); + + prt_printf(&buf, "for "); + bch2_bkey_val_to_text(&buf, c, orig_k); + + bch_err(c, "%s", buf.buf); + } + + printbuf_exit(&buf); + + if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + bch2_inconsistent_error(c); + return -EIO; + } else { + return 0; + } +} + +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, + struct bpos bucket, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + bool insert) +{ + struct bch_fs *c = trans->c; + struct bkey_i_backpointer *bp_k; + struct btree_iter bp_iter; + struct bkey_s_c k; + int ret; + + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + + k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bp_k->k.p, + BTREE_ITER_INTENT| + BTREE_ITER_SLOTS| + BTREE_ITER_WITH_UPDATES); + ret = bkey_err(k); + if (ret) + goto err; + + if (insert + ? k.k->type + : (k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(k).v, &bp, sizeof(bp)))) { + ret = backpointer_mod_err(trans, bp, k, orig_k, insert); + if (ret) + goto err; + } + + ret = bch2_trans_update(trans, &bp_iter, &bp_k->k_i, 0); +err: + bch2_trans_iter_exit(trans, &bp_iter); + return ret; +} + +/* + * Find the next backpointer >= *bp_offset: + */ +int bch2_get_next_backpointer(struct btree_trans *trans, + struct bpos bucket, int gen, + struct bpos *bp_pos, + struct bch_backpointer *bp, + unsigned iter_flags) +{ + struct bch_fs *c = trans->c; + struct bpos bp_end_pos = bucket_pos_to_bp(c, bpos_nosnap_successor(bucket), 0); + struct btree_iter alloc_iter = { NULL }, bp_iter = { NULL }; + struct bkey_s_c k; + int ret = 0; + + if (bpos_ge(*bp_pos, bp_end_pos)) + goto done; + + if (gen >= 0) { + k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED|iter_flags); + ret = bkey_err(k); + if (ret) + goto out; + + if (k.k->type != KEY_TYPE_alloc_v4 || + bkey_s_c_to_alloc_v4(k).v->gen != gen) + goto done; + } + + *bp_pos = bpos_max(*bp_pos, bucket_pos_to_bp(c, bucket, 0)); + + for_each_btree_key_norestart(trans, bp_iter, BTREE_ID_backpointers, + *bp_pos, iter_flags, k, ret) { + if (bpos_ge(k.k->p, bp_end_pos)) + break; + + *bp_pos = k.k->p; + *bp = *bkey_s_c_to_backpointer(k).v; + goto out; + } +done: + *bp_pos = SPOS_MAX; +out: + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +static void backpointer_not_found(struct btree_trans *trans, + struct bpos bp_pos, + struct bch_backpointer bp, + struct bkey_s_c k, + const char *thing_it_points_to) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + + if (likely(!bch2_backpointers_no_use_write_buffer)) + return; + + prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", + thing_it_points_to); + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, "\n "); + + prt_printf(&buf, "backpointer pos: "); + bch2_bpos_to_text(&buf, bp_pos); + prt_printf(&buf, "\n "); + + bch2_backpointer_to_text(&buf, &bp); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, k); + if (c->curr_recovery_pass >= BCH_RECOVERY_PASS_check_extents_to_backpointers) + bch_err_ratelimited(c, "%s", 
buf.buf); + else + bch2_trans_inconsistent(trans, "%s", buf.buf); + + printbuf_exit(&buf); +} + +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bp_pos, + struct bch_backpointer bp, + unsigned iter_flags) +{ + struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, bp.btree_id); + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + struct bkey_s_c k; + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, + min(bp.level, r->level), + iter_flags); + k = bch2_btree_iter_peek_slot(iter); + if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } + + if (bp.level == r->level + 1) + k = bkey_i_to_s_c(&r->key); + + if (k.k && extent_matches_bp(c, bp.btree_id, bp.level, k, bucket, bp)) + return k; + + bch2_trans_iter_exit(trans, iter); + + if (unlikely(bch2_backpointers_no_use_write_buffer)) { + if (bp.level) { + struct btree *b; + + /* + * If a backpointer for a btree node wasn't found, it may be + * because it was overwritten by a new btree node that hasn't + * been written out yet - backpointer_get_node() checks for + * this: + */ + b = bch2_backpointer_get_node(trans, iter, bp_pos, bp); + if (!IS_ERR_OR_NULL(b)) + return bkey_i_to_s_c(&b->key); + + bch2_trans_iter_exit(trans, iter); + + if (IS_ERR(b)) + return bkey_s_c_err(PTR_ERR(b)); + return bkey_s_c_null; + } + + backpointer_not_found(trans, bp_pos, bp, k, "extent"); + } + + return bkey_s_c_null; +} + +struct btree *bch2_backpointer_get_node(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bp_pos, + struct bch_backpointer bp) +{ + struct bch_fs *c = trans->c; + struct bpos bucket = bp_pos_to_bucket(c, bp_pos); + struct btree *b; + + BUG_ON(!bp.level); + + bch2_trans_node_iter_init(trans, iter, + bp.btree_id, + bp.pos, + 0, + bp.level - 1, + 0); + b = bch2_btree_iter_peek_node(iter); + if (IS_ERR(b)) + goto err; + + if (b && extent_matches_bp(c, bp.btree_id, bp.level, + bkey_i_to_s_c(&b->key), + bucket, bp)) + return b; + + if (b && btree_node_will_make_reachable(b)) { + b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + } else { + backpointer_not_found(trans, bp_pos, bp, + bkey_i_to_s_c(&b->key), "btree node"); + b = NULL; + } +err: + bch2_trans_iter_exit(trans, iter); + return b; +} + +static int bch2_check_btree_backpointer(struct btree_trans *trans, struct btree_iter *bp_iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct btree_iter alloc_iter = { NULL }; + struct bch_dev *ca; + struct bkey_s_c alloc_k; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (fsck_err_on(!bch2_dev_exists2(c, k.k->p.inode), c, + "backpointer for missing device:\n%s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } + + ca = bch_dev_bkey_exists(c, k.k->p.inode); + + alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, BTREE_ID_alloc, + bp_pos_to_bucket(c, k.k->p), 0); + ret = bkey_err(alloc_k); + if (ret) + goto out; + + if (fsck_err_on(alloc_k.k->type != KEY_TYPE_alloc_v4, c, + "backpointer for nonexistent alloc key: %llu:%llu:0\n%s", + alloc_iter.pos.inode, alloc_iter.pos.offset, + (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { + ret = bch2_btree_delete_at(trans, bp_iter, 0); + goto out; + } +out: +fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); + printbuf_exit(&buf); + return ret; +} + +/* verify that every backpointer has a corresponding alloc key */ +int bch2_check_btree_backpointers(struct bch_fs *c) +{ + struct
btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_backpointers, POS_MIN, 0, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + bch2_check_btree_backpointer(&trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +struct bpos_level { + unsigned level; + struct bpos pos; +}; + +static int check_bp_exists(struct btree_trans *trans, + struct bpos bucket, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + struct bpos bucket_start, + struct bpos bucket_end, + struct bpos_level *last_flushed) +{ + struct bch_fs *c = trans->c; + struct btree_iter bp_iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c bp_k; + int ret; + + if (bpos_lt(bucket, bucket_start) || + bpos_gt(bucket, bucket_end)) + return 0; + + if (!bch2_dev_bucket_exists(c, bucket)) + goto missing; + + bp_k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp(c, bucket, bp.bucket_offset), + 0); + ret = bkey_err(bp_k); + if (ret) + goto err; + + if (bp_k.k->type != KEY_TYPE_backpointer || + memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) { + if (last_flushed->level != bp.level || + !bpos_eq(last_flushed->pos, orig_k.k->p)) { + last_flushed->level = bp.level; + last_flushed->pos = orig_k.k->p; + + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + goto missing; + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + return ret; +missing: + prt_printf(&buf, "missing backpointer for btree=%s l=%u ", + bch2_btree_ids[bp.btree_id], bp.level); + bch2_bkey_val_to_text(&buf, c, orig_k); + prt_printf(&buf, "\nbp pos "); + bch2_bpos_to_text(&buf, bp_iter.pos); + + if (c->sb.version < bcachefs_metadata_version_backpointers || + c->opts.reconstruct_alloc || + fsck_err(c, "%s", buf.buf)) + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true); + + goto out; +} + +static int check_extent_to_backpointers(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos bucket_start, + struct bpos bucket_end, + struct bpos_level *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bkey_s_c k; + int ret; + + k = bch2_btree_iter_peek_all_levels(iter); + ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) + return 0; + + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k, + bucket_start, bucket_end, + last_flushed); + if (ret) + return ret; + } + + return 0; +} + +static int check_btree_root_to_backpointers(struct btree_trans *trans, + enum btree_id btree_id, + struct bpos bucket_start, + struct bpos bucket_end, + struct bpos_level *last_flushed) +{ + struct bch_fs *c = trans->c; + struct btree_root *r = bch2_btree_id_root(c, btree_id); + struct btree_iter iter; + struct btree *b; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + int ret; + + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + BUG_ON(b != btree_node_root(c, b)); + + 
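/* + * The btree root is not referenced by a key in any parent node, so its + * own key has to be checked against the backpointers btree here: + */ +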
k = bkey_i_to_s_c(&b->key); + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + struct bpos bucket_pos; + struct bch_backpointer bp; + + if (p.ptr.cached) + continue; + + bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1, + k, p, &bucket_pos, &bp); + + ret = check_bp_exists(trans, bucket_pos, bp, k, + bucket_start, bucket_end, + last_flushed); + if (ret) + goto err; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) +{ + return (struct bbpos) { + .btree = bp.btree_id, + .pos = bp.pos, + }; +} + +static size_t btree_nodes_fit_in_ram(struct bch_fs *c) +{ + struct sysinfo i; + u64 mem_bytes; + + si_meminfo(&i); + mem_bytes = i.totalram * i.mem_unit; + return div_u64(mem_bytes >> 1, btree_bytes(c)); +} + +static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, + unsigned btree_leaf_mask, + unsigned btree_interior_mask, + struct bbpos start, struct bbpos *end) +{ + struct btree_iter iter; + struct bkey_s_c k; + size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); + enum btree_id btree; + int ret = 0; + + for (btree = start.btree; btree < BTREE_ID_NR && !ret; btree++) { + unsigned depth = ((1U << btree) & btree_leaf_mask) ? 1 : 2; + + if (!((1U << btree) & btree_leaf_mask) && + !((1U << btree) & btree_interior_mask)) + continue; + + bch2_trans_node_iter_init(trans, &iter, btree, + btree == start.btree ? start.pos : POS_MIN, + 0, depth, 0); + /* + * for_each_btree_key_continue() doesn't check the return value + * from bch2_btree_iter_advance(), which is needed when + * iterating over interior nodes where we'll see keys at + * SPOS_MAX: + */ + do { + k = __bch2_btree_iter_peek_and_restart(trans, &iter, 0); + ret = bkey_err(k); + if (!k.k || ret) + break; + + --btree_nodes; + if (!btree_nodes) { + *end = BBPOS(btree, k.k->p); + bch2_trans_iter_exit(trans, &iter); + return 0; + } + } while (bch2_btree_iter_advance(&iter)); + bch2_trans_iter_exit(trans, &iter); + } + + *end = BBPOS_MAX; + return ret; +} + +static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, + struct bpos bucket_start, + struct bpos bucket_end) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + enum btree_id btree_id; + struct bpos_level last_flushed = { UINT_MAX }; + int ret = 0; + + for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) { + unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1; + + bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, + depth, + BTREE_ITER_ALL_LEVELS| + BTREE_ITER_PREFETCH); + + do { + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_extent_to_backpointers(trans, &iter, + bucket_start, bucket_end, + &last_flushed)); + if (ret) + break; + } while (!bch2_btree_iter_advance(&iter)); + + bch2_trans_iter_exit(trans, &iter); + + if (ret) + break; + + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + check_btree_root_to_backpointers(trans, btree_id, + bucket_start, bucket_end, + &last_flushed)); + if (ret) + break; + } + return ret; +} + +static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c, + struct bpos bucket) +{ + return bch2_dev_exists2(c, bucket.inode) + ?
bucket_pos_to_bp(c, bucket, 0) + : bucket; +} + +static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans, + struct bpos start, struct bpos *end) +{ + struct btree_iter alloc_iter; + struct btree_iter bp_iter; + struct bkey_s_c alloc_k, bp_k; + size_t btree_nodes = btree_nodes_fit_in_ram(trans->c); + bool alloc_end = false, bp_end = false; + int ret = 0; + + bch2_trans_node_iter_init(trans, &alloc_iter, BTREE_ID_alloc, + start, 0, 1, 0); + bch2_trans_node_iter_init(trans, &bp_iter, BTREE_ID_backpointers, + bucket_pos_to_bp_safe(trans->c, start), 0, 1, 0); + while (1) { + alloc_k = !alloc_end + ? __bch2_btree_iter_peek_and_restart(trans, &alloc_iter, 0) + : bkey_s_c_null; + bp_k = !bp_end + ? __bch2_btree_iter_peek_and_restart(trans, &bp_iter, 0) + : bkey_s_c_null; + + ret = bkey_err(alloc_k) ?: bkey_err(bp_k); + if ((!alloc_k.k && !bp_k.k) || ret) { + *end = SPOS_MAX; + break; + } + + --btree_nodes; + if (!btree_nodes) { + *end = alloc_k.k->p; + break; + } + + if (bpos_lt(alloc_iter.pos, SPOS_MAX) && + bpos_lt(bucket_pos_to_bp_safe(trans->c, alloc_iter.pos), bp_iter.pos)) { + if (!bch2_btree_iter_advance(&alloc_iter)) + alloc_end = true; + } else { + if (!bch2_btree_iter_advance(&bp_iter)) + bp_end = true; + } + } + bch2_trans_iter_exit(trans, &bp_iter); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +int bch2_check_extents_to_backpointers(struct bch_fs *c) +{ + struct btree_trans trans; + struct bpos start = POS_MIN, end; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + while (1) { + ret = bch2_get_alloc_in_memory_pos(&trans, start, &end); + if (ret) + break; + + if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX)) + bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass", + __func__, btree_nodes_fit_in_ram(c)); + + if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "check_extents_to_backpointers(): "); + bch2_bpos_to_text(&buf, start); + prt_str(&buf, "-"); + bch2_bpos_to_text(&buf, end); + + bch_verbose(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + ret = bch2_check_extents_to_backpointers_pass(&trans, start, end); + if (ret || bpos_eq(end, SPOS_MAX)) + break; + + start = bpos_successor(end); + } + bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int check_one_backpointer(struct btree_trans *trans, + struct bbpos start, + struct bbpos end, + struct bkey_s_c_backpointer bp, + struct bpos *last_flushed_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bbpos pos = bp_to_bbpos(*bp.v); + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + int ret; + + if (bbpos_cmp(pos, start) < 0 || + bbpos_cmp(pos, end) > 0) + return 0; + + k = bch2_backpointer_get_key(trans, &iter, bp.k->p, *bp.v, 0); + ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + + if (!k.k && !bpos_eq(*last_flushed_pos, bp.k->p)) { + *last_flushed_pos = bp.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + + if (fsck_err_on(!k.k, c, + "backpointer for missing extent\n %s", + (bch2_bkey_val_to_text(&buf, c, bp.s_c), buf.buf))) { + ret = bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, bp.k->p); + goto out; + } +out: +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +static int bch2_check_backpointers_to_extents_pass(struct btree_trans 
*trans, + struct bbpos start, + struct bbpos end) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bpos last_flushed_pos = SPOS_MAX; + + return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers, + POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_one_backpointer(trans, start, end, + bkey_s_c_to_backpointer(k), + &last_flushed_pos)); +} + +int bch2_check_backpointers_to_extents(struct bch_fs *c) +{ + struct btree_trans trans; + struct bbpos start = (struct bbpos) { .btree = 0, .pos = POS_MIN, }, end; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + while (1) { + ret = bch2_get_btree_in_memory_pos(&trans, + (1U << BTREE_ID_extents)| + (1U << BTREE_ID_reflink), + ~0, + start, &end); + if (ret) + break; + + if (!bbpos_cmp(start, BBPOS_MIN) && + bbpos_cmp(end, BBPOS_MAX)) + bch_verbose(c, "%s(): extents do not fit in ram, running in multiple passes with %zu nodes per pass", + __func__, btree_nodes_fit_in_ram(c)); + + if (bbpos_cmp(start, BBPOS_MIN) || + bbpos_cmp(end, BBPOS_MAX)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "check_backpointers_to_extents(): "); + bch2_bbpos_to_text(&buf, start); + prt_str(&buf, "-"); + bch2_bbpos_to_text(&buf, end); + + bch_verbose(c, "%s", buf.buf); + printbuf_exit(&buf); + } + + ret = bch2_check_backpointers_to_extents_pass(&trans, start, end); + if (ret || !bbpos_cmp(end, BBPOS_MAX)) + break; + + start = bbpos_successor(end); + } + bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h new file mode 100644 index 000000000..87e31aa19 --- /dev/null +++ b/fs/bcachefs/backpointers.h @@ -0,0 +1,131 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H + +#include "btree_iter.h" +#include "btree_update.h" +#include "buckets.h" +#include "super.h" + +int bch2_backpointer_invalid(const struct bch_fs *, struct bkey_s_c k, + enum bkey_invalid_flags, struct printbuf *); +void bch2_backpointer_to_text(struct printbuf *, const struct bch_backpointer *); +void bch2_backpointer_k_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_backpointer_swab(struct bkey_s); + +#define bch2_bkey_ops_backpointer ((struct bkey_ops) { \ + .key_invalid = bch2_backpointer_invalid, \ + .val_to_text = bch2_backpointer_k_to_text, \ + .swab = bch2_backpointer_swab, \ + .min_val_size = 32, \ +}) + +#define MAX_EXTENT_COMPRESS_RATIO_SHIFT 10 + +/* + * Convert from pos in backpointer btree to pos of corresponding bucket in alloc + * btree: + */ +static inline struct bpos bp_pos_to_bucket(const struct bch_fs *c, + struct bpos bp_pos) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bp_pos.inode); + u64 bucket_sector = bp_pos.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; + + return POS(bp_pos.inode, sector_to_bucket(ca, bucket_sector)); +} + +/* + * Convert from pos in alloc btree + bucket offset to pos in backpointer btree: + */ +static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c, + struct bpos bucket, + u64 bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, bucket.inode); + struct bpos ret; + + ret = POS(bucket.inode, + (bucket_to_sector(ca, bucket.offset) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) + bucket_offset); + + EBUG_ON(!bkey_eq(bucket, bp_pos_to_bucket(c, ret))); + + return ret; +} + +int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos, + struct bch_backpointer, struct 
bkey_s_c, bool); + +static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, + struct bpos bucket, + struct bch_backpointer bp, + struct bkey_s_c orig_k, + bool insert) +{ + struct bch_fs *c = trans->c; + struct bkey_i_backpointer *bp_k; + int ret; + + if (unlikely(bch2_backpointers_no_use_write_buffer)) + return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert); + + bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer)); + ret = PTR_ERR_OR_ZERO(bp_k); + if (ret) + return ret; + + bkey_backpointer_init(&bp_k->k_i); + bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset); + bp_k->v = bp; + + if (!insert) { + bp_k->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&bp_k->k, 0); + } + + return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i); +} + +static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p) +{ + return level ? BCH_DATA_btree : + p.has_ec ? BCH_DATA_stripe : + BCH_DATA_user; +} + +static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + struct bpos *bucket_pos, struct bch_backpointer *bp) +{ + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + s64 sectors = level ? btree_sectors(c) : k.k->size; + u32 bucket_offset; + + *bucket_pos = PTR_BUCKET_POS_OFFSET(c, &p.ptr, &bucket_offset); + *bp = (struct bch_backpointer) { + .btree_id = btree_id, + .level = level, + .data_type = data_type, + .bucket_offset = ((u64) bucket_offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + + p.crc.offset, + .bucket_len = ptr_disk_sectors(sectors, p), + .pos = k.k->p, + }; +} + +int bch2_get_next_backpointer(struct btree_trans *, struct bpos, int, + struct bpos *, struct bch_backpointer *, unsigned); +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct btree_iter *, + struct bpos, struct bch_backpointer, + unsigned); +struct btree *bch2_backpointer_get_node(struct btree_trans *, struct btree_iter *, + struct bpos, struct bch_backpointer); + +int bch2_check_btree_backpointers(struct bch_fs *); +int bch2_check_extents_to_backpointers(struct bch_fs *); +int bch2_check_backpointers_to_extents(struct bch_fs *); + +#endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bbpos.h b/fs/bcachefs/bbpos.h new file mode 100644 index 000000000..1fbed1f83 --- /dev/null +++ b/fs/bcachefs/bbpos.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BBPOS_H +#define _BCACHEFS_BBPOS_H + +#include "bkey_methods.h" + +struct bbpos { + enum btree_id btree; + struct bpos pos; +}; + +static inline struct bbpos BBPOS(enum btree_id btree, struct bpos pos) +{ + return (struct bbpos) { btree, pos }; +} + +#define BBPOS_MIN BBPOS(0, POS_MIN) +#define BBPOS_MAX BBPOS(BTREE_ID_NR - 1, POS_MAX) + +static inline int bbpos_cmp(struct bbpos l, struct bbpos r) +{ + return cmp_int(l.btree, r.btree) ?: bpos_cmp(l.pos, r.pos); +} + +static inline struct bbpos bbpos_successor(struct bbpos pos) +{ + if (bpos_cmp(pos.pos, SPOS_MAX)) { + pos.pos = bpos_successor(pos.pos); + return pos; + } + + if (pos.btree != BTREE_ID_NR) { + pos.btree++; + pos.pos = POS_MIN; + return pos; + } + + BUG(); +} + +static inline void bch2_bbpos_to_text(struct printbuf *out, struct bbpos pos) +{ + prt_str(out, bch2_btree_ids[pos.btree]); + prt_char(out, ':'); + bch2_bpos_to_text(out, pos.pos); +} + +#endif /* _BCACHEFS_BBPOS_H */ 
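A standalone sketch (not part of the patch itself): the arithmetic behind bucket_pos_to_bp()/bp_pos_to_bucket() in backpointers.h above shifts the bucket's starting sector left by MAX_EXTENT_COMPRESS_RATIO_SHIFT and adds the within-bucket offset, so the owning bucket is recoverable by shifting back down. The sketch assumes a hypothetical fixed bucket size (BUCKET_SECTORS) in place of the per-device bch_member info the real helpers read, and uses plain integers instead of struct bpos:

/*
 * Standalone illustration of the bucket <-> backpointer position encoding.
 * BUCKET_SECTORS is an assumed constant; the real code gets the bucket size
 * from the device's member info (ca->mi).
 */
#include <assert.h>
#include <stdint.h>

#define MAX_EXTENT_COMPRESS_RATIO_SHIFT	10
#define BUCKET_SECTORS			1024ULL	/* assumed bucket size, in 512-byte sectors */

/* cf. bucket_pos_to_bp(): bucket number + offset within bucket -> backpointer btree offset */
static uint64_t bucket_to_bp_offset(uint64_t bucket, uint64_t bucket_offset)
{
	return ((bucket * BUCKET_SECTORS) << MAX_EXTENT_COMPRESS_RATIO_SHIFT) +
		bucket_offset;
}

/* cf. bp_pos_to_bucket(): backpointer btree offset -> bucket number */
static uint64_t bp_offset_to_bucket(uint64_t bp_offset)
{
	return (bp_offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT) / BUCKET_SECTORS;
}

int main(void)
{
	uint64_t bucket = 42, offset = 7;

	/*
	 * Round trip: as long as bucket_offset < (BUCKET_SECTORS <<
	 * MAX_EXTENT_COMPRESS_RATIO_SHIFT), the bucket number survives the
	 * encode/decode, which is what lets the backpointer btree be keyed
	 * by a single offset per device.
	 */
	assert(bp_offset_to_bucket(bucket_to_bp_offset(bucket, offset)) == bucket);
	return 0;
}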
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h new file mode 100644 index 000000000..445d010c8 --- /dev/null +++ b/fs/bcachefs/bcachefs.h @@ -0,0 +1,1185 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_H +#define _BCACHEFS_H + +/* + * SOME HIGH LEVEL CODE DOCUMENTATION: + * + * Bcache mostly works with cache sets, cache devices, and backing devices. + * + * Support for multiple cache devices hasn't quite been finished off yet, but + * it's about 95% plumbed through. A cache set and its cache devices are sort of + * like a md raid array and its component devices. Most of the code doesn't care + * about individual cache devices; the main abstraction is the cache set. + * + * Multiple cache devices are intended to give us the ability to mirror dirty + * cached data and metadata, without mirroring clean cached data. + * + * Backing devices are different, in that they have a lifetime independent of a + * cache set. When you register a newly formatted backing device it'll come up + * in passthrough mode, and then you can attach and detach a backing device from + * a cache set at runtime - while it's mounted and in use. Detaching implicitly + * invalidates any cached data for that backing device. + * + * A cache set can have multiple (many) backing devices attached to it. + * + * There's also flash only volumes - this is the reason for the distinction + * between struct cached_dev and struct bcache_device. A flash only volume + * works much like a bcache device that has a backing device, except the + * "cached" data is always dirty. The end result is that we get thin + * provisioning with very little additional code. + * + * Flash only volumes work but they're not production ready because the moving + * garbage collector needs more work. More on that later. + * + * BUCKETS/ALLOCATION: + * + * Bcache is primarily designed for caching, which means that in normal + * operation all of our available space will be allocated. Thus, we need an + * efficient way of deleting things from the cache so we can write new things to + * it. + * + * To do this, we first divide the cache device up into buckets. A bucket is the + * unit of allocation; they're typically around 1 mb - anywhere from 128k to 2M+ + * works efficiently. + * + * Each bucket has a 16 bit priority, and an 8 bit generation associated with + * it. The gens and priorities for all the buckets are stored contiguously and + * packed on disk (in a linked list of buckets - aside from the superblock, all + * of bcache's metadata is stored in buckets). + * + * The priority is used to implement an LRU. We reset a bucket's priority when + * we allocate it or on a cache hit, and every so often we decrement the priority + * of each bucket. It could be used to implement something more sophisticated, + * if anyone ever gets around to it. + * + * The generation is used for invalidating buckets. Each pointer also has an 8 + * bit generation embedded in it; for a pointer to be considered valid, its gen + * must match the gen of the bucket it points into. Thus, to reuse a bucket all + * we have to do is increment its gen (and write its new gen to disk; we batch + * this up). + * + * Bcache is entirely COW - we never write twice to a bucket, even buckets that + * contain metadata (including btree nodes). + * + * THE BTREE: + * + * Bcache is in large part designed around the btree. + * + * At a high level, the btree is just an index of key -> ptr tuples. + * + * Keys represent extents, and thus have a size field.
Keys also have a variable + * number of pointers attached to them (potentially zero, which is handy for + * invalidating the cache). + * + * The key itself is an inode:offset pair. The inode number corresponds to a + * backing device or a flash only volume. The offset is the ending offset of the + * extent within the inode - not the starting offset; this makes lookups + * slightly more convenient. + * + * Pointers contain the cache device id, the offset on that device, and an 8 bit + * generation number. More on the gen later. + * + * Index lookups are not fully abstracted - cache lookups in particular are + * still somewhat mixed in with the btree code, but things are headed in that + * direction. + * + * Updates are fairly well abstracted, though. There are two different ways of + * updating the btree; insert and replace. + * + * BTREE_INSERT will just take a list of keys and insert them into the btree - + * overwriting (possibly only partially) any extents they overlap with. This is + * used to update the index after a write. + * + * BTREE_REPLACE is really cmpxchg(); it inserts a key into the btree iff it is + * overwriting a key that matches another given key. This is used for inserting + * data into the cache after a cache miss, and for background writeback, and for + * the moving garbage collector. + * + * There is no "delete" operation; deleting things from the index is + * accomplished either by invalidating pointers (by incrementing a bucket's + * gen) or by inserting a key with 0 pointers - which will overwrite anything + * previously present at that location in the index. + * + * This means that there are always stale/invalid keys in the btree. They're + * filtered out by the code that iterates through a btree node, and removed when + * a btree node is rewritten. + * + * BTREE NODES: + * + * Our unit of allocation is a bucket, and we can't arbitrarily allocate and + * free smaller than a bucket - so, that's how big our btree nodes are. + * + * (If buckets are really big we'll only use part of the bucket for a btree node + * - no less than 1/4th - but a bucket still contains no more than a single + * btree node. I'd actually like to change this, but for now we rely on the + * bucket's gen for deleting btree nodes when we rewrite/split a node.) + * + * Anyways, btree nodes are big - big enough to be inefficient with a textbook + * btree implementation. + * + * The way this is solved is that btree nodes are internally log structured; we + * can append new keys to an existing btree node without rewriting it. This + * means each set of keys we write is sorted, but the node is not. + * + * We maintain this log structure in memory - keeping 1Mb of keys sorted would + * be expensive, and we have to distinguish between the keys we have written and + * the keys we haven't. So to do a lookup in a btree node, we have to search + * each sorted set. But we do merge written sets together lazily, so the cost of + * these extra searches is quite low (normally most of the keys in a btree node + * will be in one big set, and then there'll be one or two sets that are much + * smaller). + * + * This log structure makes bcache's btree more of a hybrid between a + * conventional btree and a compacting data structure, with some of the + * advantages of both. + * + * GARBAGE COLLECTION: + * + * We can't just invalidate any bucket - it might contain dirty data or + * metadata.
If it once contained dirty data, other writes might overwrite it + * later, leaving no valid pointers into that bucket in the index. + * + * Thus, the primary purpose of garbage collection is to find buckets to reuse. + * It also counts how much valid data each bucket currently contains, so that + * allocation can reuse buckets sooner when they've been mostly overwritten. + * + * It also does some things that are really internal to the btree + * implementation. If a btree node contains pointers that are stale by more than + * some threshold, it rewrites the btree node to avoid the bucket's generation + * wrapping around. It also merges adjacent btree nodes if they're empty enough. + * + * THE JOURNAL: + * + * Bcache's journal is not necessary for consistency; we always strictly + * order metadata writes so that the btree and everything else is consistent on + * disk in the event of an unclean shutdown, and in fact bcache had writeback + * caching (with recovery from unclean shutdown) before journalling was + * implemented. + * + * Rather, the journal is purely a performance optimization; we can't complete a + * write until we've updated the index on disk, otherwise the cache would be + * inconsistent in the event of an unclean shutdown. This means that without the + * journal, on random write workloads we constantly have to update all the leaf + * nodes in the btree, and those writes will be mostly empty (appending at most + * a few keys each) - highly inefficient in terms of amount of metadata writes, + * and it puts more strain on the various btree resorting/compacting code. + * + * The journal is just a log of keys we've inserted; on startup we just reinsert + * all the keys in the open journal entries. That means that when we're updating + * a node in the btree, we can wait until a 4k block of keys fills up before + * writing them out. + * + * For simplicity, we only journal updates to leaf nodes; updates to parent + * nodes are rare enough (since our leaf nodes are huge) that it wasn't worth + * the complexity to deal with journalling them (in particular, journal replay) + * - updates to non leaf nodes just happen synchronously (see btree_split()). + */ + +#undef pr_fmt +#ifdef __KERNEL__ +#define pr_fmt(fmt) "bcachefs: %s() " fmt "\n", __func__ +#else +#define pr_fmt(fmt) "%s() " fmt "\n", __func__ +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bcachefs_format.h" +#include "errcode.h" +#include "fifo.h" +#include "nocow_locking_types.h" +#include "opts.h" +#include "seqmutex.h" +#include "util.h" + +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCH_WRITE_REF_DEBUG +#endif + +#ifndef dynamic_fault +#define dynamic_fault(...) 0 +#endif + +#define race_fault(...) dynamic_fault("bcachefs:race") + +#define trace_and_count(_c, _name, ...)
\ +do { \ + this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \ + trace_##_name(__VA_ARGS__); \ +} while (0) + +#define bch2_fs_init_fault(name) \ + dynamic_fault("bcachefs:bch_fs_init:" name) +#define bch2_meta_read_fault(name) \ + dynamic_fault("bcachefs:meta:read:" name) +#define bch2_meta_write_fault(name) \ + dynamic_fault("bcachefs:meta:write:" name) + +#ifdef __KERNEL__ +#define BCACHEFS_LOG_PREFIX +#endif + +#ifdef BCACHEFS_LOG_PREFIX + +#define bch2_log_msg(_c, fmt) "bcachefs (%s): " fmt, ((_c)->name) +#define bch2_fmt_dev(_ca, fmt) "bcachefs (%s): " fmt "\n", ((_ca)->name) +#define bch2_fmt_dev_offset(_ca, _offset, fmt) "bcachefs (%s sector %llu): " fmt "\n", ((_ca)->name), (_offset) +#define bch2_fmt_inum(_c, _inum, fmt) "bcachefs (%s inum %llu): " fmt "\n", ((_c)->name), (_inum) +#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ + "bcachefs (%s inum %llu offset %llu): " fmt "\n", ((_c)->name), (_inum), (_offset) + +#else + +#define bch2_log_msg(_c, fmt) fmt +#define bch2_fmt_dev(_ca, fmt) "%s: " fmt "\n", ((_ca)->name) +#define bch2_fmt_dev_offset(_ca, _offset, fmt) "%s sector %llu: " fmt "\n", ((_ca)->name), (_offset) +#define bch2_fmt_inum(_c, _inum, fmt) "inum %llu: " fmt "\n", (_inum) +#define bch2_fmt_inum_offset(_c, _inum, _offset, fmt) \ + "inum %llu offset %llu: " fmt "\n", (_inum), (_offset) + +#endif + +#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") + +#define bch_info(c, fmt, ...) \ + printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_notice(c, fmt, ...) \ + printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_warn(c, fmt, ...) \ + printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_warn_ratelimited(c, fmt, ...) \ + printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__) + +#define bch_err(c, fmt, ...) \ + printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_dev(ca, fmt, ...) \ + printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) +#define bch_err_dev_offset(ca, _offset, fmt, ...) \ + printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) +#define bch_err_inum(c, _inum, fmt, ...) \ + printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) +#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \ + printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + +#define bch_err_ratelimited(c, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__) +#define bch_err_dev_ratelimited(ca, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__) +#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__) +#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__) +#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \ + printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__) + +#define bch_err_fn(_c, _ret) \ + bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret)) +#define bch_err_msg(_c, _ret, _msg) \ + bch_err(_c, "%s(): error " _msg " %s", __func__, bch2_err_str(_ret)) + +#define bch_verbose(c, fmt, ...) \ +do { \ + if ((c)->opts.verbose) \ + bch_info(c, fmt, ##__VA_ARGS__); \ +} while (0) + +#define pr_verbose_init(opts, fmt, ...) 
\ +do { \ + if (opt_get(opts, verbose)) \ + pr_info(fmt, ##__VA_ARGS__); \ +} while (0) + +/* Parameters that are useful for debugging, but should always be compiled in: */ +#define BCH_DEBUG_PARAMS_ALWAYS() \ + BCH_DEBUG_PARAM(key_merging_disabled, \ + "Disables merging of extents") \ + BCH_DEBUG_PARAM(btree_gc_always_rewrite, \ + "Causes mark and sweep to compact and rewrite every " \ + "btree node it traverses") \ + BCH_DEBUG_PARAM(btree_gc_rewrite_disabled, \ + "Disables rewriting of btree nodes during mark and sweep")\ + BCH_DEBUG_PARAM(btree_shrinker_disabled, \ + "Disables the shrinker callback for the btree node cache")\ + BCH_DEBUG_PARAM(verify_btree_ondisk, \ + "Reread btree nodes at various points to verify the " \ + "mergesort in the read path against modifications " \ + "done in memory") \ + BCH_DEBUG_PARAM(verify_all_btree_replicas, \ + "When reading btree nodes, read all replicas and " \ + "compare them") \ + BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ + "Don't use the write buffer for backpointers, enabling "\ + "extra runtime checks") + +/* Parameters that should only be compiled in debug mode: */ +#define BCH_DEBUG_PARAMS_DEBUG() \ + BCH_DEBUG_PARAM(expensive_debug_checks, \ + "Enables various runtime debugging checks that " \ + "significantly affect performance") \ + BCH_DEBUG_PARAM(debug_check_iterators, \ + "Enables extra verification for btree iterators") \ + BCH_DEBUG_PARAM(debug_check_btree_accounting, \ + "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(journal_seq_verify, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ + "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(inject_invalid_keys, \ + "Store the journal sequence number in the version " \ + "number of every btree key, and verify that btree " \ + "update ordering is preserved during recovery") \ + BCH_DEBUG_PARAM(test_alloc_startup, \ + "Force allocator startup to use the slowpath where it" \ + "can't find enough free buckets without invalidating" \ + "cached data") \ + BCH_DEBUG_PARAM(force_reconstruct_read, \ + "Force reads to use the reconstruct path, when reading" \ + "from erasure coded extents") \ + BCH_DEBUG_PARAM(test_restart_gc, \ + "Test restarting mark and sweep gc when bucket gens change") + +#define BCH_DEBUG_PARAMS_ALL() BCH_DEBUG_PARAMS_ALWAYS() BCH_DEBUG_PARAMS_DEBUG() + +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALL() +#else +#define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() +#endif + +#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +#ifndef CONFIG_BCACHEFS_DEBUG +#define BCH_DEBUG_PARAM(name, description) static const bool bch2_##name; +BCH_DEBUG_PARAMS_DEBUG() +#undef BCH_DEBUG_PARAM +#endif + +#define BCH_TIME_STATS() \ + x(btree_node_mem_alloc) \ + x(btree_node_split) \ + x(btree_node_compact) \ + x(btree_node_merge) \ + x(btree_node_sort) \ + x(btree_node_read) \ + x(btree_interior_update_foreground) \ + x(btree_interior_update_total) \ + x(btree_gc) \ + x(data_write) \ + x(data_read) \ + x(data_promote) \ + x(journal_flush_write) \ + x(journal_noflush_write) \ + x(journal_flush_seq) \ + x(blocked_journal) \ + x(blocked_allocate) \ + x(blocked_allocate_open_bucket) \ + x(nocow_lock_contended) + +enum bch_time_stats { +#define x(name) BCH_TIME_##name, + BCH_TIME_STATS() +#undef x + BCH_TIME_STAT_NR +}; + +#include "alloc_types.h" +#include "btree_types.h" +#include 
"btree_write_buffer_types.h" +#include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" +#include "clock_types.h" +#include "ec_types.h" +#include "journal_types.h" +#include "keylist_types.h" +#include "quota_types.h" +#include "rebalance_types.h" +#include "replicas_types.h" +#include "subvolume_types.h" +#include "super_types.h" + +/* Number of nodes btree coalesce will try to coalesce at once */ +#define GC_MERGE_NODES 4U + +/* Maximum number of nodes we might need to allocate atomically: */ +#define BTREE_RESERVE_MAX (BTREE_MAX_DEPTH + (BTREE_MAX_DEPTH - 1)) + +/* Size of the freelist we allocate btree nodes from: */ +#define BTREE_NODE_RESERVE (BTREE_RESERVE_MAX * 4) + +#define BTREE_NODE_OPEN_BUCKET_RESERVE (BTREE_RESERVE_MAX * BCH_REPLICAS_MAX) + +struct btree; + +enum gc_phase { + GC_PHASE_NOT_RUNNING, + GC_PHASE_START, + GC_PHASE_SB, + + GC_PHASE_BTREE_stripes, + GC_PHASE_BTREE_extents, + GC_PHASE_BTREE_inodes, + GC_PHASE_BTREE_dirents, + GC_PHASE_BTREE_xattrs, + GC_PHASE_BTREE_alloc, + GC_PHASE_BTREE_quotas, + GC_PHASE_BTREE_reflink, + GC_PHASE_BTREE_subvolumes, + GC_PHASE_BTREE_snapshots, + GC_PHASE_BTREE_lru, + GC_PHASE_BTREE_freespace, + GC_PHASE_BTREE_need_discard, + GC_PHASE_BTREE_backpointers, + GC_PHASE_BTREE_bucket_gens, + GC_PHASE_BTREE_snapshot_trees, + + GC_PHASE_PENDING_DELETE, +}; + +struct gc_pos { + enum gc_phase phase; + struct bpos pos; + unsigned level; +}; + +struct reflink_gc { + u64 offset; + u32 size; + u32 refcount; +}; + +typedef GENRADIX(struct reflink_gc) reflink_gc_table; + +struct io_count { + u64 sectors[2][BCH_DATA_NR]; +}; + +struct bch_dev { + struct kobject kobj; + struct percpu_ref ref; + struct completion ref_completion; + struct percpu_ref io_ref; + struct completion io_ref_completion; + + struct bch_fs *fs; + + u8 dev_idx; + /* + * Cached version of this device's member info from superblock + * Committed by bch2_write_super() -> bch_fs_mi_update() + */ + struct bch_member_cpu mi; + __uuid_t uuid; + char name[BDEVNAME_SIZE]; + + struct bch_sb_handle disk_sb; + struct bch_sb *sb_read_scratch; + int sb_write_error; + dev_t dev; + atomic_t flush_seq; + + struct bch_devs_mask self; + + /* biosets used in cloned bios for writing multiple replicas */ + struct bio_set replica_set; + + /* + * Buckets: + * Per-bucket arrays are protected by c->mark_lock, bucket_lock and + * gc_lock, for device resize - holding any is sufficient for access: + * Or rcu_read_lock(), but only for ptr_stale(): + */ + struct bucket_array __rcu *buckets_gc; + struct bucket_gens __rcu *bucket_gens; + u8 *oldest_gen; + unsigned long *buckets_nouse; + struct rw_semaphore bucket_lock; + + struct bch_dev_usage *usage_base; + struct bch_dev_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_dev_usage __percpu *usage_gc; + + /* Allocator: */ + u64 new_fs_bucket_idx; + u64 alloc_cursor; + + unsigned nr_open_buckets; + unsigned nr_btree_reserve; + + size_t inc_gen_needs_gc; + size_t inc_gen_really_needs_gc; + size_t buckets_waiting_on_journal; + + atomic64_t rebalance_work; + + struct journal_device journal; + u64 prev_journal_sector; + + struct work_struct io_error_work; + + /* The rest of this all shows up in sysfs */ + atomic64_t cur_latency[2]; + struct bch2_time_stats io_latency[2]; + +#define CONGESTED_MAX 1024 + atomic_t congested; + u64 congested_last; + + struct io_count __percpu *io_done; +}; + +enum { + /* startup: */ + BCH_FS_STARTED, + BCH_FS_MAY_GO_RW, + BCH_FS_RW, + BCH_FS_WAS_RW, + + /* shutdown: */ + BCH_FS_STOPPING, + BCH_FS_EMERGENCY_RO, + 
BCH_FS_GOING_RO, + BCH_FS_WRITE_DISABLE_COMPLETE, + BCH_FS_CLEAN_SHUTDOWN, + + /* fsck passes: */ + BCH_FS_TOPOLOGY_REPAIR_DONE, + BCH_FS_FSCK_DONE, + BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */ + BCH_FS_NEED_ANOTHER_GC, + + BCH_FS_HAVE_DELETED_SNAPSHOTS, + + /* errors: */ + BCH_FS_ERROR, + BCH_FS_TOPOLOGY_ERROR, + BCH_FS_ERRORS_FIXED, + BCH_FS_ERRORS_NOT_FIXED, +}; + +struct btree_debug { + unsigned id; +}; + +#define BCH_TRANSACTIONS_NR 128 + +struct btree_transaction_stats { + struct bch2_time_stats lock_hold_times; + struct mutex lock; + unsigned nr_max_paths; + unsigned wb_updates_size; + unsigned max_mem; + char *max_paths_text; +}; + +struct bch_fs_pcpu { + u64 sectors_available; +}; + +struct journal_seq_blacklist_table { + size_t nr; + struct journal_seq_blacklist_table_entry { + u64 start; + u64 end; + bool dirty; + } entries[0]; +}; + +struct journal_keys { + struct journal_key { + u64 journal_seq; + u32 journal_offset; + enum btree_id btree_id:8; + unsigned level:8; + bool allocated; + bool overwritten; + struct bkey_i *k; + } *d; + /* + * Gap buffer: instead of all the empty space in the array being at the + * end of the buffer - from @nr to @size - the empty space is at @gap. + * This means that sequential insertions are O(n) instead of O(n^2). + */ + size_t gap; + size_t nr; + size_t size; +}; + +struct btree_path_buf { + struct btree_path *path; +}; + +#define REPLICAS_DELTA_LIST_MAX (1U << 16) + +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + +#define BCH_WRITE_REFS() \ + x(trans) \ + x(write) \ + x(promote) \ + x(node_rewrite) \ + x(stripe_create) \ + x(stripe_delete) \ + x(reflink) \ + x(fallocate) \ + x(discard) \ + x(invalidate) \ + x(delete_dead_snapshots) \ + x(snapshot_delete_pagecache) \ + x(sysfs) + +enum bch_write_ref { +#define x(n) BCH_WRITE_REF_##n, + BCH_WRITE_REFS() +#undef x + BCH_WRITE_REF_NR, +}; + +#define PASS_SILENT BIT(0) +#define PASS_FSCK BIT(1) +#define PASS_UNCLEAN BIT(2) +#define PASS_ALWAYS BIT(3) + +#define BCH_RECOVERY_PASSES() \ + x(alloc_read, PASS_ALWAYS) \ + x(stripes_read, PASS_ALWAYS) \ + x(initialize_subvolumes, 0) \ + x(snapshots_read, PASS_ALWAYS) \ + x(check_allocations, PASS_FSCK) \ + x(set_may_go_rw, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, PASS_ALWAYS) \ + x(check_alloc_info, PASS_FSCK) \ + x(check_lrus, PASS_FSCK) \ + x(check_btree_backpointers, PASS_FSCK) \ + x(check_backpointers_to_extents,PASS_FSCK) \ + x(check_extents_to_backpointers,PASS_FSCK) \ + x(check_alloc_to_lru_refs, PASS_FSCK) \ + x(fs_freespace_init, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 0) \ + x(fs_upgrade_for_subvolumes, 0) \ + x(check_snapshot_trees, PASS_FSCK) \ + x(check_snapshots, PASS_FSCK) \ + x(check_subvols, PASS_FSCK) \ + x(delete_dead_snapshots, PASS_FSCK|PASS_UNCLEAN|PASS_SILENT) \ + x(check_inodes, PASS_FSCK|PASS_UNCLEAN) \ + x(check_extents, PASS_FSCK) \ + x(check_dirents, PASS_FSCK) \ + x(check_xattrs, PASS_FSCK) \ + x(check_root, PASS_FSCK) \ + x(check_directory_structure, PASS_FSCK) \ + x(check_nlinks, PASS_FSCK) \ + x(fix_reflink_p, 0) \ + +enum bch_recovery_pass { +#define x(n, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +struct bch_fs { + struct closure cl; + + struct list_head list; + struct kobject kobj; + struct kobject counters_kobj; + struct kobject internal; + struct kobject opts_dir; + struct kobject time_stats; + unsigned long flags; + + int minor; + struct device *chardev; + struct super_block *vfs_sb; + dev_t dev; + 
char name[40]; + + /* ro/rw, add/remove/resize devices: */ + struct rw_semaphore state_lock; + + /* Counts outstanding writes, for clean transition to read-only */ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_t writes[BCH_WRITE_REF_NR]; +#else + struct percpu_ref writes; +#endif + struct work_struct read_only_work; + + struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX]; + + struct bch_replicas_cpu replicas; + struct bch_replicas_cpu replicas_gc; + struct mutex replicas_gc_lock; + mempool_t replicas_delta_pool; + + struct journal_entry_res btree_root_journal_res; + struct journal_entry_res replicas_journal_res; + struct journal_entry_res clock_journal_res; + struct journal_entry_res dev_usage_journal_res; + + struct bch_disk_groups_cpu __rcu *disk_groups; + + struct bch_opts opts; + + /* Updated by bch2_sb_update():*/ + struct { + __uuid_t uuid; + __uuid_t user_uuid; + + u16 version; + u16 version_min; + u16 version_upgrade_complete; + + u8 nr_devices; + u8 clean; + + u8 encryption_type; + + u64 time_base_lo; + u32 time_base_hi; + unsigned time_units_per_sec; + unsigned nsec_per_time_unit; + u64 features; + u64 compat; + } sb; + + + struct bch_sb_handle disk_sb; + + unsigned short block_bits; /* ilog2(block_size) */ + + u16 btree_foreground_merge_threshold; + + struct closure sb_write; + struct mutex sb_lock; + + /* snapshot.c: */ + struct snapshot_table __rcu *snapshots; + size_t snapshot_table_size; + struct mutex snapshot_table_lock; + + struct work_struct snapshot_delete_work; + struct work_struct snapshot_wait_for_pagecache_and_delete_work; + snapshot_id_list snapshots_unlinked; + struct mutex snapshots_unlinked_lock; + + /* BTREE CACHE */ + struct bio_set btree_bio; + struct workqueue_struct *io_complete_wq; + + struct btree_root btree_roots_known[BTREE_ID_NR]; + DARRAY(struct btree_root) btree_roots_extra; + struct mutex btree_root_lock; + + struct btree_cache btree_cache; + + /* + * Cache of allocated btree nodes - if we allocate a btree node and + * don't use it, if we free it that space can't be reused until going + * _all_ the way through the allocator (which exposes us to a livelock + * when allocating btree reserves fail halfway through) - instead, we + * can stick them here: + */ + struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; + unsigned btree_reserve_cache_nr; + struct mutex btree_reserve_cache_lock; + + mempool_t btree_interior_update_pool; + struct list_head btree_interior_update_list; + struct list_head btree_interior_updates_unwritten; + struct mutex btree_interior_update_lock; + struct closure_waitlist btree_interior_update_wait; + + struct workqueue_struct *btree_interior_update_worker; + struct work_struct btree_interior_update_work; + + struct list_head pending_node_rewrites; + struct mutex pending_node_rewrites_lock; + + /* btree_io.c: */ + spinlock_t btree_write_error_lock; + struct btree_write_stats { + atomic64_t nr; + atomic64_t bytes; + } btree_write_stats[BTREE_WRITE_TYPE_NR]; + + /* btree_iter.c: */ + struct seqmutex btree_trans_lock; + struct list_head btree_trans_list; + mempool_t btree_paths_pool; + mempool_t btree_trans_mem_pool; + struct btree_path_buf __percpu *btree_paths_bufs; + + struct srcu_struct btree_trans_barrier; + bool btree_trans_barrier_initialized; + + struct btree_key_cache btree_key_cache; + unsigned btree_key_cache_btrees; + + struct btree_write_buffer btree_write_buffer; + + struct workqueue_struct *btree_update_wq; + struct workqueue_struct *btree_io_complete_wq; + /* copygc needs its own workqueue for index updates.. 
*/ + struct workqueue_struct *copygc_wq; + /* + * Use a dedicated wq for write ref holder tasks. Required to avoid + * dependency problems with other wq tasks that can block on ref + * draining, such as read-only transition. + */ + struct workqueue_struct *write_ref_wq; + + /* ALLOCATION */ + struct bch_devs_mask rw_devs[BCH_DATA_NR]; + + u64 capacity; /* sectors */ + + /* + * When capacity _decreases_ (due to a disk being removed), we + * increment capacity_gen - this invalidates outstanding reservations + * and forces them to be revalidated + */ + u32 capacity_gen; + unsigned bucket_size_max; + + atomic64_t sectors_available; + struct mutex sectors_available_lock; + + struct bch_fs_pcpu __percpu *pcpu; + + struct percpu_rw_semaphore mark_lock; + + seqcount_t usage_lock; + struct bch_fs_usage *usage_base; + struct bch_fs_usage __percpu *usage[JOURNAL_BUF_NR]; + struct bch_fs_usage __percpu *usage_gc; + u64 __percpu *online_reserved; + + /* single element mempool: */ + struct mutex usage_scratch_lock; + struct bch_fs_usage_online *usage_scratch; + + struct io_clock io_clock[2]; + + /* JOURNAL SEQ BLACKLIST */ + struct journal_seq_blacklist_table * + journal_seq_blacklist_table; + struct work_struct journal_seq_blacklist_gc_work; + + /* ALLOCATOR */ + spinlock_t freelist_lock; + struct closure_waitlist freelist_wait; + u64 blocked_allocate; + u64 blocked_allocate_open_bucket; + + open_bucket_idx_t open_buckets_freelist; + open_bucket_idx_t open_buckets_nr_free; + struct closure_waitlist open_buckets_wait; + struct open_bucket open_buckets[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_hash[OPEN_BUCKETS_COUNT]; + + open_bucket_idx_t open_buckets_partial[OPEN_BUCKETS_COUNT]; + open_bucket_idx_t open_buckets_partial_nr; + + struct write_point btree_write_point; + struct write_point rebalance_write_point; + + struct write_point write_points[WRITE_POINT_MAX]; + struct hlist_head write_points_hash[WRITE_POINT_HASH_NR]; + struct mutex write_points_hash_lock; + unsigned write_points_nr; + + struct buckets_waiting_for_journal buckets_waiting_for_journal; + struct work_struct discard_work; + struct work_struct invalidate_work; + + /* GARBAGE COLLECTION */ + struct task_struct *gc_thread; + atomic_t kick_gc; + unsigned long gc_count; + + enum btree_id gc_gens_btree; + struct bpos gc_gens_pos; + + /* + * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] + * has been marked by GC. + * + * gc_cur_phase is a superset of btree_ids (BTREE_ID_extents etc.) + * + * Protected by gc_pos_lock. Only written to by GC thread, so GC thread + * can read without a lock. + */ + seqcount_t gc_pos_lock; + struct gc_pos gc_pos; + + /* + * The allocation code needs gc_mark in struct bucket to be correct, but + * it's not while a gc is in progress. 
+ */ + struct rw_semaphore gc_lock; + struct mutex gc_gens_lock; + + /* IO PATH */ + struct semaphore io_in_flight; + struct bio_set bio_read; + struct bio_set bio_read_split; + struct bio_set bio_write; + struct mutex bio_bounce_pages_lock; + mempool_t bio_bounce_pages; + struct bucket_nocow_lock_table + nocow_locks; + struct rhashtable promote_table; + + mempool_t compression_bounce[2]; + mempool_t compress_workspace[BCH_COMPRESSION_TYPE_NR]; + mempool_t decompress_workspace; + ZSTD_parameters zstd_params; + + struct crypto_shash *sha256; + struct crypto_sync_skcipher *chacha20; + struct crypto_shash *poly1305; + + atomic64_t key_version; + + mempool_t large_bkey_pool; + + /* MOVE.C */ + struct list_head moving_context_list; + struct mutex moving_context_lock; + + struct list_head data_progress_list; + struct mutex data_progress_lock; + + /* REBALANCE */ + struct bch_fs_rebalance rebalance; + + /* COPYGC */ + struct task_struct *copygc_thread; + struct write_point copygc_write_point; + s64 copygc_wait_at; + s64 copygc_wait; + bool copygc_running; + wait_queue_head_t copygc_running_wq; + + /* STRIPES: */ + GENRADIX(struct stripe) stripes; + GENRADIX(struct gc_stripe) gc_stripes; + + struct hlist_head ec_stripes_new[32]; + spinlock_t ec_stripes_new_lock; + + ec_stripes_heap ec_stripes_heap; + struct mutex ec_stripes_heap_lock; + + /* ERASURE CODING */ + struct list_head ec_stripe_head_list; + struct mutex ec_stripe_head_lock; + + struct list_head ec_stripe_new_list; + struct mutex ec_stripe_new_lock; + wait_queue_head_t ec_stripe_new_wait; + + struct work_struct ec_stripe_create_work; + u64 ec_stripe_hint; + + struct work_struct ec_stripe_delete_work; + + struct bio_set ec_bioset; + + /* REFLINK */ + reflink_gc_table reflink_gc_table; + size_t reflink_gc_nr; + + /* fs.c */ + struct list_head vfs_inodes_list; + struct mutex vfs_inodes_lock; + + /* VFS IO PATH - fs-io.c */ + struct bio_set writepage_bioset; + struct bio_set dio_write_bioset; + struct bio_set dio_read_bioset; + struct bio_set nocow_flush_bioset; + + /* ERRORS */ + struct list_head fsck_errors; + struct mutex fsck_error_lock; + bool fsck_alloc_err; + + /* QUOTAS */ + struct bch_memquota_type quotas[QTYP_NR]; + + /* RECOVERY */ + u64 journal_replay_seq_start; + u64 journal_replay_seq_end; + enum bch_recovery_pass curr_recovery_pass; + /* bitmap of explicitly enabled recovery passes: */ + u64 recovery_passes_explicit; + + /* DEBUG JUNK */ + struct dentry *fs_debug_dir; + struct dentry *btree_debug_dir; + struct btree_debug btree_debug[BTREE_ID_NR]; + struct btree *verify_data; + struct btree_node *verify_ondisk; + struct mutex verify_lock; + + u64 *unused_inode_hints; + unsigned inode_shard_bits; + + /* + * A btree node on disk could have too many bsets for an iterator to fit + * on the stack - have to dynamically allocate them + */ + mempool_t fill_iter; + + mempool_t btree_bounce_pool; + + struct journal journal; + GENRADIX(struct journal_replay *) journal_entries; + u64 journal_entries_base_seq; + struct journal_keys journal_keys; + struct list_head journal_iters; + + u64 last_bucket_seq_cleanup; + + u64 counters_on_mount[BCH_COUNTER_NR]; + u64 __percpu *counters; + + unsigned btree_gc_periodic:1; + unsigned copy_gc_enabled:1; + bool promote_whole_extents; + + struct bch2_time_stats times[BCH_TIME_STAT_NR]; + + struct btree_transaction_stats btree_transaction_stats[BCH_TRANSACTIONS_NR]; +}; + +extern struct wait_queue_head bch2_read_only_wait; + +static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) 
+{ +#ifdef BCH_WRITE_REF_DEBUG + atomic_long_inc(&c->writes[ref]); +#else + percpu_ref_get(&c->writes); +#endif +} + +static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + return !test_bit(BCH_FS_GOING_RO, &c->flags) && + atomic_long_inc_not_zero(&c->writes[ref]); +#else + return percpu_ref_tryget_live(&c->writes); +#endif +} + +static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) +{ +#ifdef BCH_WRITE_REF_DEBUG + long v = atomic_long_dec_return(&c->writes[ref]); + + BUG_ON(v < 0); + if (v) + return; + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + if (atomic_long_read(&c->writes[i])) + return; + + set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + wake_up(&bch2_read_only_wait); +#else + percpu_ref_put(&c->writes); +#endif +} + +static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages) +{ +#ifndef NO_BCACHEFS_FS + if (c->vfs_sb) + c->vfs_sb->s_bdi->ra_pages = ra_pages; +#endif +} + +static inline unsigned bucket_bytes(const struct bch_dev *ca) +{ + return ca->mi.bucket_size << 9; +} + +static inline unsigned block_bytes(const struct bch_fs *c) +{ + return c->opts.block_size; +} + +static inline unsigned block_sectors(const struct bch_fs *c) +{ + return c->opts.block_size >> 9; +} + +static inline size_t btree_sectors(const struct bch_fs *c) +{ + return c->opts.btree_node_size >> 9; +} + +static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree) +{ + return c->btree_key_cache_btrees & (1U << btree); +} + +static inline struct timespec64 bch2_time_to_timespec(const struct bch_fs *c, s64 time) +{ + struct timespec64 t; + s32 rem; + + time += c->sb.time_base_lo; + + t.tv_sec = div_s64_rem(time, c->sb.time_units_per_sec, &rem); + t.tv_nsec = rem * c->sb.nsec_per_time_unit; + return t; +} + +static inline s64 timespec_to_bch2_time(const struct bch_fs *c, struct timespec64 ts) +{ + return (ts.tv_sec * c->sb.time_units_per_sec + + (int) ts.tv_nsec / c->sb.nsec_per_time_unit) - c->sb.time_base_lo; +} + +static inline s64 bch2_current_time(const struct bch_fs *c) +{ + struct timespec64 now; + + ktime_get_coarse_real_ts64(&now); + return timespec_to_bch2_time(c, now); +} + +static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev) +{ + return dev < c->sb.nr_devices && c->devs[dev]; +} + +#define BKEY_PADDED_ONSTACK(key, pad) \ + struct { struct bkey_i key; __u64 key ## _pad[pad]; } + +#endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h new file mode 100644 index 000000000..5c308f842 --- /dev/null +++ b/fs/bcachefs/bcachefs_format.h @@ -0,0 +1,2319 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FORMAT_H +#define _BCACHEFS_FORMAT_H + +/* + * bcachefs on disk data structures + * + * OVERVIEW: + * + * There are three main types of on disk data structures in bcachefs (this is + * reduced from 5 in bcache) + * + * - superblock + * - journal + * - btree + * + * The btree is the primary structure; most metadata exists as keys in the + * various btrees. There are only a small number of btrees, they're not + * sharded - we have one btree for extents, another for inodes, et cetera. + * + * SUPERBLOCK: + * + * The superblock contains the location of the journal, the list of devices in + * the filesystem, and in general any metadata we need in order to decide + * whether we can start a filesystem or prior to reading the journal/btree + * roots. 
+ * + * The superblock is extensible, and most of the contents of the superblock are + * in variable length, type tagged fields; see struct bch_sb_field. + * + * Backup superblocks do not reside in a fixed location; also, superblocks do + * not have a fixed size. To locate backup superblocks we have struct + * bch_sb_layout; we store a copy of this inside every superblock, and also + * before the first superblock. + * + * JOURNAL: + * + * The journal primarily records btree updates in the order they occurred; + * journal replay consists of just iterating over all the keys in the open + * journal entries and re-inserting them into the btrees. + * + * The journal also contains entry types for the btree roots, and blacklisted + * journal sequence numbers (see journal_seq_blacklist.c). + * + * BTREE: + * + * bcachefs btrees are copy on write b+ trees, where nodes are big (typically + * 128k-256k) and log structured. We use struct btree_node for writing the first + * entry in a given node (offset 0), and struct btree_node_entry for all + * subsequent writes. + * + * After the header, btree node entries contain a list of keys in sorted order. + * Values are stored inline with the keys; since values are variable length (and + * keys effectively are variable length too, due to packing) we can't do random + * access without building up additional in memory tables in the btree node read + * path. + * + * BTREE KEYS (struct bkey): + * + * The various btrees share a common format for the key - so as to avoid + * switching in fastpath lookup/comparison code - but define their own + * structures for the key values. + * + * The size of a key/value pair is stored as a u8 in units of u64s, so the max + * size is just under 2k. The common part also contains a type tag for the + * value, and a format field indicating whether the key is packed or not (and + * also meant to allow adding new key fields in the future, if desired). + * + * bkeys, when stored within a btree node, may also be packed. In that case, the + * bkey_format in that node is used to unpack it. Packed bkeys mean that we can + * be generous with field sizes in the common part of the key format (64 bit + * inode number, 64 bit offset, 96 bit version field, etc.) for negligible cost. 
+ */ + +#include +#include +#include +#include +#include "vstructs.h" + +#ifdef __KERNEL__ +typedef uuid_t __uuid_t; +#endif + +#define BITMASK(name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ +static const unsigned name##_BITS = (end - offset); \ + \ +static inline __u64 name(const type *k) \ +{ \ + return (k->field >> offset) & ~(~0ULL << (end - offset)); \ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + k->field &= ~(~(~0ULL << (end - offset)) << offset); \ + k->field |= (v & ~(~0ULL << (end - offset))) << offset; \ +} + +#define LE_BITMASK(_bits, name, type, field, offset, end) \ +static const unsigned name##_OFFSET = offset; \ +static const unsigned name##_BITS = (end - offset); \ +static const __u##_bits name##_MAX = (1ULL << (end - offset)) - 1; \ + \ +static inline __u64 name(const type *k) \ +{ \ + return (__le##_bits##_to_cpu(k->field) >> offset) & \ + ~(~0ULL << (end - offset)); \ +} \ + \ +static inline void SET_##name(type *k, __u64 v) \ +{ \ + __u##_bits new = __le##_bits##_to_cpu(k->field); \ + \ + new &= ~(~(~0ULL << (end - offset)) << offset); \ + new |= (v & ~(~0ULL << (end - offset))) << offset; \ + k->field = __cpu_to_le##_bits(new); \ +} + +#define LE16_BITMASK(n, t, f, o, e) LE_BITMASK(16, n, t, f, o, e) +#define LE32_BITMASK(n, t, f, o, e) LE_BITMASK(32, n, t, f, o, e) +#define LE64_BITMASK(n, t, f, o, e) LE_BITMASK(64, n, t, f, o, e) + +struct bkey_format { + __u8 key_u64s; + __u8 nr_fields; + /* One unused slot for now: */ + __u8 bits_per_field[6]; + __le64 field_offset[6]; +}; + +/* Btree keys - all units are in sectors */ + +struct bpos { + /* + * Word order matches machine byte order - btree code treats a bpos as a + * single large integer, for search/comparison purposes + * + * Note that wherever a bpos is embedded in another on disk data + * structure, it has to be byte swabbed when reading in metadata that + * wasn't written in native endian order: + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + __u32 snapshot; + __u64 offset; + __u64 inode; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + __u64 inode; + __u64 offset; /* Points to end of extent - sectors */ + __u32 snapshot; +#else +#error edit for your odd byteorder. +#endif +} __packed __aligned(4); + +#define KEY_INODE_MAX ((__u64)~0ULL) +#define KEY_OFFSET_MAX ((__u64)~0ULL) +#define KEY_SNAPSHOT_MAX ((__u32)~0U) +#define KEY_SIZE_MAX ((__u32)~0U) + +static inline struct bpos SPOS(__u64 inode, __u64 offset, __u32 snapshot) +{ + return (struct bpos) { + .inode = inode, + .offset = offset, + .snapshot = snapshot, + }; +} + +#define POS_MIN SPOS(0, 0, 0) +#define POS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, 0) +#define SPOS_MAX SPOS(KEY_INODE_MAX, KEY_OFFSET_MAX, KEY_SNAPSHOT_MAX) +#define POS(_inode, _offset) SPOS(_inode, _offset, 0) + +/* Empty placeholder struct, for container_of() */ +struct bch_val { + __u64 __nothing[0]; +}; + +struct bversion { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + __u64 lo; + __u32 hi; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + __u32 hi; + __u64 lo; +#endif +} __packed __aligned(4); + +struct bkey { + /* Size of combined key and value, in u64s */ + __u8 u64s; + + /* Format of key (0 for format local to btree node) */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 format:7, + needs_whiteout:1; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 needs_whiteout:1, + format:7; +#else +#error edit for your odd byteorder. 
+#endif + + /* Type of the value */ + __u8 type; + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + __u8 pad[1]; + + struct bversion version; + __u32 size; /* extent size, in sectors */ + struct bpos p; +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + struct bpos p; + __u32 size; /* extent size, in sectors */ + struct bversion version; + + __u8 pad[1]; +#endif +} __packed __aligned(8); + +struct bkey_packed { + __u64 _data[0]; + + /* Size of combined key and value, in u64s */ + __u8 u64s; + + /* Format of key (0 for format local to btree node) */ + + /* + * XXX: next incompat on disk format change, switch format and + * needs_whiteout - bkey_packed() will be cheaper if format is the high + * bits of the bitfield + */ +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 format:7, + needs_whiteout:1; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 needs_whiteout:1, + format:7; +#endif + + /* Type of the value */ + __u8 type; + __u8 key_start[0]; + + /* + * We copy bkeys with struct assignment in various places, and while + * that shouldn't be done with packed bkeys we can't disallow it in C, + * and it's legal to cast a bkey to a bkey_packed - so padding it out + * to the same size as struct bkey should hopefully be safest. + */ + __u8 pad[sizeof(struct bkey) - 3]; +} __packed __aligned(8); + +typedef struct { + __le64 lo; + __le64 hi; +} bch_le128; + +#define BKEY_U64s (sizeof(struct bkey) / sizeof(__u64)) +#define BKEY_U64s_MAX U8_MAX +#define BKEY_VAL_U64s_MAX (BKEY_U64s_MAX - BKEY_U64s) + +#define KEY_PACKED_BITS_START 24 + +#define KEY_FORMAT_LOCAL_BTREE 0 +#define KEY_FORMAT_CURRENT 1 + +enum bch_bkey_fields { + BKEY_FIELD_INODE, + BKEY_FIELD_OFFSET, + BKEY_FIELD_SNAPSHOT, + BKEY_FIELD_SIZE, + BKEY_FIELD_VERSION_HI, + BKEY_FIELD_VERSION_LO, + BKEY_NR_FIELDS, +}; + +#define bkey_format_field(name, field) \ + [BKEY_FIELD_##name] = (sizeof(((struct bkey *) NULL)->field) * 8) + +#define BKEY_FORMAT_CURRENT \ +((struct bkey_format) { \ + .key_u64s = BKEY_U64s, \ + .nr_fields = BKEY_NR_FIELDS, \ + .bits_per_field = { \ + bkey_format_field(INODE, p.inode), \ + bkey_format_field(OFFSET, p.offset), \ + bkey_format_field(SNAPSHOT, p.snapshot), \ + bkey_format_field(SIZE, size), \ + bkey_format_field(VERSION_HI, version.hi), \ + bkey_format_field(VERSION_LO, version.lo), \ + }, \ +}) + +/* bkey with inline value */ +struct bkey_i { + __u64 _data[0]; + + struct bkey k; + struct bch_val v; +}; + +#define KEY(_inode, _offset, _size) \ +((struct bkey) { \ + .u64s = BKEY_U64s, \ + .format = KEY_FORMAT_CURRENT, \ + .p = POS(_inode, _offset), \ + .size = _size, \ +}) + +static inline void bkey_init(struct bkey *k) +{ + *k = KEY(0, 0, 0); +} + +#define bkey_bytes(_k) ((_k)->u64s * sizeof(__u64)) + +#define __BKEY_PADDED(key, pad) \ + struct bkey_i key; __u64 key ## _pad[pad] + +/* + * - DELETED keys are used internally to mark keys that should be ignored but + * override keys in composition order. Their version number is ignored. + * + * - DISCARDED keys indicate that the data is all 0s because it has been + * discarded. DISCARDs may have a version; if the version is nonzero the key + * will be persistent, otherwise the key will be dropped whenever the btree + * node is rewritten (like DELETED keys). + * + * - ERROR: any read of the data returns a read error, as the data was lost due + * to a failing device. Like DISCARDED keys, they can be removed (overridden) + * by new writes or cluster-wide GC. 
Node repair can also overwrite them with + * the same or a more recent version number, but not with an older version + * number. + * + * - WHITEOUT: for hash table btrees + */ +#define BCH_BKEY_TYPES() \ + x(deleted, 0) \ + x(whiteout, 1) \ + x(error, 2) \ + x(cookie, 3) \ + x(hash_whiteout, 4) \ + x(btree_ptr, 5) \ + x(extent, 6) \ + x(reservation, 7) \ + x(inode, 8) \ + x(inode_generation, 9) \ + x(dirent, 10) \ + x(xattr, 11) \ + x(alloc, 12) \ + x(quota, 13) \ + x(stripe, 14) \ + x(reflink_p, 15) \ + x(reflink_v, 16) \ + x(inline_data, 17) \ + x(btree_ptr_v2, 18) \ + x(indirect_inline_data, 19) \ + x(alloc_v2, 20) \ + x(subvolume, 21) \ + x(snapshot, 22) \ + x(inode_v2, 23) \ + x(alloc_v3, 24) \ + x(set, 25) \ + x(lru, 26) \ + x(alloc_v4, 27) \ + x(backpointer, 28) \ + x(inode_v3, 29) \ + x(bucket_gens, 30) \ + x(snapshot_tree, 31) + +enum bch_bkey_type { +#define x(name, nr) KEY_TYPE_##name = nr, + BCH_BKEY_TYPES() +#undef x + KEY_TYPE_MAX, +}; + +struct bch_deleted { + struct bch_val v; +}; + +struct bch_whiteout { + struct bch_val v; +}; + +struct bch_error { + struct bch_val v; +}; + +struct bch_cookie { + struct bch_val v; + __le64 cookie; +}; + +struct bch_hash_whiteout { + struct bch_val v; +}; + +struct bch_set { + struct bch_val v; +}; + +/* Extents */ + +/* + * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally + * preceded by checksum/compression information (bch_extent_crc32 or + * bch_extent_crc64). + * + * One major determining factor in the format of extents is how we handle and + * represent extents that have been partially overwritten and thus trimmed: + * + * If an extent is not checksummed or compressed, when the extent is trimmed we + * don't have to remember the extent we originally allocated and wrote: we can + * merely adjust ptr->offset to point to the start of the data that is currently + * live. The size field in struct bkey records the current (live) size of the + * extent, and is also used to mean "size of region on disk that we point to" in + * this case. + * + * Thus an extent that is not checksummed or compressed will consist only of a + * list of bch_extent_ptrs, with none of the fields in + * bch_extent_crc32/bch_extent_crc64. + * + * When an extent is checksummed or compressed, it's not possible to read only + * the data that is currently live: we have to read the entire extent that was + * originally written, and then return only the part of the extent that is + * currently live. + * + * Thus, in addition to the current size of the extent in struct bkey, we need + * to store the size of the originally allocated space - this is the + * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also, + * when the extent is trimmed, instead of modifying the offset field of the + * pointer, we keep a second smaller offset field - "offset into the original + * extent of the currently live region". + * + * The other major determining factor is replication and data migration: + * + * Each pointer may have its own bch_extent_crc32/64. 
When doing a replicated + * write, we will initially write all the replicas in the same format, with the + * same checksum type and compression format - however, when copygc runs later (or + * tiering/cache promotion, anything that moves data), it is not in general + * going to rewrite all the pointers at once - one of the replicas may be in a + * bucket on one device that has very little fragmentation while another lives + * in a bucket that has become heavily fragmented, and thus is being rewritten + * sooner than the rest. + * + * Thus it will only move a subset of the pointers (or in the case of + * tiering/cache promotion perhaps add a single pointer without dropping any + * current pointers), and if the extent has been partially overwritten it must + * write only the currently live portion (or copygc would not be able to reduce + * fragmentation!) - which necessitates a different bch_extent_crc format for + * the new pointer. + * + * But in the interests of space efficiency, we don't want to store one + * bch_extent_crc for each pointer if we don't have to. + * + * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and + * bch_extent_ptrs appended arbitrarily one after the other. We determine the + * type of a given entry with a scheme similar to utf8 (except we're encoding a + * type, not a size), encoding the type in the position of the first set bit: + * + * bch_extent_crc32 - 0b1 + * bch_extent_ptr - 0b10 + * bch_extent_crc64 - 0b100 + * + * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and + * bch_extent_crc64 is the least constrained). + * + * Then, each bch_extent_crc32/64 applies to the pointers that follow after it, + * until the next bch_extent_crc32/64. + * + * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer + * is neither checksummed nor compressed. 
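+ *
+ * In code terms, walking an extent value is then a matter of looking at the
+ * low bits of each entry: the position of the lowest set bit identifies the
+ * entry type (see union bch_extent_entry and enum bch_extent_entry_type
+ * below), roughly
+ *
+ *        unsigned type = __ffs(entry->type);
+ *
+ * and the type in turn determines the entry's size, so we can step to the
+ * next entry. (A sketch of the idea, not the in-tree helper.)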
+ */ + +/* 128 bits, sufficient for cryptographic MACs: */ +struct bch_csum { + __le64 lo; + __le64 hi; +} __packed __aligned(8); + +#define BCH_EXTENT_ENTRY_TYPES() \ + x(ptr, 0) \ + x(crc32, 1) \ + x(crc64, 2) \ + x(crc128, 3) \ + x(stripe_ptr, 4) \ + x(rebalance, 5) +#define BCH_EXTENT_ENTRY_MAX 6 + +enum bch_extent_entry_type { +#define x(f, n) BCH_EXTENT_ENTRY_##f = n, + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +/* Compressed/uncompressed size are stored biased by 1: */ +struct bch_extent_crc32 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u32 type:2, + _compressed_size:7, + _uncompressed_size:7, + offset:7, + _unused:1, + csum_type:4, + compression_type:4; + __u32 csum; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u32 csum; + __u32 compression_type:4, + csum_type:4, + _unused:1, + offset:7, + _uncompressed_size:7, + _compressed_size:7, + type:2; +#endif +} __packed __aligned(8); + +#define CRC32_SIZE_MAX (1U << 7) +#define CRC32_NONCE_MAX 0 + +struct bch_extent_crc64 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:3, + _compressed_size:9, + _uncompressed_size:9, + offset:9, + nonce:10, + csum_type:4, + compression_type:4, + csum_hi:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 csum_hi:16, + compression_type:4, + csum_type:4, + nonce:10, + offset:9, + _uncompressed_size:9, + _compressed_size:9, + type:3; +#endif + __u64 csum_lo; +} __packed __aligned(8); + +#define CRC64_SIZE_MAX (1U << 9) +#define CRC64_NONCE_MAX ((1U << 10) - 1) + +struct bch_extent_crc128 { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:4, + _compressed_size:13, + _uncompressed_size:13, + offset:13, + nonce:13, + csum_type:4, + compression_type:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 compression_type:4, + csum_type:4, + nonce:13, + offset:13, + _uncompressed_size:13, + _compressed_size:13, + type:4; +#endif + struct bch_csum csum; +} __packed __aligned(8); + +#define CRC128_SIZE_MAX (1U << 13) +#define CRC128_NONCE_MAX ((1U << 13) - 1) + +/* + * @reservation - pointer hasn't been written to, just reserved + */ +struct bch_extent_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:1, + cached:1, + unused:1, + unwritten:1, + offset:44, /* 8 petabytes */ + dev:8, + gen:8; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 gen:8, + dev:8, + offset:44, + unwritten:1, + unused:1, + cached:1, + type:1; +#endif +} __packed __aligned(8); + +struct bch_extent_stripe_ptr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:5, + block:8, + redundancy:4, + idx:47; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 idx:47, + redundancy:4, + block:8, + type:5; +#endif +}; + +struct bch_extent_reservation { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:6, + unused:22, + replicas:4, + generation:32; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 generation:32, + replicas:4, + unused:22, + type:6; +#endif +}; + +struct bch_extent_rebalance { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:7, + unused:33, + compression:8, + target:16; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 target:16, + compression:8, + unused:33, + type:7; +#endif +}; + +union bch_extent_entry { +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64 + unsigned long type; +#elif __BITS_PER_LONG == 32 + struct { + unsigned long pad; + unsigned long type; + }; +#else +#error edit for your odd byteorder. 
+#endif + +#define x(f, n) struct bch_extent_##f f; + BCH_EXTENT_ENTRY_TYPES() +#undef x +}; + +struct bch_btree_ptr { + struct bch_val v; + + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +struct bch_btree_ptr_v2 { + struct bch_val v; + + __u64 mem_ptr; + __le64 seq; + __le16 sectors_written; + __le16 flags; + struct bpos min_key; + __u64 _data[0]; + struct bch_extent_ptr start[]; +} __packed __aligned(8); + +LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1); + +struct bch_extent { + struct bch_val v; + + __u64 _data[0]; + union bch_extent_entry start[]; +} __packed __aligned(8); + +struct bch_reservation { + struct bch_val v; + + __le32 generation; + __u8 nr_replicas; + __u8 pad[3]; +} __packed __aligned(8); + +/* Maximum size (in u64s) a single pointer could be: */ +#define BKEY_EXTENT_PTR_U64s_MAX\ + ((sizeof(struct bch_extent_crc128) + \ + sizeof(struct bch_extent_ptr)) / sizeof(__u64)) + +/* Maximum possible size of an entire extent value: */ +#define BKEY_EXTENT_VAL_U64s_MAX \ + (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1)) + +/* * Maximum possible size of an entire extent, key + value: */ +#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX) + +/* Btree pointers don't carry around checksums: */ +#define BKEY_BTREE_PTR_VAL_U64s_MAX \ + ((sizeof(struct bch_btree_ptr_v2) + \ + sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64)) +#define BKEY_BTREE_PTR_U64s_MAX \ + (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX) + +/* Inodes */ + +#define BLOCKDEV_INODE_MAX 4096 + +#define BCACHEFS_ROOT_INO 4096 + +struct bch_inode { + struct bch_val v; + + __le64 bi_hash_seed; + __le32 bi_flags; + __le16 bi_mode; + __u8 fields[0]; +} __packed __aligned(8); + +struct bch_inode_v2 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le16 bi_mode; + __u8 fields[0]; +} __packed __aligned(8); + +struct bch_inode_v3 { + struct bch_val v; + + __le64 bi_journal_seq; + __le64 bi_hash_seed; + __le64 bi_flags; + __le64 bi_sectors; + __le64 bi_size; + __le64 bi_version; + __u8 fields[0]; +} __packed __aligned(8); + +#define INODEv3_FIELDS_START_INITIAL 6 +#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64)) + +struct bch_inode_generation { + struct bch_val v; + + __le32 bi_generation; + __le32 pad; +} __packed __aligned(8); + +/* + * bi_subvol and bi_parent_subvol are only set for subvolume roots: + */ + +#define BCH_INODE_FIELDS_v2() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_size, 64) \ + x(bi_sectors, 64) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) + +#define BCH_INODE_FIELDS_v3() \ + x(bi_atime, 96) \ + x(bi_ctime, 96) \ + x(bi_mtime, 96) \ + x(bi_otime, 96) \ + x(bi_uid, 32) \ + x(bi_gid, 32) \ + x(bi_nlink, 32) \ + x(bi_generation, 32) \ + x(bi_dev, 32) \ + x(bi_data_checksum, 8) \ + x(bi_compression, 8) \ + x(bi_project, 32) \ + x(bi_background_compression, 8) \ + x(bi_data_replicas, 8) \ + x(bi_promote_target, 16) \ + x(bi_foreground_target, 16) \ + x(bi_background_target, 
16) \ + x(bi_erasure_code, 16) \ + x(bi_fields_set, 16) \ + x(bi_dir, 64) \ + x(bi_dir_offset, 64) \ + x(bi_subvol, 32) \ + x(bi_parent_subvol, 32) \ + x(bi_nocow, 8) + +/* subset of BCH_INODE_FIELDS */ +#define BCH_INODE_OPTS() \ + x(data_checksum, 8) \ + x(compression, 8) \ + x(project, 32) \ + x(background_compression, 8) \ + x(data_replicas, 8) \ + x(promote_target, 16) \ + x(foreground_target, 16) \ + x(background_target, 16) \ + x(erasure_code, 16) \ + x(nocow, 8) + +enum inode_opt_id { +#define x(name, ...) \ + Inode_opt_##name, + BCH_INODE_OPTS() +#undef x + Inode_opt_nr, +}; + +enum { + /* + * User flags (get/settable with FS_IOC_*FLAGS, correspond to FS_*_FL + * flags) + */ + __BCH_INODE_SYNC = 0, + __BCH_INODE_IMMUTABLE = 1, + __BCH_INODE_APPEND = 2, + __BCH_INODE_NODUMP = 3, + __BCH_INODE_NOATIME = 4, + + __BCH_INODE_I_SIZE_DIRTY = 5, + __BCH_INODE_I_SECTORS_DIRTY = 6, + __BCH_INODE_UNLINKED = 7, + __BCH_INODE_BACKPTR_UNTRUSTED = 8, + + /* bits 20+ reserved for packed fields below: */ +}; + +#define BCH_INODE_SYNC (1 << __BCH_INODE_SYNC) +#define BCH_INODE_IMMUTABLE (1 << __BCH_INODE_IMMUTABLE) +#define BCH_INODE_APPEND (1 << __BCH_INODE_APPEND) +#define BCH_INODE_NODUMP (1 << __BCH_INODE_NODUMP) +#define BCH_INODE_NOATIME (1 << __BCH_INODE_NOATIME) +#define BCH_INODE_I_SIZE_DIRTY (1 << __BCH_INODE_I_SIZE_DIRTY) +#define BCH_INODE_I_SECTORS_DIRTY (1 << __BCH_INODE_I_SECTORS_DIRTY) +#define BCH_INODE_UNLINKED (1 << __BCH_INODE_UNLINKED) +#define BCH_INODE_BACKPTR_UNTRUSTED (1 << __BCH_INODE_BACKPTR_UNTRUSTED) + +LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24); +LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31); +LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32); + +LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24); +LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24); +LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31); + +LE64_BITMASK(INODEv3_FIELDS_START, + struct bch_inode_v3, bi_flags, 31, 36); +LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52); + +/* Dirents */ + +/* + * Dirents (and xattrs) have to implement string lookups; since our b-tree + * doesn't support arbitrary length strings for the key, we instead index by a + * 64 bit hash (currently truncated sha1) of the string, stored in the offset + * field of the key - using linear probing to resolve hash collisions. This also + * provides us with the readdir cookie posix requires. 
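+ *
+ * As a simplified example: looking up "foo" in directory inode 4096 becomes
+ * a search in the dirents btree at roughly POS(4096, hash("foo")); if the
+ * dirent found there holds a different name that happened to hash to the
+ * same value, we keep scanning forward through successive offsets until we
+ * find a dirent whose name compares equal - the linear probing mentioned
+ * above (hash() here standing in for the configured string hash).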
+ * + * Linear probing requires us to use whiteouts for deletions, in the event of a + * collision: + */ + +struct bch_dirent { + struct bch_val v; + + /* Target inode number: */ + union { + __le64 d_inum; + struct { /* DT_SUBVOL */ + __le32 d_child_subvol; + __le32 d_parent_subvol; + }; + }; + + /* + * Copy of mode bits 12-15 from the target inode - so userspace can get + * the filetype without having to do a stat() + */ + __u8 d_type; + + __u8 d_name[]; +} __packed __aligned(8); + +#define DT_SUBVOL 16 +#define BCH_DT_MAX 17 + +#define BCH_NAME_MAX ((unsigned) (U8_MAX * sizeof(__u64) - \ + sizeof(struct bkey) - \ + offsetof(struct bch_dirent, d_name))) + +/* Xattrs */ + +#define KEY_TYPE_XATTR_INDEX_USER 0 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1 +#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2 +#define KEY_TYPE_XATTR_INDEX_TRUSTED 3 +#define KEY_TYPE_XATTR_INDEX_SECURITY 4 + +struct bch_xattr { + struct bch_val v; + __u8 x_type; + __u8 x_name_len; + __le16 x_val_len; + __u8 x_name[]; +} __packed __aligned(8); + +/* Bucket/allocation information: */ + +struct bch_alloc { + struct bch_val v; + __u8 fields; + __u8 gen; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V1() \ + x(read_time, 16) \ + x(write_time, 16) \ + x(data_type, 8) \ + x(dirty_sectors, 16) \ + x(cached_sectors, 16) \ + x(oldest_gen, 8) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +enum { +#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name, + BCH_ALLOC_FIELDS_V1() +#undef x +}; + +struct bch_alloc_v2 { + struct bch_val v; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +#define BCH_ALLOC_FIELDS_V2() \ + x(read_time, 64) \ + x(write_time, 64) \ + x(dirty_sectors, 32) \ + x(cached_sectors, 32) \ + x(stripe, 32) \ + x(stripe_redundancy, 8) + +struct bch_alloc_v3 { + struct bch_val v; + __le64 journal_seq; + __le32 flags; + __u8 nr_fields; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 data[]; +} __packed __aligned(8); + +LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1) +LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2) + +struct bch_alloc_v4 { + struct bch_val v; + __u64 journal_seq; + __u32 flags; + __u8 gen; + __u8 oldest_gen; + __u8 data_type; + __u8 stripe_redundancy; + __u32 dirty_sectors; + __u32 cached_sectors; + __u64 io_time[2]; + __u32 stripe; + __u32 nr_external_backpointers; + __u64 fragmentation_lru; +} __packed __aligned(8); + +#define BCH_ALLOC_V4_U64s_V0 6 +#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64)) + +BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1) +BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2) +BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8) +BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14) + +#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40 + +struct bch_backpointer { + struct bch_val v; + __u8 btree_id; + __u8 level; + __u8 data_type; + __u64 bucket_offset:40; + __u32 bucket_len; + struct bpos pos; +} __packed __aligned(8); + +#define KEY_TYPE_BUCKET_GENS_BITS 8 +#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS) +#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1) + +struct bch_bucket_gens { + struct bch_val v; + u8 gens[KEY_TYPE_BUCKET_GENS_NR]; +} __packed __aligned(8); + +/* Quotas: */ + +enum quota_types { + QTYP_USR = 0, + QTYP_GRP = 1, + QTYP_PRJ = 2, + QTYP_NR = 3, +}; + +enum quota_counters { + Q_SPC = 0, + 
Q_INO = 1, + Q_COUNTERS = 2, +}; + +struct bch_quota_counter { + __le64 hardlimit; + __le64 softlimit; +}; + +struct bch_quota { + struct bch_val v; + struct bch_quota_counter c[Q_COUNTERS]; +} __packed __aligned(8); + +/* Erasure coding */ + +struct bch_stripe { + struct bch_val v; + __le16 sectors; + __u8 algorithm; + __u8 nr_blocks; + __u8 nr_redundant; + + __u8 csum_granularity_bits; + __u8 csum_type; + __u8 pad; + + struct bch_extent_ptr ptrs[]; +} __packed __aligned(8); + +/* Reflink: */ + +struct bch_reflink_p { + struct bch_val v; + __le64 idx; + /* + * A reflink pointer might point to an indirect extent which is then + * later split (by copygc or rebalance). If we only pointed to part of + * the original indirect extent, and then one of the fragments is + * outside the range we point to, we'd leak a refcount: so when creating + * reflink pointers, we need to store pad values to remember the full + * range we were taking a reference on. + */ + __le32 front_pad; + __le32 back_pad; +} __packed __aligned(8); + +struct bch_reflink_v { + struct bch_val v; + __le64 refcount; + union bch_extent_entry start[0]; + __u64 _data[0]; +} __packed __aligned(8); + +struct bch_indirect_inline_data { + struct bch_val v; + __le64 refcount; + u8 data[0]; +}; + +/* Inline data */ + +struct bch_inline_data { + struct bch_val v; + u8 data[0]; +}; + +/* Subvolumes: */ + +#define SUBVOL_POS_MIN POS(0, 1) +#define SUBVOL_POS_MAX POS(0, S32_MAX) +#define BCACHEFS_ROOT_SUBVOL 1 + +struct bch_subvolume { + struct bch_val v; + __le32 flags; + __le32 snapshot; + __le64 inode; + __le32 parent; + __le32 pad; + bch_le128 otime; +}; + +LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1) +/* + * We need to know whether a subvolume is a snapshot so we can know whether we + * can delete it (or whether it should just be rm -rf'd) + */ +LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2) +LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3) + +/* Snapshots */ + +struct bch_snapshot { + struct bch_val v; + __le32 flags; + __le32 parent; + __le32 children[2]; + __le32 subvol; + __le32 tree; + __le32 depth; + __le32 skip[3]; +}; + +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) + +/* True if a subvolume points to this snapshot node: */ +LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) + +/* + * Snapshot trees: + * + * The snapshot_trees btree gives us persistent indentifier for each tree of + * bch_snapshot nodes, and allow us to record and easily find the root/master + * subvolume that other snapshots were created from: + */ +struct bch_snapshot_tree { + struct bch_val v; + __le32 master_subvol; + __le32 root_snapshot; +}; + +/* LRU btree: */ + +struct bch_lru { + struct bch_val v; + __le64 idx; +} __packed __aligned(8); + +#define LRU_ID_STRIPES (1U << 16) + +/* Optional/variable size superblock sections: */ + +struct bch_sb_field { + __u64 _data[0]; + __le32 u64s; + __le32 type; +}; + +#define BCH_SB_FIELDS() \ + x(journal, 0) \ + x(members, 1) \ + x(crypt, 2) \ + x(replicas_v0, 3) \ + x(quota, 4) \ + x(disk_groups, 5) \ + x(clean, 6) \ + x(replicas, 7) \ + x(journal_seq_blacklist, 8) \ + x(journal_v2, 9) \ + x(counters, 10) + +enum bch_sb_field_type { +#define x(f, nr) BCH_SB_FIELD_##f = nr, + BCH_SB_FIELDS() +#undef x + BCH_SB_FIELD_NR +}; + +/* + * Most superblock fields are replicated in all device's superblocks - a few are + * not: + */ +#define BCH_SINGLE_DEVICE_SB_FIELDS \ + ((1U << BCH_SB_FIELD_journal)| \ + (1U << 
BCH_SB_FIELD_journal_v2)) + +/* BCH_SB_FIELD_journal: */ + +struct bch_sb_field_journal { + struct bch_sb_field field; + __le64 buckets[0]; +}; + +struct bch_sb_field_journal_v2 { + struct bch_sb_field field; + + struct bch_sb_field_journal_v2_entry { + __le64 start; + __le64 nr; + } d[0]; +}; + +/* BCH_SB_FIELD_members: */ + +#define BCH_MIN_NR_NBUCKETS (1 << 6) + +struct bch_member { + __uuid_t uuid; + __le64 nbuckets; /* device size */ + __le16 first_bucket; /* index of first bucket used */ + __le16 bucket_size; /* sectors */ + __le32 pad; + __le64 last_mount; /* time_t */ + + __le64 flags[2]; +}; + +LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags[0], 0, 4) +/* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ +LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags[0], 14, 15) +LE64_BITMASK(BCH_MEMBER_DATA_ALLOWED, struct bch_member, flags[0], 15, 20) +LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags[0], 20, 28) +LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags[0], 28, 30) +LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, + struct bch_member, flags[0], 30, 31) + +#if 0 +LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); +LE64_BITMASK(BCH_MEMBER_NR_WRITE_ERRORS,struct bch_member, flags[1], 20, 40); +#endif + +#define BCH_MEMBER_STATES() \ + x(rw, 0) \ + x(ro, 1) \ + x(failed, 2) \ + x(spare, 3) + +enum bch_member_state { +#define x(t, n) BCH_MEMBER_STATE_##t = n, + BCH_MEMBER_STATES() +#undef x + BCH_MEMBER_STATE_NR +}; + +struct bch_sb_field_members { + struct bch_sb_field field; + struct bch_member members[0]; +}; + +/* BCH_SB_FIELD_crypt: */ + +struct nonce { + __le32 d[4]; +}; + +struct bch_key { + __le64 key[4]; +}; + +#define BCH_KEY_MAGIC \ + (((__u64) 'b' << 0)|((__u64) 'c' << 8)| \ + ((__u64) 'h' << 16)|((__u64) '*' << 24)| \ + ((__u64) '*' << 32)|((__u64) 'k' << 40)| \ + ((__u64) 'e' << 48)|((__u64) 'y' << 56)) + +struct bch_encrypted_key { + __le64 magic; + struct bch_key key; +}; + +/* + * If this field is present in the superblock, it stores an encryption key which + * is used encrypt all other data/metadata. The key will normally be encrypted + * with the key userspace provides, but if encryption has been turned off we'll + * just store the master key unencrypted in the superblock so we can access the + * previously encrypted data. 
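+ *
+ * Roughly, unlocking then consists of decrypting this struct
+ * bch_encrypted_key with the key derived from the user's passphrase and
+ * checking the result - a sketch of the idea, not the in-tree code:
+ *
+ *        if (decrypted.magic != __cpu_to_le64(BCH_KEY_MAGIC))
+ *                return -EINVAL;        /* wrong passphrase */
+ *
+ * BCH_KEY_MAGIC is just the ASCII bytes "bch**key" packed into a 64 bit
+ * integer, so a wrong passphrase shows up as garbage magic.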
+ */ +struct bch_sb_field_crypt { + struct bch_sb_field field; + + __le64 flags; + __le64 kdf_flags; + struct bch_encrypted_key key; +}; + +LE64_BITMASK(BCH_CRYPT_KDF_TYPE, struct bch_sb_field_crypt, flags, 0, 4); + +enum bch_kdf_types { + BCH_KDF_SCRYPT = 0, + BCH_KDF_NR = 1, +}; + +/* stored as base 2 log of scrypt params: */ +LE64_BITMASK(BCH_KDF_SCRYPT_N, struct bch_sb_field_crypt, kdf_flags, 0, 16); +LE64_BITMASK(BCH_KDF_SCRYPT_R, struct bch_sb_field_crypt, kdf_flags, 16, 32); +LE64_BITMASK(BCH_KDF_SCRYPT_P, struct bch_sb_field_crypt, kdf_flags, 32, 48); + +/* BCH_SB_FIELD_replicas: */ + +#define BCH_DATA_TYPES() \ + x(free, 0) \ + x(sb, 1) \ + x(journal, 2) \ + x(btree, 3) \ + x(user, 4) \ + x(cached, 5) \ + x(parity, 6) \ + x(stripe, 7) \ + x(need_gc_gens, 8) \ + x(need_discard, 9) + +enum bch_data_type { +#define x(t, n) BCH_DATA_##t, + BCH_DATA_TYPES() +#undef x + BCH_DATA_NR +}; + +static inline bool data_type_is_empty(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_free: + case BCH_DATA_need_gc_gens: + case BCH_DATA_need_discard: + return true; + default: + return false; + } +} + +static inline bool data_type_is_hidden(enum bch_data_type type) +{ + switch (type) { + case BCH_DATA_sb: + case BCH_DATA_journal: + return true; + default: + return false; + } +} + +struct bch_replicas_entry_v0 { + __u8 data_type; + __u8 nr_devs; + __u8 devs[0]; +} __packed; + +struct bch_sb_field_replicas_v0 { + struct bch_sb_field field; + struct bch_replicas_entry_v0 entries[0]; +} __packed __aligned(8); + +struct bch_replicas_entry { + __u8 data_type; + __u8 nr_devs; + __u8 nr_required; + __u8 devs[0]; +} __packed; + +#define replicas_entry_bytes(_i) \ + (offsetof(typeof(*(_i)), devs) + (_i)->nr_devs) + +struct bch_sb_field_replicas { + struct bch_sb_field field; + struct bch_replicas_entry entries[0]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_quota: */ + +struct bch_sb_quota_counter { + __le32 timelimit; + __le32 warnlimit; +}; + +struct bch_sb_quota_type { + __le64 flags; + struct bch_sb_quota_counter c[Q_COUNTERS]; +}; + +struct bch_sb_field_quota { + struct bch_sb_field field; + struct bch_sb_quota_type q[QTYP_NR]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_disk_groups: */ + +#define BCH_SB_LABEL_SIZE 32 + +struct bch_disk_group { + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 flags[2]; +} __packed __aligned(8); + +LE64_BITMASK(BCH_GROUP_DELETED, struct bch_disk_group, flags[0], 0, 1) +LE64_BITMASK(BCH_GROUP_DATA_ALLOWED, struct bch_disk_group, flags[0], 1, 6) +LE64_BITMASK(BCH_GROUP_PARENT, struct bch_disk_group, flags[0], 6, 24) + +struct bch_sb_field_disk_groups { + struct bch_sb_field field; + struct bch_disk_group entries[0]; +} __packed __aligned(8); + +/* BCH_SB_FIELD_counters */ + +#define BCH_PERSISTENT_COUNTERS() \ + x(io_read, 0) \ + x(io_write, 1) \ + x(io_move, 2) \ + x(bucket_invalidate, 3) \ + x(bucket_discard, 4) \ + x(bucket_alloc, 5) \ + x(bucket_alloc_fail, 6) \ + x(btree_cache_scan, 7) \ + x(btree_cache_reap, 8) \ + x(btree_cache_cannibalize, 9) \ + x(btree_cache_cannibalize_lock, 10) \ + x(btree_cache_cannibalize_lock_fail, 11) \ + x(btree_cache_cannibalize_unlock, 12) \ + x(btree_node_write, 13) \ + x(btree_node_read, 14) \ + x(btree_node_compact, 15) \ + x(btree_node_merge, 16) \ + x(btree_node_split, 17) \ + x(btree_node_rewrite, 18) \ + x(btree_node_alloc, 19) \ + x(btree_node_free, 20) \ + x(btree_node_set_root, 21) \ + x(btree_path_relock_fail, 22) \ + x(btree_path_upgrade_fail, 23) \ + x(btree_reserve_get_fail, 24) \ + x(journal_entry_full, 25) \ + 
x(journal_full, 26) \ + x(journal_reclaim_finish, 27) \ + x(journal_reclaim_start, 28) \ + x(journal_write, 29) \ + x(read_promote, 30) \ + x(read_bounce, 31) \ + x(read_split, 33) \ + x(read_retry, 32) \ + x(read_reuse_race, 34) \ + x(move_extent_read, 35) \ + x(move_extent_write, 36) \ + x(move_extent_finish, 37) \ + x(move_extent_fail, 38) \ + x(move_extent_alloc_mem_fail, 39) \ + x(copygc, 40) \ + x(copygc_wait, 41) \ + x(gc_gens_end, 42) \ + x(gc_gens_start, 43) \ + x(trans_blocked_journal_reclaim, 44) \ + x(trans_restart_btree_node_reused, 45) \ + x(trans_restart_btree_node_split, 46) \ + x(trans_restart_fault_inject, 47) \ + x(trans_restart_iter_upgrade, 48) \ + x(trans_restart_journal_preres_get, 49) \ + x(trans_restart_journal_reclaim, 50) \ + x(trans_restart_journal_res_get, 51) \ + x(trans_restart_key_cache_key_realloced, 52) \ + x(trans_restart_key_cache_raced, 53) \ + x(trans_restart_mark_replicas, 54) \ + x(trans_restart_mem_realloced, 55) \ + x(trans_restart_memory_allocation_failure, 56) \ + x(trans_restart_relock, 57) \ + x(trans_restart_relock_after_fill, 58) \ + x(trans_restart_relock_key_cache_fill, 59) \ + x(trans_restart_relock_next_node, 60) \ + x(trans_restart_relock_parent_for_fill, 61) \ + x(trans_restart_relock_path, 62) \ + x(trans_restart_relock_path_intent, 63) \ + x(trans_restart_too_many_iters, 64) \ + x(trans_restart_traverse, 65) \ + x(trans_restart_upgrade, 66) \ + x(trans_restart_would_deadlock, 67) \ + x(trans_restart_would_deadlock_write, 68) \ + x(trans_restart_injected, 69) \ + x(trans_restart_key_cache_upgrade, 70) \ + x(trans_traverse_all, 71) \ + x(transaction_commit, 72) \ + x(write_super, 73) \ + x(trans_restart_would_deadlock_recursion_limit, 74) \ + x(trans_restart_write_buffer_flush, 75) \ + x(trans_restart_split_race, 76) + +enum bch_persistent_counters { +#define x(t, n, ...) 
BCH_COUNTER_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_NR +}; + +struct bch_sb_field_counters { + struct bch_sb_field field; + __le64 d[0]; +}; + +/* + * On clean shutdown, store btree roots and current journal sequence number in + * the superblock: + */ +struct jset_entry { + __le16 u64s; + __u8 btree_id; + __u8 level; + __u8 type; /* designates what this jset holds */ + __u8 pad[3]; + + union { + struct bkey_i start[0]; + __u64 _data[0]; + }; +}; + +struct bch_sb_field_clean { + struct bch_sb_field field; + + __le32 flags; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; + __le64 journal_seq; + + union { + struct jset_entry start[0]; + __u64 _data[0]; + }; +}; + +struct journal_seq_blacklist_entry { + __le64 start; + __le64 end; +}; + +struct bch_sb_field_journal_seq_blacklist { + struct bch_sb_field field; + + union { + struct journal_seq_blacklist_entry start[0]; + __u64 _data[0]; + }; +}; + +/* Superblock: */ + +/* + * New versioning scheme: + * One common version number for all on disk data structures - superblock, btree + * nodes, journal entries + */ +#define BCH_VERSION_MAJOR(_v) ((__u16) ((_v) >> 10)) +#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10))) +#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0) + +#define RECOVERY_PASS_ALL_FSCK (1ULL << 63) + +#define BCH_METADATA_VERSIONS() \ + x(bkey_renumber, BCH_VERSION(0, 10), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_btree_change, BCH_VERSION(0, 11), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot, BCH_VERSION(0, 12), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_backpointers, BCH_VERSION(0, 13), \ + RECOVERY_PASS_ALL_FSCK) \ + x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_2, BCH_VERSION(0, 15), \ + BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \ + BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(reflink_p_fix, BCH_VERSION(0, 16), \ + BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \ + x(subvol_dirent, BCH_VERSION(0, 17), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v2, BCH_VERSION(0, 18), \ + RECOVERY_PASS_ALL_FSCK) \ + x(freespace, BCH_VERSION(0, 19), \ + RECOVERY_PASS_ALL_FSCK) \ + x(alloc_v4, BCH_VERSION(0, 20), \ + RECOVERY_PASS_ALL_FSCK) \ + x(new_data_types, BCH_VERSION(0, 21), \ + RECOVERY_PASS_ALL_FSCK) \ + x(backpointers, BCH_VERSION(0, 22), \ + RECOVERY_PASS_ALL_FSCK) \ + x(inode_v3, BCH_VERSION(0, 23), \ + RECOVERY_PASS_ALL_FSCK) \ + x(unwritten_extents, BCH_VERSION(0, 24), \ + RECOVERY_PASS_ALL_FSCK) \ + x(bucket_gens, BCH_VERSION(0, 25), \ + BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \ + RECOVERY_PASS_ALL_FSCK) \ + x(lru_v2, BCH_VERSION(0, 26), \ + RECOVERY_PASS_ALL_FSCK) \ + x(fragmentation_lru, BCH_VERSION(0, 27), \ + RECOVERY_PASS_ALL_FSCK) \ + x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \ + RECOVERY_PASS_ALL_FSCK) \ + x(snapshot_trees, BCH_VERSION(0, 29), \ + RECOVERY_PASS_ALL_FSCK) \ + x(major_minor, BCH_VERSION(1, 0), \ + 0) \ + x(snapshot_skiplists, BCH_VERSION(1, 1), \ + BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) + +enum bcachefs_metadata_version { + bcachefs_metadata_version_min = 9, +#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n, + BCH_METADATA_VERSIONS() +#undef x + bcachefs_metadata_version_max +}; + +static const unsigned bcachefs_metadata_required_upgrade_below = bcachefs_metadata_version_major_minor; + +#define bcachefs_metadata_version_current (bcachefs_metadata_version_max - 1) + +#define BCH_SB_SECTOR 8 +#define BCH_SB_MEMBERS_MAX 64 /* XXX kill */ + 
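+/*
+ * Quick worked example of the version packing above: the minor number is the
+ * low 10 bits and the major number the bits above it, so BCH_VERSION(0, 24)
+ * == 24 and BCH_VERSION(1, 1) == (1 << 10)|1 == 1025, with
+ * BCH_VERSION_MAJOR(1025) == 1 and BCH_VERSION_MINOR(1025) == 1.
+ */
+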
+struct bch_sb_layout { + __uuid_t magic; /* bcachefs superblock UUID */ + __u8 layout_type; + __u8 sb_max_size_bits; /* base 2 of 512 byte sectors */ + __u8 nr_superblocks; + __u8 pad[5]; + __le64 sb_offset[61]; +} __packed __aligned(8); + +#define BCH_SB_LAYOUT_SECTOR 7 + +/* + * @offset - sector where this sb was written + * @version - on disk format version + * @version_min - Oldest metadata version this filesystem contains; so we can + * safely drop compatibility code and refuse to mount filesystems + * we'd need it for + * @magic - identifies as a bcachefs superblock (BCHFS_MAGIC) + * @seq - incremented each time superblock is written + * @uuid - used for generating various magic numbers and identifying + * member devices, never changes + * @user_uuid - user visible UUID, may be changed + * @label - filesystem label + * @seq - identifies most recent superblock, incremented each time + * superblock is written + * @features - enabled incompatible features + */ +struct bch_sb { + struct bch_csum csum; + __le16 version; + __le16 version_min; + __le16 pad[2]; + __uuid_t magic; + __uuid_t uuid; + __uuid_t user_uuid; + __u8 label[BCH_SB_LABEL_SIZE]; + __le64 offset; + __le64 seq; + + __le16 block_size; + __u8 dev_idx; + __u8 nr_devices; + __le32 u64s; + + __le64 time_base_lo; + __le32 time_base_hi; + __le32 time_precision; + + __le64 flags[8]; + __le64 features[2]; + __le64 compat[2]; + + struct bch_sb_layout layout; + + union { + struct bch_sb_field start[0]; + __le64 _data[0]; + }; +} __packed __aligned(8); + +/* + * Flags: + * BCH_SB_INITALIZED - set on first mount + * BCH_SB_CLEAN - did we shut down cleanly? Just a hint, doesn't affect + * behaviour of mount/recovery path: + * BCH_SB_INODE_32BIT - limit inode numbers to 32 bits + * BCH_SB_128_BIT_MACS - 128 bit macs instead of 80 + * BCH_SB_ENCRYPTION_TYPE - if nonzero encryption is enabled; overrides + * DATA/META_CSUM_TYPE. 
Also indicates encryption + * algorithm in use, if/when we get more than one + */ + +LE16_BITMASK(BCH_SB_BLOCK_SIZE, struct bch_sb, block_size, 0, 16); + +LE64_BITMASK(BCH_SB_INITIALIZED, struct bch_sb, flags[0], 0, 1); +LE64_BITMASK(BCH_SB_CLEAN, struct bch_sb, flags[0], 1, 2); +LE64_BITMASK(BCH_SB_CSUM_TYPE, struct bch_sb, flags[0], 2, 8); +LE64_BITMASK(BCH_SB_ERROR_ACTION, struct bch_sb, flags[0], 8, 12); + +LE64_BITMASK(BCH_SB_BTREE_NODE_SIZE, struct bch_sb, flags[0], 12, 28); + +LE64_BITMASK(BCH_SB_GC_RESERVE, struct bch_sb, flags[0], 28, 33); +LE64_BITMASK(BCH_SB_ROOT_RESERVE, struct bch_sb, flags[0], 33, 40); + +LE64_BITMASK(BCH_SB_META_CSUM_TYPE, struct bch_sb, flags[0], 40, 44); +LE64_BITMASK(BCH_SB_DATA_CSUM_TYPE, struct bch_sb, flags[0], 44, 48); + +LE64_BITMASK(BCH_SB_META_REPLICAS_WANT, struct bch_sb, flags[0], 48, 52); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_WANT, struct bch_sb, flags[0], 52, 56); + +LE64_BITMASK(BCH_SB_POSIX_ACL, struct bch_sb, flags[0], 56, 57); +LE64_BITMASK(BCH_SB_USRQUOTA, struct bch_sb, flags[0], 57, 58); +LE64_BITMASK(BCH_SB_GRPQUOTA, struct bch_sb, flags[0], 58, 59); +LE64_BITMASK(BCH_SB_PRJQUOTA, struct bch_sb, flags[0], 59, 60); + +LE64_BITMASK(BCH_SB_HAS_ERRORS, struct bch_sb, flags[0], 60, 61); +LE64_BITMASK(BCH_SB_HAS_TOPOLOGY_ERRORS,struct bch_sb, flags[0], 61, 62); + +LE64_BITMASK(BCH_SB_BIG_ENDIAN, struct bch_sb, flags[0], 62, 63); + +LE64_BITMASK(BCH_SB_STR_HASH_TYPE, struct bch_sb, flags[1], 0, 4); +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_LO,struct bch_sb, flags[1], 4, 8); +LE64_BITMASK(BCH_SB_INODE_32BIT, struct bch_sb, flags[1], 8, 9); + +LE64_BITMASK(BCH_SB_128_BIT_MACS, struct bch_sb, flags[1], 9, 10); +LE64_BITMASK(BCH_SB_ENCRYPTION_TYPE, struct bch_sb, flags[1], 10, 14); + +/* + * Max size of an extent that may require bouncing to read or write + * (checksummed, compressed): 64k + */ +LE64_BITMASK(BCH_SB_ENCODED_EXTENT_MAX_BITS, + struct bch_sb, flags[1], 14, 20); + +LE64_BITMASK(BCH_SB_META_REPLICAS_REQ, struct bch_sb, flags[1], 20, 24); +LE64_BITMASK(BCH_SB_DATA_REPLICAS_REQ, struct bch_sb, flags[1], 24, 28); + +LE64_BITMASK(BCH_SB_PROMOTE_TARGET, struct bch_sb, flags[1], 28, 40); +LE64_BITMASK(BCH_SB_FOREGROUND_TARGET, struct bch_sb, flags[1], 40, 52); +LE64_BITMASK(BCH_SB_BACKGROUND_TARGET, struct bch_sb, flags[1], 52, 64); + +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO, + struct bch_sb, flags[2], 0, 4); +LE64_BITMASK(BCH_SB_GC_RESERVE_BYTES, struct bch_sb, flags[2], 4, 64); + +LE64_BITMASK(BCH_SB_ERASURE_CODE, struct bch_sb, flags[3], 0, 16); +LE64_BITMASK(BCH_SB_METADATA_TARGET, struct bch_sb, flags[3], 16, 28); +LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); +LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); +LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); +LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); +LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); +LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); +LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); +LE64_BITMASK(BCH_SB_WRITE_BUFFER_SIZE, struct bch_sb, flags[4], 34, 54); +LE64_BITMASK(BCH_SB_VERSION_UPGRADE, struct bch_sb, flags[4], 54, 56); + +LE64_BITMASK(BCH_SB_COMPRESSION_TYPE_HI,struct bch_sb, flags[4], 56, 60); +LE64_BITMASK(BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI, + struct bch_sb, flags[4], 60, 64); + +LE64_BITMASK(BCH_SB_VERSION_UPGRADE_COMPLETE, + struct bch_sb, flags[5], 0, 16); + +static inline __u64 
BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) +{ + return BCH_SB_COMPRESSION_TYPE_LO(sb) | (BCH_SB_COMPRESSION_TYPE_HI(sb) << 4); +} + +static inline void SET_BCH_SB_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) +{ + SET_BCH_SB_COMPRESSION_TYPE_LO(sb, v); + SET_BCH_SB_COMPRESSION_TYPE_HI(sb, v >> 4); +} + +static inline __u64 BCH_SB_BACKGROUND_COMPRESSION_TYPE(const struct bch_sb *sb) +{ + return BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb) | + (BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb) << 4); +} + +static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u64 v) +{ + SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_LO(sb, v); + SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE_HI(sb, v >> 4); +} + +/* + * Features: + * + * journal_seq_blacklist_v3: gates BCH_SB_FIELD_journal_seq_blacklist + * reflink: gates KEY_TYPE_reflink + * inline_data: gates KEY_TYPE_inline_data + * new_siphash: gates BCH_STR_HASH_siphash + * new_extent_overwrite: gates BTREE_NODE_NEW_EXTENT_OVERWRITE + */ +#define BCH_SB_FEATURES() \ + x(lz4, 0) \ + x(gzip, 1) \ + x(zstd, 2) \ + x(atomic_nlink, 3) \ + x(ec, 4) \ + x(journal_seq_blacklist_v3, 5) \ + x(reflink, 6) \ + x(new_siphash, 7) \ + x(inline_data, 8) \ + x(new_extent_overwrite, 9) \ + x(incompressible, 10) \ + x(btree_ptr_v2, 11) \ + x(extents_above_btree_updates, 12) \ + x(btree_updates_journalled, 13) \ + x(reflink_inline_data, 14) \ + x(new_varint, 15) \ + x(journal_no_flush, 16) \ + x(alloc_v2, 17) \ + x(extents_across_btree_nodes, 18) + +#define BCH_SB_FEATURES_ALWAYS \ + ((1ULL << BCH_FEATURE_new_extent_overwrite)| \ + (1ULL << BCH_FEATURE_extents_above_btree_updates)|\ + (1ULL << BCH_FEATURE_btree_updates_journalled)|\ + (1ULL << BCH_FEATURE_alloc_v2)|\ + (1ULL << BCH_FEATURE_extents_across_btree_nodes)) + +#define BCH_SB_FEATURES_ALL \ + (BCH_SB_FEATURES_ALWAYS| \ + (1ULL << BCH_FEATURE_new_siphash)| \ + (1ULL << BCH_FEATURE_btree_ptr_v2)| \ + (1ULL << BCH_FEATURE_new_varint)| \ + (1ULL << BCH_FEATURE_journal_no_flush)) + +enum bch_sb_feature { +#define x(f, n) BCH_FEATURE_##f, + BCH_SB_FEATURES() +#undef x + BCH_FEATURE_NR, +}; + +#define BCH_SB_COMPAT() \ + x(alloc_info, 0) \ + x(alloc_metadata, 1) \ + x(extents_above_btree_updates_done, 2) \ + x(bformat_overflow_done, 3) + +enum bch_sb_compat { +#define x(f, n) BCH_COMPAT_##f, + BCH_SB_COMPAT() +#undef x + BCH_COMPAT_NR, +}; + +/* options: */ + +#define BCH_VERSION_UPGRADE_OPTS() \ + x(compatible, 0) \ + x(incompatible, 1) \ + x(none, 2) + +enum bch_version_upgrade_opts { +#define x(t, n) BCH_VERSION_UPGRADE_##t = n, + BCH_VERSION_UPGRADE_OPTS() +#undef x +}; + +#define BCH_REPLICAS_MAX 4U + +#define BCH_BKEY_PTRS_MAX 16U + +#define BCH_ERROR_ACTIONS() \ + x(continue, 0) \ + x(ro, 1) \ + x(panic, 2) + +enum bch_error_actions { +#define x(t, n) BCH_ON_ERROR_##t = n, + BCH_ERROR_ACTIONS() +#undef x + BCH_ON_ERROR_NR +}; + +#define BCH_STR_HASH_TYPES() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash_old, 2) \ + x(siphash, 3) + +enum bch_str_hash_type { +#define x(t, n) BCH_STR_HASH_##t = n, + BCH_STR_HASH_TYPES() +#undef x + BCH_STR_HASH_NR +}; + +#define BCH_STR_HASH_OPTS() \ + x(crc32c, 0) \ + x(crc64, 1) \ + x(siphash, 2) + +enum bch_str_hash_opts { +#define x(t, n) BCH_STR_HASH_OPT_##t = n, + BCH_STR_HASH_OPTS() +#undef x + BCH_STR_HASH_OPT_NR +}; + +#define BCH_CSUM_TYPES() \ + x(none, 0) \ + x(crc32c_nonzero, 1) \ + x(crc64_nonzero, 2) \ + x(chacha20_poly1305_80, 3) \ + x(chacha20_poly1305_128, 4) \ + x(crc32c, 5) \ + x(crc64, 6) \ + x(xxhash, 7) + +enum bch_csum_type { +#define x(t, n) BCH_CSUM_##t 
= n, + BCH_CSUM_TYPES() +#undef x + BCH_CSUM_NR +}; + +static const unsigned bch_crc_bytes[] = { + [BCH_CSUM_none] = 0, + [BCH_CSUM_crc32c_nonzero] = 4, + [BCH_CSUM_crc32c] = 4, + [BCH_CSUM_crc64_nonzero] = 8, + [BCH_CSUM_crc64] = 8, + [BCH_CSUM_xxhash] = 8, + [BCH_CSUM_chacha20_poly1305_80] = 10, + [BCH_CSUM_chacha20_poly1305_128] = 16, +}; + +static inline _Bool bch2_csum_type_is_encryption(enum bch_csum_type type) +{ + switch (type) { + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: + return true; + default: + return false; + } +} + +#define BCH_CSUM_OPTS() \ + x(none, 0) \ + x(crc32c, 1) \ + x(crc64, 2) \ + x(xxhash, 3) + +enum bch_csum_opts { +#define x(t, n) BCH_CSUM_OPT_##t = n, + BCH_CSUM_OPTS() +#undef x + BCH_CSUM_OPT_NR +}; + +#define BCH_COMPRESSION_TYPES() \ + x(none, 0) \ + x(lz4_old, 1) \ + x(gzip, 2) \ + x(lz4, 3) \ + x(zstd, 4) \ + x(incompressible, 5) + +enum bch_compression_type { +#define x(t, n) BCH_COMPRESSION_TYPE_##t = n, + BCH_COMPRESSION_TYPES() +#undef x + BCH_COMPRESSION_TYPE_NR +}; + +#define BCH_COMPRESSION_OPTS() \ + x(none, 0) \ + x(lz4, 1) \ + x(gzip, 2) \ + x(zstd, 3) + +enum bch_compression_opts { +#define x(t, n) BCH_COMPRESSION_OPT_##t = n, + BCH_COMPRESSION_OPTS() +#undef x + BCH_COMPRESSION_OPT_NR +}; + +/* + * Magic numbers + * + * The various other data structures have their own magic numbers, which are + * xored with the first part of the cache set's UUID + */ + +#define BCACHE_MAGIC \ + UUID_INIT(0xc68573f6, 0x4e1a, 0x45ca, \ + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81) +#define BCHFS_MAGIC \ + UUID_INIT(0xc68573f6, 0x66ce, 0x90a9, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) + +#define BCACHEFS_STATFS_MAGIC 0xca451a4e + +#define JSET_MAGIC __cpu_to_le64(0x245235c1a3625032ULL) +#define BSET_MAGIC __cpu_to_le64(0x90135c78b99e07f5ULL) + +static inline __le64 __bch2_sb_magic(struct bch_sb *sb) +{ + __le64 ret; + + memcpy(&ret, &sb->uuid, sizeof(ret)); + return ret; +} + +static inline __u64 __jset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ JSET_MAGIC); +} + +static inline __u64 __bset_magic(struct bch_sb *sb) +{ + return __le64_to_cpu(__bch2_sb_magic(sb) ^ BSET_MAGIC); +} + +/* Journal */ + +#define JSET_KEYS_U64s (sizeof(struct jset_entry) / sizeof(__u64)) + +#define BCH_JSET_ENTRY_TYPES() \ + x(btree_keys, 0) \ + x(btree_root, 1) \ + x(prio_ptrs, 2) \ + x(blacklist, 3) \ + x(blacklist_v2, 4) \ + x(usage, 5) \ + x(data_usage, 6) \ + x(clock, 7) \ + x(dev_usage, 8) \ + x(log, 9) \ + x(overwrite, 10) + +enum { +#define x(f, nr) BCH_JSET_ENTRY_##f = nr, + BCH_JSET_ENTRY_TYPES() +#undef x + BCH_JSET_ENTRY_NR +}; + +/* + * Journal sequence numbers can be blacklisted: bsets record the max sequence + * number of all the journal entries they contain updates for, so that on + * recovery we can ignore those bsets that contain index updates newer that what + * made it into the journal. + * + * This means that we can't reuse that journal_seq - we have to skip it, and + * then record that we skipped it so that the next time we crash and recover we + * don't think there was a missing journal entry. 
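+ *
+ * A simplified example: the btree contains bsets stamped with journal
+ * sequence number 101, but we crashed before journal entry 101 made it to
+ * disk, so the newest entry recovery finds is 100. Those bsets have to be
+ * ignored, and we can't later write a different entry 101 either - so 101 is
+ * blacklisted, and the next recovery knows it was skipped deliberately
+ * rather than lost.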
+ */ +struct jset_entry_blacklist { + struct jset_entry entry; + __le64 seq; +}; + +struct jset_entry_blacklist_v2 { + struct jset_entry entry; + __le64 start; + __le64 end; +}; + +#define BCH_FS_USAGE_TYPES() \ + x(reserved, 0) \ + x(inodes, 1) \ + x(key_version, 2) + +enum { +#define x(f, nr) BCH_FS_USAGE_##f = nr, + BCH_FS_USAGE_TYPES() +#undef x + BCH_FS_USAGE_NR +}; + +struct jset_entry_usage { + struct jset_entry entry; + __le64 v; +} __packed; + +struct jset_entry_data_usage { + struct jset_entry entry; + __le64 v; + struct bch_replicas_entry r; +} __packed; + +struct jset_entry_clock { + struct jset_entry entry; + __u8 rw; + __u8 pad[7]; + __le64 time; +} __packed; + +struct jset_entry_dev_usage_type { + __le64 buckets; + __le64 sectors; + __le64 fragmented; +} __packed; + +struct jset_entry_dev_usage { + struct jset_entry entry; + __le32 dev; + __u32 pad; + + __le64 buckets_ec; + __le64 _buckets_unavailable; /* No longer used */ + + struct jset_entry_dev_usage_type d[]; +} __packed; + +static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage *u) +{ + return (vstruct_bytes(&u->entry) - sizeof(struct jset_entry_dev_usage)) / + sizeof(struct jset_entry_dev_usage_type); +} + +struct jset_entry_log { + struct jset_entry entry; + u8 d[]; +} __packed; + +/* + * On disk format for a journal entry: + * seq is monotonically increasing; every journal entry has its own unique + * sequence number. + * + * last_seq is the oldest journal entry that still has keys the btree hasn't + * flushed to disk yet. + * + * version is for on disk format changes. + */ +struct jset { + struct bch_csum csum; + + __le64 magic; + __le64 seq; + __le32 version; + __le32 flags; + + __le32 u64s; /* size of d[] in u64s */ + + __u8 encrypted_start[0]; + + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; + + /* Sequence number of oldest dirty journal entry */ + __le64 last_seq; + + + union { + struct jset_entry start[0]; + __u64 _data[0]; + }; +} __packed __aligned(8); + +LE32_BITMASK(JSET_CSUM_TYPE, struct jset, flags, 0, 4); +LE32_BITMASK(JSET_BIG_ENDIAN, struct jset, flags, 4, 5); +LE32_BITMASK(JSET_NO_FLUSH, struct jset, flags, 5, 6); + +#define BCH_JOURNAL_BUCKETS_MIN 8 + +/* Btree: */ + +#define BCH_BTREE_IDS() \ + x(extents, 0) \ + x(inodes, 1) \ + x(dirents, 2) \ + x(xattrs, 3) \ + x(alloc, 4) \ + x(quotas, 5) \ + x(stripes, 6) \ + x(reflink, 7) \ + x(subvolumes, 8) \ + x(snapshots, 9) \ + x(lru, 10) \ + x(freespace, 11) \ + x(need_discard, 12) \ + x(backpointers, 13) \ + x(bucket_gens, 14) \ + x(snapshot_trees, 15) + +enum btree_id { +#define x(kwd, val) BTREE_ID_##kwd = val, + BCH_BTREE_IDS() +#undef x + BTREE_ID_NR +}; + +#define BTREE_MAX_DEPTH 4U + +/* Btree nodes */ + +/* + * Btree nodes + * + * On disk a btree node is a list/log of these; within each set the keys are + * sorted + */ +struct bset { + __le64 seq; + + /* + * Highest journal entry this bset contains keys for. + * If on recovery we don't see that journal entry, this bset is ignored: + * this allows us to preserve the order of all index updates after a + * crash, since the journal records a total order of all index updates + * and anything that didn't make it to the journal doesn't get used. 
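+ *
+ * (E.g. a bset stamped with journal_seq 101 is dropped on recovery if entry
+ * 101 never actually made it into the journal.)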
+ */ + __le64 journal_seq; + + __le32 flags; + __le16 version; + __le16 u64s; /* count of d[] in u64s */ + + union { + struct bkey_packed start[0]; + __u64 _data[0]; + }; +} __packed __aligned(8); + +LE32_BITMASK(BSET_CSUM_TYPE, struct bset, flags, 0, 4); + +LE32_BITMASK(BSET_BIG_ENDIAN, struct bset, flags, 4, 5); +LE32_BITMASK(BSET_SEPARATE_WHITEOUTS, + struct bset, flags, 5, 6); + +/* Sector offset within the btree node: */ +LE32_BITMASK(BSET_OFFSET, struct bset, flags, 16, 32); + +struct btree_node { + struct bch_csum csum; + __le64 magic; + + /* this flags field is encrypted, unlike bset->flags: */ + __le64 flags; + + /* Closed interval: */ + struct bpos min_key; + struct bpos max_key; + struct bch_extent_ptr _ptr; /* not used anymore */ + struct bkey_format format; + + union { + struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + + }; + }; +} __packed __aligned(8); + +LE64_BITMASK(BTREE_NODE_ID_LO, struct btree_node, flags, 0, 4); +LE64_BITMASK(BTREE_NODE_LEVEL, struct btree_node, flags, 4, 8); +LE64_BITMASK(BTREE_NODE_NEW_EXTENT_OVERWRITE, + struct btree_node, flags, 8, 9); +LE64_BITMASK(BTREE_NODE_ID_HI, struct btree_node, flags, 9, 25); +/* 25-32 unused */ +LE64_BITMASK(BTREE_NODE_SEQ, struct btree_node, flags, 32, 64); + +static inline __u64 BTREE_NODE_ID(struct btree_node *n) +{ + return BTREE_NODE_ID_LO(n) | (BTREE_NODE_ID_HI(n) << 4); +} + +static inline void SET_BTREE_NODE_ID(struct btree_node *n, __u64 v) +{ + SET_BTREE_NODE_ID_LO(n, v); + SET_BTREE_NODE_ID_HI(n, v >> 4); +} + +struct btree_node_entry { + struct bch_csum csum; + + union { + struct bset keys; + struct { + __u8 pad[22]; + __le16 u64s; + __u64 _data[0]; + }; + }; +} __packed __aligned(8); + +#endif /* _BCACHEFS_FORMAT_H */ diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h new file mode 100644 index 000000000..f05881f7e --- /dev/null +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -0,0 +1,368 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IOCTL_H +#define _BCACHEFS_IOCTL_H + +#include +#include +#include "bcachefs_format.h" + +/* + * Flags common to multiple ioctls: + */ +#define BCH_FORCE_IF_DATA_LOST (1 << 0) +#define BCH_FORCE_IF_METADATA_LOST (1 << 1) +#define BCH_FORCE_IF_DATA_DEGRADED (1 << 2) +#define BCH_FORCE_IF_METADATA_DEGRADED (1 << 3) + +#define BCH_FORCE_IF_LOST \ + (BCH_FORCE_IF_DATA_LOST| \ + BCH_FORCE_IF_METADATA_LOST) +#define BCH_FORCE_IF_DEGRADED \ + (BCH_FORCE_IF_DATA_DEGRADED| \ + BCH_FORCE_IF_METADATA_DEGRADED) + +/* + * If cleared, ioctl that refer to a device pass it as a pointer to a pathname + * (e.g. 
/dev/sda1); if set, the dev field is the device's index within the + * filesystem: + */ +#define BCH_BY_INDEX (1 << 4) + +/* + * For BCH_IOCTL_READ_SUPER: get superblock of a specific device, not filesystem + * wide superblock: + */ +#define BCH_READ_DEV (1 << 5) + +/* global control dev: */ + +/* These are currently broken, and probably unnecessary: */ +#if 0 +#define BCH_IOCTL_ASSEMBLE _IOW(0xbc, 1, struct bch_ioctl_assemble) +#define BCH_IOCTL_INCREMENTAL _IOW(0xbc, 2, struct bch_ioctl_incremental) + +struct bch_ioctl_assemble { + __u32 flags; + __u32 nr_devs; + __u64 pad; + __u64 devs[]; +}; + +struct bch_ioctl_incremental { + __u32 flags; + __u64 pad; + __u64 dev; +}; +#endif + +/* filesystem ioctls: */ + +#define BCH_IOCTL_QUERY_UUID _IOR(0xbc, 1, struct bch_ioctl_query_uuid) + +/* These only make sense when we also have incremental assembly */ +#if 0 +#define BCH_IOCTL_START _IOW(0xbc, 2, struct bch_ioctl_start) +#define BCH_IOCTL_STOP _IO(0xbc, 3) +#endif + +#define BCH_IOCTL_DISK_ADD _IOW(0xbc, 4, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_REMOVE _IOW(0xbc, 5, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_ONLINE _IOW(0xbc, 6, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_OFFLINE _IOW(0xbc, 7, struct bch_ioctl_disk) +#define BCH_IOCTL_DISK_SET_STATE _IOW(0xbc, 8, struct bch_ioctl_disk_set_state) +#define BCH_IOCTL_DATA _IOW(0xbc, 10, struct bch_ioctl_data) +#define BCH_IOCTL_FS_USAGE _IOWR(0xbc, 11, struct bch_ioctl_fs_usage) +#define BCH_IOCTL_DEV_USAGE _IOWR(0xbc, 11, struct bch_ioctl_dev_usage) +#define BCH_IOCTL_READ_SUPER _IOW(0xbc, 12, struct bch_ioctl_read_super) +#define BCH_IOCTL_DISK_GET_IDX _IOW(0xbc, 13, struct bch_ioctl_disk_get_idx) +#define BCH_IOCTL_DISK_RESIZE _IOW(0xbc, 14, struct bch_ioctl_disk_resize) +#define BCH_IOCTL_DISK_RESIZE_JOURNAL _IOW(0xbc,15, struct bch_ioctl_disk_resize_journal) + +#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume) +#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume) + +/* ioctl below act on a particular file, not the filesystem as a whole: */ + +#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *) + +/* + * BCH_IOCTL_QUERY_UUID: get filesystem UUID + * + * Returns user visible UUID, not internal UUID (which may not ever be changed); + * the filesystem's sysfs directory may be found under /sys/fs/bcachefs with + * this UUID. + */ +struct bch_ioctl_query_uuid { + __uuid_t uuid; +}; + +#if 0 +struct bch_ioctl_start { + __u32 flags; + __u32 pad; +}; +#endif + +/* + * BCH_IOCTL_DISK_ADD: add a new device to an existing filesystem + * + * The specified device must not be open or in use. On success, the new device + * will be an online member of the filesystem just like any other member. + * + * The device must first be prepared by userspace by formatting with a bcachefs + * superblock, which is only used for passing in superblock options/parameters + * for that device (in struct bch_member). The new device's superblock should + * not claim to be a member of any existing filesystem - UUIDs on it will be + * ignored. + */ + +/* + * BCH_IOCTL_DISK_REMOVE: permanently remove a member device from a filesystem + * + * Any data present on @dev will be permanently deleted, and @dev will be + * removed from its slot in the filesystem's list of member devices. The device + * may be either offline or offline. 
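+ *
+ * A rough sketch of the call from userspace (fs_fd standing in for however
+ * the caller obtained a descriptor for the filesystem, and member index 2
+ * made up for the example):
+ *
+ *        struct bch_ioctl_disk d = {
+ *                .flags        = BCH_BY_INDEX|BCH_FORCE_IF_DEGRADED,
+ *                .dev        = 2,
+ *        };
+ *
+ *        ioctl(fs_fd, BCH_IOCTL_DISK_REMOVE, &d);
+ *
+ * BCH_FORCE_IF_DEGRADED asks the removal to proceed even if it leaves data
+ * degraded (see below); without BCH_BY_INDEX, .dev would instead carry a
+ * pointer to a pathname cast to a u64, as described above.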
+ *
+ * Will fail if removing @dev would leave us with insufficient read/write
+ * devices or degraded/unavailable data, unless the appropriate BCH_FORCE_IF_*
+ * flags are set.
+ */
+
+/*
+ * BCH_IOCTL_DISK_ONLINE: given a disk that is already a member of a filesystem
+ * but is not open (e.g. because we started in degraded mode), bring it online
+ *
+ * all existing data on @dev will be available once the device is online,
+ * exactly as if @dev was present when the filesystem was first mounted
+ */
+
+/*
+ * BCH_IOCTL_DISK_OFFLINE: offline a disk, causing the kernel to close that
+ * block device, without removing it from the filesystem (so it can be brought
+ * back online later)
+ *
+ * Data present on @dev will be unavailable while @dev is offline (unless
+ * replicated), but will still be intact and untouched if @dev is brought back
+ * online
+ *
+ * Will fail (similarly to BCH_IOCTL_DISK_SET_STATE) if offlining @dev would
+ * leave us with insufficient read/write devices or degraded/unavailable data,
+ * unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+
+struct bch_ioctl_disk {
+	__u32			flags;
+	__u32			pad;
+	__u64			dev;
+};
+
+/*
+ * BCH_IOCTL_DISK_SET_STATE: modify state of a member device of a filesystem
+ *
+ * @new_state		- one of the bch_member_state states (rw, ro, failed,
+ *			  spare)
+ *
+ * Will refuse to change member state if we would then have insufficient devices
+ * to write to, or if it would result in degraded data (when @new_state is
+ * failed or spare) unless the appropriate BCH_FORCE_IF_* flags are set.
+ */
+struct bch_ioctl_disk_set_state {
+	__u32			flags;
+	__u8			new_state;
+	__u8			pad[3];
+	__u64			dev;
+};
+
+enum bch_data_ops {
+	BCH_DATA_OP_SCRUB	= 0,
+	BCH_DATA_OP_REREPLICATE	= 1,
+	BCH_DATA_OP_MIGRATE	= 2,
+	BCH_DATA_OP_REWRITE_OLD_NODES = 3,
+	BCH_DATA_OP_NR		= 4,
+};
+
+/*
+ * BCH_IOCTL_DATA: operations that walk and manipulate filesystem data (e.g.
+ * scrub, rereplicate, migrate).
+ *
+ * This ioctl kicks off a job in the background, and returns a file descriptor.
+ * Reading from the file descriptor returns a struct bch_ioctl_data_event,
+ * indicating current progress, and closing the file descriptor will stop the
+ * job. The file descriptor is O_CLOEXEC.
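+ *
+ * A rough caller-side sketch (hedged; fs_fd and the progress loop are
+ * illustrative, not defined by this header):
+ *
+ *	struct bch_ioctl_data cmd = {
+ *		.op		= BCH_DATA_OP_REREPLICATE,
+ *		.end_btree	= BTREE_ID_NR,
+ *		.start_pos	= POS_MIN,
+ *		.end_pos	= POS_MAX,
+ *	};
+ *	int job = ioctl(fs_fd, BCH_IOCTL_DATA, &cmd);
+ *	struct bch_ioctl_data_event ev;
+ *
+ *	while (read(job, &ev, sizeof(ev)) == sizeof(ev) &&
+ *	       ev.p.sectors_done < ev.p.sectors_total)
+ *		;			// report progress, sleep, retry
+ *	close(job);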
+ */ +struct bch_ioctl_data { + __u16 op; + __u8 start_btree; + __u8 end_btree; + __u32 flags; + + struct bpos start_pos; + struct bpos end_pos; + + union { + struct { + __u32 dev; + __u32 pad; + } migrate; + struct { + __u64 pad[8]; + }; + }; +} __packed __aligned(8); + +enum bch_data_event { + BCH_DATA_EVENT_PROGRESS = 0, + /* XXX: add an event for reporting errors */ + BCH_DATA_EVENT_NR = 1, +}; + +struct bch_ioctl_data_progress { + __u8 data_type; + __u8 btree_id; + __u8 pad[2]; + struct bpos pos; + + __u64 sectors_done; + __u64 sectors_total; +} __packed __aligned(8); + +struct bch_ioctl_data_event { + __u8 type; + __u8 pad[7]; + union { + struct bch_ioctl_data_progress p; + __u64 pad2[15]; + }; +} __packed __aligned(8); + +struct bch_replicas_usage { + __u64 sectors; + struct bch_replicas_entry r; +} __packed; + +static inline struct bch_replicas_usage * +replicas_usage_next(struct bch_replicas_usage *u) +{ + return (void *) u + replicas_entry_bytes(&u->r) + 8; +} + +/* + * BCH_IOCTL_FS_USAGE: query filesystem disk space usage + * + * Returns disk space usage broken out by data type, number of replicas, and + * by component device + * + * @replica_entries_bytes - size, in bytes, allocated for replica usage entries + * + * On success, @replica_entries_bytes will be changed to indicate the number of + * bytes actually used. + * + * Returns -ERANGE if @replica_entries_bytes was too small + */ +struct bch_ioctl_fs_usage { + __u64 capacity; + __u64 used; + __u64 online_reserved; + __u64 persistent_reserved[BCH_REPLICAS_MAX]; + + __u32 replica_entries_bytes; + __u32 pad; + + struct bch_replicas_usage replicas[0]; +}; + +/* + * BCH_IOCTL_DEV_USAGE: query device disk space usage + * + * Returns disk space usage broken out by data type - both by buckets and + * sectors. 
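+ *
+ * A hedged example of a query (fs_fd and dev_idx are illustrative, not part
+ * of this interface):
+ *
+ *	struct bch_ioctl_dev_usage u = {
+ *		.dev	= dev_idx,
+ *		.flags	= BCH_BY_INDEX,
+ *	};
+ *	ioctl(fs_fd, BCH_IOCTL_DEV_USAGE, &u);
+ *
+ * after which, for example, u.d[BCH_DATA_user].sectors gives the sectors of
+ * user data held on that device.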
+ */ +struct bch_ioctl_dev_usage { + __u64 dev; + __u32 flags; + __u8 state; + __u8 pad[7]; + + __u32 bucket_size; + __u64 nr_buckets; + + __u64 buckets_ec; + + struct bch_ioctl_dev_usage_type { + __u64 buckets; + __u64 sectors; + __u64 fragmented; + } d[BCH_DATA_NR]; +}; + +/* + * BCH_IOCTL_READ_SUPER: read filesystem superblock + * + * Equivalent to reading the superblock directly from the block device, except + * avoids racing with the kernel writing the superblock or having to figure out + * which block device to read + * + * @sb - buffer to read into + * @size - size of userspace allocated buffer + * @dev - device to read superblock for, if BCH_READ_DEV flag is + * specified + * + * Returns -ERANGE if buffer provided is too small + */ +struct bch_ioctl_read_super { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 size; + __u64 sb; +}; + +/* + * BCH_IOCTL_DISK_GET_IDX: give a path to a block device, query filesystem to + * determine if disk is a (online) member - if so, returns device's index + * + * Returns -ENOENT if not found + */ +struct bch_ioctl_disk_get_idx { + __u64 dev; +}; + +/* + * BCH_IOCTL_DISK_RESIZE: resize filesystem on a device + * + * @dev - member to resize + * @nbuckets - new number of buckets + */ +struct bch_ioctl_disk_resize { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 nbuckets; +}; + +/* + * BCH_IOCTL_DISK_RESIZE_JOURNAL: resize journal on a device + * + * @dev - member to resize + * @nbuckets - new number of buckets + */ +struct bch_ioctl_disk_resize_journal { + __u32 flags; + __u32 pad; + __u64 dev; + __u64 nbuckets; +}; + +struct bch_ioctl_subvolume { + __u32 flags; + __u32 dirfd; + __u16 mode; + __u16 pad[3]; + __u64 dst_ptr; + __u64 src_ptr; +}; + +#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0) +#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1) + +#endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c new file mode 100644 index 000000000..ee7ba700e --- /dev/null +++ b/fs/bcachefs/bkey.c @@ -0,0 +1,1063 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey.h" +#include "bkey_cmp.h" +#include "bkey_methods.h" +#include "bset.h" +#include "util.h" + +#undef EBUG_ON + +#ifdef DEBUG_BKEYS +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) +#endif + +const struct bkey_format bch2_bkey_format_current = BKEY_FORMAT_CURRENT; + +void bch2_bkey_packed_to_binary_text(struct printbuf *out, + const struct bkey_format *f, + const struct bkey_packed *k) +{ + const u64 *p = high_word(f, k); + unsigned word_bits = 64 - high_bit_offset; + unsigned nr_key_bits = bkey_format_key_bits(f) + high_bit_offset; + u64 v = *p & (~0ULL >> high_bit_offset); + + if (!nr_key_bits) { + prt_str(out, "(empty)"); + return; + } + + while (1) { + unsigned next_key_bits = nr_key_bits; + + if (nr_key_bits < 64) { + v >>= 64 - nr_key_bits; + next_key_bits = 0; + } else { + next_key_bits -= 64; + } + + bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits)); + + if (!next_key_bits) + break; + + prt_char(out, ' '); + + p = next_word(p); + v = *p; + word_bits = 64; + nr_key_bits = next_key_bits; + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +static void bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) +{ + struct bkey tmp; + + BUG_ON(bkeyp_val_u64s(format, packed) != + bkey_val_u64s(unpacked)); + + BUG_ON(packed->u64s < bkeyp_key_u64s(format, packed)); + + tmp = __bch2_bkey_unpack_key(format, packed); + + if (memcmp(&tmp, unpacked, sizeof(struct bkey))) { + struct 
printbuf buf = PRINTBUF; + + prt_printf(&buf, "keys differ: format u64s %u fields %u %u %u %u %u\n", + format->key_u64s, + format->bits_per_field[0], + format->bits_per_field[1], + format->bits_per_field[2], + format->bits_per_field[3], + format->bits_per_field[4]); + + prt_printf(&buf, "compiled unpack: "); + bch2_bkey_to_text(&buf, unpacked); + prt_newline(&buf); + + prt_printf(&buf, "c unpack: "); + bch2_bkey_to_text(&buf, &tmp); + prt_newline(&buf); + + prt_printf(&buf, "compiled unpack: "); + bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, + (struct bkey_packed *) unpacked); + prt_newline(&buf); + + prt_printf(&buf, "c unpack: "); + bch2_bkey_packed_to_binary_text(&buf, &bch2_bkey_format_current, + (struct bkey_packed *) &tmp); + prt_newline(&buf); + + panic("%s", buf.buf); + } +} + +#else +static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) {} +#endif + +struct pack_state { + const struct bkey_format *format; + unsigned bits; /* bits remaining in current word */ + u64 w; /* current word */ + u64 *p; /* pointer to next word */ +}; + +__always_inline +static struct pack_state pack_state_init(const struct bkey_format *format, + struct bkey_packed *k) +{ + u64 *p = high_word(format, k); + + return (struct pack_state) { + .format = format, + .bits = 64 - high_bit_offset, + .w = 0, + .p = p, + }; +} + +__always_inline +static void pack_state_finish(struct pack_state *state, + struct bkey_packed *k) +{ + EBUG_ON(state->p < k->_data); + EBUG_ON(state->p >= k->_data + state->format->key_u64s); + + *state->p = state->w; +} + +struct unpack_state { + const struct bkey_format *format; + unsigned bits; /* bits remaining in current word */ + u64 w; /* current word */ + const u64 *p; /* pointer to next word */ +}; + +__always_inline +static struct unpack_state unpack_state_init(const struct bkey_format *format, + const struct bkey_packed *k) +{ + const u64 *p = high_word(format, k); + + return (struct unpack_state) { + .format = format, + .bits = 64 - high_bit_offset, + .w = *p << high_bit_offset, + .p = p, + }; +} + +__always_inline +static u64 get_inc_field(struct unpack_state *state, unsigned field) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 v = 0, offset = le64_to_cpu(state->format->field_offset[field]); + + if (bits >= state->bits) { + v = state->w >> (64 - bits); + bits -= state->bits; + + state->p = next_word(state->p); + state->w = *state->p; + state->bits = 64; + } + + /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ + v |= (state->w >> 1) >> (63 - bits); + state->w <<= bits; + state->bits -= bits; + + return v + offset; +} + +__always_inline +static bool set_inc_field(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 offset = le64_to_cpu(state->format->field_offset[field]); + + if (v < offset) + return false; + + v -= offset; + + if (fls64(v) > bits) + return false; + + if (bits > state->bits) { + bits -= state->bits; + /* avoid shift by 64 if bits is 0 - bits is never 64 here: */ + state->w |= (v >> 1) >> (bits - 1); + + *state->p = state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + + return true; +} + +/* + * Note: does NOT set out->format (we don't know what it should be here!) 
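+ *
+ * (A hedged summary of the loop below: each field is re-encoded as its value
+ * minus the destination format's field_offset, stored in that format's
+ * bits_per_field bits; if a field's value is below the new offset, or needs
+ * more bits than the new format provides, set_inc_field() refuses it and the
+ * transform as a whole fails.)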
+ * + * Also: doesn't work on extents - it doesn't preserve the invariant that + * if k is packed bkey_start_pos(k) will successfully pack + */ +static bool bch2_bkey_transform_key(const struct bkey_format *out_f, + struct bkey_packed *out, + const struct bkey_format *in_f, + const struct bkey_packed *in) +{ + struct pack_state out_s = pack_state_init(out_f, out); + struct unpack_state in_s = unpack_state_init(in_f, in); + u64 *w = out->_data; + unsigned i; + + *w = 0; + + for (i = 0; i < BKEY_NR_FIELDS; i++) + if (!set_inc_field(&out_s, i, get_inc_field(&in_s, i))) + return false; + + /* Can't happen because the val would be too big to unpack: */ + EBUG_ON(in->u64s - in_f->key_u64s + out_f->key_u64s > U8_MAX); + + pack_state_finish(&out_s, out); + out->u64s = out_f->key_u64s + in->u64s - in_f->key_u64s; + out->needs_whiteout = in->needs_whiteout; + out->type = in->type; + + return true; +} + +bool bch2_bkey_transform(const struct bkey_format *out_f, + struct bkey_packed *out, + const struct bkey_format *in_f, + const struct bkey_packed *in) +{ + if (!bch2_bkey_transform_key(out_f, out, in_f, in)) + return false; + + memcpy_u64s((u64 *) out + out_f->key_u64s, + (u64 *) in + in_f->key_u64s, + (in->u64s - in_f->key_u64s)); + return true; +} + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *format, + const struct bkey_packed *in) +{ + struct unpack_state state = unpack_state_init(format, in); + struct bkey out; + + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->u64s < format->key_u64s); + EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); + EBUG_ON(in->u64s - format->key_u64s + BKEY_U64s > U8_MAX); + + out.u64s = BKEY_U64s + in->u64s - format->key_u64s; + out.format = KEY_FORMAT_CURRENT; + out.needs_whiteout = in->needs_whiteout; + out.type = in->type; + out.pad[0] = 0; + +#define x(id, field) out.field = get_inc_field(&state, id); + bkey_fields() +#undef x + + return out; +} + +#ifndef HAVE_BCACHEFS_COMPILED_UNPACK +struct bpos __bkey_unpack_pos(const struct bkey_format *format, + const struct bkey_packed *in) +{ + struct unpack_state state = unpack_state_init(format, in); + struct bpos out; + + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->u64s < format->key_u64s); + EBUG_ON(in->format != KEY_FORMAT_LOCAL_BTREE); + + out.inode = get_inc_field(&state, BKEY_FIELD_INODE); + out.offset = get_inc_field(&state, BKEY_FIELD_OFFSET); + out.snapshot = get_inc_field(&state, BKEY_FIELD_SNAPSHOT); + + return out; +} +#endif + +/** + * bch2_bkey_pack_key -- pack just the key, not the value + */ +bool bch2_bkey_pack_key(struct bkey_packed *out, const struct bkey *in, + const struct bkey_format *format) +{ + struct pack_state state = pack_state_init(format, out); + u64 *w = out->_data; + + EBUG_ON((void *) in == (void *) out); + EBUG_ON(format->nr_fields != BKEY_NR_FIELDS); + EBUG_ON(in->format != KEY_FORMAT_CURRENT); + + *w = 0; + +#define x(id, field) if (!set_inc_field(&state, id, in->field)) return false; + bkey_fields() +#undef x + pack_state_finish(&state, out); + out->u64s = format->key_u64s + in->u64s - BKEY_U64s; + out->format = KEY_FORMAT_LOCAL_BTREE; + out->needs_whiteout = in->needs_whiteout; + out->type = in->type; + + bch2_bkey_pack_verify(out, in, format); + return true; +} + +/** + * bch2_bkey_unpack -- unpack the key and the value + */ +void bch2_bkey_unpack(const struct btree *b, struct bkey_i *dst, + const struct bkey_packed *src) +{ + __bkey_unpack_key(b, &dst->k, src); + + memcpy_u64s(&dst->v, + bkeyp_val(&b->format, src), + 
bkeyp_val_u64s(&b->format, src)); +} + +/** + * bch2_bkey_pack -- pack the key and the value + */ +bool bch2_bkey_pack(struct bkey_packed *out, const struct bkey_i *in, + const struct bkey_format *format) +{ + struct bkey_packed tmp; + + if (!bch2_bkey_pack_key(&tmp, &in->k, format)) + return false; + + memmove_u64s((u64 *) out + format->key_u64s, + &in->v, + bkey_val_u64s(&in->k)); + memcpy_u64s_small(out, &tmp, format->key_u64s); + + return true; +} + +__always_inline +static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) +{ + unsigned bits = state->format->bits_per_field[field]; + u64 offset = le64_to_cpu(state->format->field_offset[field]); + bool ret = true; + + EBUG_ON(v < offset); + v -= offset; + + if (fls64(v) > bits) { + v = ~(~0ULL << bits); + ret = false; + } + + if (bits > state->bits) { + bits -= state->bits; + state->w |= (v >> 1) >> (bits - 1); + + *state->p = state->w; + state->p = next_word(state->p); + state->w = 0; + state->bits = 64; + } + + state->bits -= bits; + state->w |= v << state->bits; + + return ret; +} + +#ifdef CONFIG_BCACHEFS_DEBUG +static bool bkey_packed_successor(struct bkey_packed *out, + const struct btree *b, + struct bkey_packed k) +{ + const struct bkey_format *f = &b->format; + unsigned nr_key_bits = b->nr_key_bits; + unsigned first_bit, offset; + u64 *p; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + if (!nr_key_bits) + return false; + + *out = k; + + first_bit = high_bit_offset + nr_key_bits - 1; + p = nth_word(high_word(f, out), first_bit >> 6); + offset = 63 - (first_bit & 63); + + while (nr_key_bits) { + unsigned bits = min(64 - offset, nr_key_bits); + u64 mask = (~0ULL >> (64 - bits)) << offset; + + if ((*p & mask) != mask) { + *p += 1ULL << offset; + EBUG_ON(bch2_bkey_cmp_packed(b, out, &k) <= 0); + return true; + } + + *p &= ~mask; + p = prev_word(p); + nr_key_bits -= bits; + offset = 0; + } + + return false; +} +#endif + +/* + * Returns a packed key that compares <= in + * + * This is used in bset_search_tree(), where we need a packed pos in order to be + * able to compare against the keys in the auxiliary search tree - and it's + * legal to use a packed pos that isn't equivalent to the original pos, + * _provided_ it compares <= to the original pos. 
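+ *
+ * A hedged sketch of how a caller might treat the result (variable names are
+ * illustrative):
+ *
+ *	switch (bch2_bkey_pack_pos_lossy(&packed, search, b)) {
+ *	case BKEY_PACK_POS_EXACT:	// packed position == search
+ *	case BKEY_PACK_POS_SMALLER:	// packed position < search, still a
+ *					// valid lower bound for comparisons
+ *		... compare against packed keys directly ...
+ *		break;
+ *	case BKEY_PACK_POS_FAIL:	// fall back to unpacking before comparing
+ *		break;
+ *	}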
+ */ +enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, + struct bpos in, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + struct pack_state state = pack_state_init(f, out); + u64 *w = out->_data; +#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos orig = in; +#endif + bool exact = true; + unsigned i; + + /* + * bch2_bkey_pack_key() will write to all of f->key_u64s, minus the 3 + * byte header, but pack_pos() won't if the len/version fields are big + * enough - we need to make sure to zero them out: + */ + for (i = 0; i < f->key_u64s; i++) + w[i] = 0; + + if (unlikely(in.snapshot < + le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]))) { + if (!in.offset-- && + !in.inode--) + return BKEY_PACK_POS_FAIL; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(in.offset < + le64_to_cpu(f->field_offset[BKEY_FIELD_OFFSET]))) { + if (!in.inode--) + return BKEY_PACK_POS_FAIL; + in.offset = KEY_OFFSET_MAX; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(in.inode < + le64_to_cpu(f->field_offset[BKEY_FIELD_INODE]))) + return BKEY_PACK_POS_FAIL; + + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_INODE, in.inode))) { + in.offset = KEY_OFFSET_MAX; + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_OFFSET, in.offset))) { + in.snapshot = KEY_SNAPSHOT_MAX; + exact = false; + } + + if (unlikely(!set_inc_field_lossy(&state, BKEY_FIELD_SNAPSHOT, in.snapshot))) + exact = false; + + pack_state_finish(&state, out); + out->u64s = f->key_u64s; + out->format = KEY_FORMAT_LOCAL_BTREE; + out->type = KEY_TYPE_deleted; + +#ifdef CONFIG_BCACHEFS_DEBUG + if (exact) { + BUG_ON(bkey_cmp_left_packed(b, out, &orig)); + } else { + struct bkey_packed successor; + + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && + bkey_cmp_left_packed(b, &successor, &orig) < 0); + } +#endif + + return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; +} + +void bch2_bkey_format_init(struct bkey_format_state *s) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(s->field_min); i++) + s->field_min[i] = U64_MAX; + + for (i = 0; i < ARRAY_SIZE(s->field_max); i++) + s->field_max[i] = 0; + + /* Make sure we can store a size of 0: */ + s->field_min[BKEY_FIELD_SIZE] = 0; +} + +void bch2_bkey_format_add_pos(struct bkey_format_state *s, struct bpos p) +{ + unsigned field = 0; + + __bkey_format_add(s, field++, p.inode); + __bkey_format_add(s, field++, p.offset); + __bkey_format_add(s, field++, p.snapshot); +} + +/* + * We don't want it to be possible for the packed format to represent fields + * bigger than a u64... that will cause confusion and issues (like with + * bkey_packed_successor()) + */ +static void set_format_field(struct bkey_format *f, enum bch_bkey_fields i, + unsigned bits, u64 offset) +{ + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + + bits = min(bits, unpacked_bits); + + offset = bits == unpacked_bits ? 
0 : min(offset, unpacked_max - ((1ULL << bits) - 1)); + + f->bits_per_field[i] = bits; + f->field_offset[i] = cpu_to_le64(offset); +} + +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + struct bkey_format ret = { + .nr_fields = BKEY_NR_FIELDS, + }; + + for (i = 0; i < ARRAY_SIZE(s->field_min); i++) { + s->field_min[i] = min(s->field_min[i], s->field_max[i]); + + set_format_field(&ret, i, + fls64(s->field_max[i] - s->field_min[i]), + s->field_min[i]); + + bits += ret.bits_per_field[i]; + } + + /* allow for extent merging: */ + if (ret.bits_per_field[BKEY_FIELD_SIZE]) { + ret.bits_per_field[BKEY_FIELD_SIZE] += 4; + bits += 4; + } + + ret.key_u64s = DIV_ROUND_UP(bits, 64); + + /* if we have enough spare bits, round fields up to nearest byte */ + bits = ret.key_u64s * 64 - bits; + + for (i = 0; i < ARRAY_SIZE(ret.bits_per_field); i++) { + unsigned r = round_up(ret.bits_per_field[i], 8) - + ret.bits_per_field[i]; + + if (r <= bits) { + set_format_field(&ret, i, + ret.bits_per_field[i] + r, + le64_to_cpu(ret.field_offset[i])); + bits -= r; + } + } + + EBUG_ON(bch2_bkey_format_validate(&ret)); + return ret; +} + +const char *bch2_bkey_format_validate(struct bkey_format *f) +{ + unsigned i, bits = KEY_PACKED_BITS_START; + + if (f->nr_fields != BKEY_NR_FIELDS) + return "incorrect number of fields"; + + /* + * Verify that the packed format can't represent fields larger than the + * unpacked format: + */ + for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_max = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 packed_max = f->bits_per_field[i] + ? ~((~0ULL << 1) << (f->bits_per_field[i] - 1)) + : 0; + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (packed_max + field_offset < packed_max || + packed_max + field_offset > unpacked_max) + return "field too large"; + + bits += f->bits_per_field[i]; + } + + if (f->key_u64s != DIV_ROUND_UP(bits, 64)) + return "incorrect key_u64s"; + + return NULL; +} + +/* + * Most significant differing bit + * Bits are indexed from 0 - return is [0, nr_key_bits) + */ +__pure +unsigned bch2_bkey_greatest_differing_bit(const struct btree *b, + const struct bkey_packed *l_k, + const struct bkey_packed *r_k) +{ + const u64 *l = high_word(&b->format, l_k); + const u64 *r = high_word(&b->format, r_k); + unsigned nr_key_bits = b->nr_key_bits; + unsigned word_bits = 64 - high_bit_offset; + u64 l_v, r_v; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); + + /* for big endian, skip past header */ + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (nr_key_bits) { + if (nr_key_bits < word_bits) { + l_v >>= word_bits - nr_key_bits; + r_v >>= word_bits - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= word_bits; + } + + if (l_v != r_v) + return fls64(l_v ^ r_v) - 1 + nr_key_bits; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + word_bits = 64; + } + + return 0; +} + +/* + * First set bit + * Bits are indexed from 0 - return is [0, nr_key_bits) + */ +__pure +unsigned bch2_bkey_ffs(const struct btree *b, const struct bkey_packed *k) +{ + const u64 *p = high_word(&b->format, k); + unsigned nr_key_bits = b->nr_key_bits; + unsigned ret = 0, offset; + + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(&b->format)); + + offset = nr_key_bits; + while (offset > 64) { + p = next_word(p); + offset -= 64; + } + + offset = 64 - offset; + + while (nr_key_bits) { + 
unsigned bits = nr_key_bits + offset < 64 + ? nr_key_bits + : 64 - offset; + + u64 mask = (~0ULL >> (64 - bits)) << offset; + + if (*p & mask) + return ret + __ffs64(*p & mask) - offset; + + p = prev_word(p); + nr_key_bits -= bits; + ret += bits; + offset = 0; + } + + return 0; +} + +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + +#define I(_x) (*(out)++ = (_x)) +#define I1(i0) I(i0) +#define I2(i0, i1) (I1(i0), I(i1)) +#define I3(i0, i1, i2) (I2(i0, i1), I(i2)) +#define I4(i0, i1, i2, i3) (I3(i0, i1, i2), I(i3)) +#define I5(i0, i1, i2, i3, i4) (I4(i0, i1, i2, i3), I(i4)) + +static u8 *compile_bkey_field(const struct bkey_format *format, u8 *out, + enum bch_bkey_fields field, + unsigned dst_offset, unsigned dst_size, + bool *eax_zeroed) +{ + unsigned bits = format->bits_per_field[field]; + u64 offset = le64_to_cpu(format->field_offset[field]); + unsigned i, byte, bit_offset, align, shl, shr; + + if (!bits && !offset) { + if (!*eax_zeroed) { + /* xor eax, eax */ + I2(0x31, 0xc0); + } + + *eax_zeroed = true; + goto set_field; + } + + if (!bits) { + /* just return offset: */ + + switch (dst_size) { + case 8: + if (offset > S32_MAX) { + /* mov [rdi + dst_offset], offset */ + I3(0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + + I3(0xc7, 0x47, dst_offset + 4); + memcpy(out, (void *) &offset + 4, 4); + out += 4; + } else { + /* mov [rdi + dst_offset], offset */ + /* sign extended */ + I4(0x48, 0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + } + break; + case 4: + /* mov [rdi + dst_offset], offset */ + I3(0xc7, 0x47, dst_offset); + memcpy(out, &offset, 4); + out += 4; + break; + default: + BUG(); + } + + return out; + } + + bit_offset = format->key_u64s * 64; + for (i = 0; i <= field; i++) + bit_offset -= format->bits_per_field[i]; + + byte = bit_offset / 8; + bit_offset -= byte * 8; + + *eax_zeroed = false; + + if (bit_offset == 0 && bits == 8) { + /* movzx eax, BYTE PTR [rsi + imm8] */ + I4(0x0f, 0xb6, 0x46, byte); + } else if (bit_offset == 0 && bits == 16) { + /* movzx eax, WORD PTR [rsi + imm8] */ + I4(0x0f, 0xb7, 0x46, byte); + } else if (bit_offset + bits <= 32) { + align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 32); + + /* mov eax, [rsi + imm8] */ + I3(0x8b, 0x46, byte); + + if (bit_offset) { + /* shr eax, imm8 */ + I3(0xc1, 0xe8, bit_offset); + } + + if (bit_offset + bits < 32) { + unsigned mask = ~0U >> (32 - bits); + + /* and eax, imm32 */ + I1(0x25); + memcpy(out, &mask, 4); + out += 4; + } + } else if (bit_offset + bits <= 64) { + align = min(8 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 7); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 64); + + /* mov rax, [rsi + imm8] */ + I4(0x48, 0x8b, 0x46, byte); + + shl = 64 - bit_offset - bits; + shr = bit_offset + shl; + + if (shl) { + /* shl rax, imm8 */ + I4(0x48, 0xc1, 0xe0, shl); + } + + if (shr) { + /* shr rax, imm8 */ + I4(0x48, 0xc1, 0xe8, shr); + } + } else { + align = min(4 - DIV_ROUND_UP(bit_offset + bits, 8), byte & 3); + byte -= align; + bit_offset += align * 8; + + BUG_ON(bit_offset + bits > 96); + + /* mov rax, [rsi + byte] */ + I4(0x48, 0x8b, 0x46, byte); + + /* mov edx, [rsi + byte + 8] */ + I3(0x8b, 0x56, byte + 8); + + /* bits from next word: */ + shr = bit_offset + bits - 64; + BUG_ON(shr > bit_offset); + + /* shr rax, bit_offset */ + I4(0x48, 0xc1, 0xe8, shr); + + /* shl rdx, imm8 */ + I4(0x48, 0xc1, 0xe2, 64 - shr); + + /* or rax, rdx */ + I3(0x48, 0x09, 0xd0); + + shr = 
bit_offset - shr; + + if (shr) { + /* shr rax, imm8 */ + I4(0x48, 0xc1, 0xe8, shr); + } + } + + /* rax += offset: */ + if (offset > S32_MAX) { + /* mov rdx, imm64 */ + I2(0x48, 0xba); + memcpy(out, &offset, 8); + out += 8; + /* add %rdx, %rax */ + I3(0x48, 0x01, 0xd0); + } else if (offset + (~0ULL >> (64 - bits)) > U32_MAX) { + /* add rax, imm32 */ + I2(0x48, 0x05); + memcpy(out, &offset, 4); + out += 4; + } else if (offset) { + /* add eax, imm32 */ + I1(0x05); + memcpy(out, &offset, 4); + out += 4; + } +set_field: + switch (dst_size) { + case 8: + /* mov [rdi + dst_offset], rax */ + I4(0x48, 0x89, 0x47, dst_offset); + break; + case 4: + /* mov [rdi + dst_offset], eax */ + I3(0x89, 0x47, dst_offset); + break; + default: + BUG(); + } + + return out; +} + +int bch2_compile_bkey_format(const struct bkey_format *format, void *_out) +{ + bool eax_zeroed = false; + u8 *out = _out; + + /* + * rdi: dst - unpacked key + * rsi: src - packed key + */ + + /* k->u64s, k->format, k->type */ + + /* mov eax, [rsi] */ + I2(0x8b, 0x06); + + /* add eax, BKEY_U64s - format->key_u64s */ + I5(0x05, BKEY_U64s - format->key_u64s, KEY_FORMAT_CURRENT, 0, 0); + + /* and eax, imm32: mask out k->pad: */ + I5(0x25, 0xff, 0xff, 0xff, 0); + + /* mov [rdi], eax */ + I2(0x89, 0x07); + +#define x(id, field) \ + out = compile_bkey_field(format, out, id, \ + offsetof(struct bkey, field), \ + sizeof(((struct bkey *) NULL)->field), \ + &eax_zeroed); + bkey_fields() +#undef x + + /* retq */ + I1(0xc3); + + return (void *) out - _out; +} + +#else +#endif + +__pure +int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); +} + +__pure __flatten +int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + return bpos_cmp(bkey_unpack_pos_format_checked(b, l), *r); +} + +__pure __flatten +int bch2_bkey_cmp_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed_inlined(b, l, r); +} + +__pure __flatten +int __bch2_bkey_cmp_left_packed(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + const struct bkey *l_unpacked; + + return unlikely(l_unpacked = packed_to_bkey_c(l)) + ? bpos_cmp(l_unpacked->p, *r) + : __bch2_bkey_cmp_left_packed_format_checked(b, l, r); +} + +void bch2_bpos_swab(struct bpos *p) +{ + u8 *l = (u8 *) p; + u8 *h = ((u8 *) &p[1]) - 1; + + while (l < h) { + swap(*l, *h); + l++; + --h; + } +} + +void bch2_bkey_swab_key(const struct bkey_format *_f, struct bkey_packed *k) +{ + const struct bkey_format *f = bkey_packed(k) ? 
_f : &bch2_bkey_format_current; + u8 *l = k->key_start; + u8 *h = (u8 *) (k->_data + f->key_u64s) - 1; + + while (l < h) { + swap(*l, *h); + l++; + --h; + } +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_bkey_pack_test(void) +{ + struct bkey t = KEY(4134ULL, 1250629070527416633ULL, 0); + struct bkey_packed p; + + struct bkey_format test_format = { + .key_u64s = 3, + .nr_fields = BKEY_NR_FIELDS, + .bits_per_field = { + 13, + 64, + 32, + }, + }; + + struct unpack_state in_s = + unpack_state_init(&bch2_bkey_format_current, (void *) &t); + struct pack_state out_s = pack_state_init(&test_format, &p); + unsigned i; + + for (i = 0; i < out_s.format->nr_fields; i++) { + u64 a, v = get_inc_field(&in_s, i); + + switch (i) { +#define x(id, field) case id: a = t.field; break; + bkey_fields() +#undef x + default: + BUG(); + } + + if (a != v) + panic("got %llu actual %llu i %u\n", v, a, i); + + if (!set_inc_field(&out_s, i, v)) + panic("failed at %u\n", i); + } + + BUG_ON(!bch2_bkey_pack_key(&p, &t, &test_format)); +} +#endif diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h new file mode 100644 index 000000000..e81fb3e00 --- /dev/null +++ b/fs/bcachefs/bkey.h @@ -0,0 +1,774 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_H +#define _BCACHEFS_BKEY_H + +#include +#include "bcachefs_format.h" + +#include "btree_types.h" +#include "util.h" +#include "vstructs.h" + +#if 0 + +/* + * compiled unpack functions are disabled, pending a new interface for + * dynamically allocating executable memory: + */ + +#ifdef CONFIG_X86_64 +#define HAVE_BCACHEFS_COMPILED_UNPACK 1 +#endif +#endif + +void bch2_bkey_packed_to_binary_text(struct printbuf *, + const struct bkey_format *, + const struct bkey_packed *); + +/* bkey with split value, const */ +struct bkey_s_c { + const struct bkey *k; + const struct bch_val *v; +}; + +/* bkey with split value */ +struct bkey_s { + union { + struct { + struct bkey *k; + struct bch_val *v; + }; + struct bkey_s_c s_c; + }; +}; + +#define bkey_p_next(_k) vstruct_next(_k) + +static inline struct bkey_i *bkey_next(struct bkey_i *k) +{ + return (struct bkey_i *) (k->_data + k->k.u64s); +} + +#define bkey_val_u64s(_k) ((_k)->u64s - BKEY_U64s) + +static inline size_t bkey_val_bytes(const struct bkey *k) +{ + return bkey_val_u64s(k) * sizeof(u64); +} + +static inline void set_bkey_val_u64s(struct bkey *k, unsigned val_u64s) +{ + unsigned u64s = BKEY_U64s + val_u64s; + + BUG_ON(u64s > U8_MAX); + k->u64s = u64s; +} + +static inline void set_bkey_val_bytes(struct bkey *k, unsigned bytes) +{ + set_bkey_val_u64s(k, DIV_ROUND_UP(bytes, sizeof(u64))); +} + +#define bkey_val_end(_k) ((void *) (((u64 *) (_k).v) + bkey_val_u64s((_k).k))) + +#define bkey_deleted(_k) ((_k)->type == KEY_TYPE_deleted) + +#define bkey_whiteout(_k) \ + ((_k)->type == KEY_TYPE_deleted || (_k)->type == KEY_TYPE_whiteout) + +enum bkey_lr_packed { + BKEY_PACKED_BOTH, + BKEY_PACKED_RIGHT, + BKEY_PACKED_LEFT, + BKEY_PACKED_NONE, +}; + +#define bkey_lr_packed(_l, _r) \ + ((_l)->format + ((_r)->format << 1)) + +#define bkey_copy(_dst, _src) \ +do { \ + BUILD_BUG_ON(!type_is(_dst, struct bkey_i *) && \ + !type_is(_dst, struct bkey_packed *)); \ + BUILD_BUG_ON(!type_is(_src, struct bkey_i *) && \ + !type_is(_src, struct bkey_packed *)); \ + EBUG_ON((u64 *) (_dst) > (u64 *) (_src) && \ + (u64 *) (_dst) < (u64 *) (_src) + \ + ((struct bkey *) (_src))->u64s); \ + \ + memcpy_u64s_small((_dst), (_src), \ + ((struct bkey *) (_src))->u64s); \ +} while (0) + +struct btree; + +__pure +unsigned 
bch2_bkey_greatest_differing_bit(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); +__pure +unsigned bch2_bkey_ffs(const struct btree *, const struct bkey_packed *); + +__pure +int __bch2_bkey_cmp_packed_format_checked(const struct bkey_packed *, + const struct bkey_packed *, + const struct btree *); + +__pure +int __bch2_bkey_cmp_left_packed_format_checked(const struct btree *, + const struct bkey_packed *, + const struct bpos *); + +__pure +int bch2_bkey_cmp_packed(const struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); + +__pure +int __bch2_bkey_cmp_left_packed(const struct btree *, + const struct bkey_packed *, + const struct bpos *); + +static inline __pure +int bkey_cmp_left_packed(const struct btree *b, + const struct bkey_packed *l, const struct bpos *r) +{ + return __bch2_bkey_cmp_left_packed(b, l, r); +} + +/* + * The compiler generates better code when we pass bpos by ref, but it's often + * enough terribly convenient to pass it by val... as much as I hate c++, const + * ref would be nice here: + */ +__pure __flatten +static inline int bkey_cmp_left_packed_byval(const struct btree *b, + const struct bkey_packed *l, + struct bpos r) +{ + return bkey_cmp_left_packed(b, l, &r); +} + +static __always_inline bool bpos_eq(struct bpos l, struct bpos r) +{ + return !((l.inode ^ r.inode) | + (l.offset ^ r.offset) | + (l.snapshot ^ r.snapshot)); +} + +static __always_inline bool bpos_lt(struct bpos l, struct bpos r) +{ + return l.inode != r.inode ? l.inode < r.inode : + l.offset != r.offset ? l.offset < r.offset : + l.snapshot != r.snapshot ? l.snapshot < r.snapshot : false; +} + +static __always_inline bool bpos_le(struct bpos l, struct bpos r) +{ + return l.inode != r.inode ? l.inode < r.inode : + l.offset != r.offset ? l.offset < r.offset : + l.snapshot != r.snapshot ? l.snapshot < r.snapshot : true; +} + +static __always_inline bool bpos_gt(struct bpos l, struct bpos r) +{ + return bpos_lt(r, l); +} + +static __always_inline bool bpos_ge(struct bpos l, struct bpos r) +{ + return bpos_le(r, l); +} + +static __always_inline int bpos_cmp(struct bpos l, struct bpos r) +{ + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset) ?: + cmp_int(l.snapshot, r.snapshot); +} + +static inline struct bpos bpos_min(struct bpos l, struct bpos r) +{ + return bpos_lt(l, r) ? l : r; +} + +static inline struct bpos bpos_max(struct bpos l, struct bpos r) +{ + return bpos_gt(l, r) ? l : r; +} + +static __always_inline bool bkey_eq(struct bpos l, struct bpos r) +{ + return !((l.inode ^ r.inode) | + (l.offset ^ r.offset)); +} + +static __always_inline bool bkey_lt(struct bpos l, struct bpos r) +{ + return l.inode != r.inode + ? l.inode < r.inode + : l.offset < r.offset; +} + +static __always_inline bool bkey_le(struct bpos l, struct bpos r) +{ + return l.inode != r.inode + ? l.inode < r.inode + : l.offset <= r.offset; +} + +static __always_inline bool bkey_gt(struct bpos l, struct bpos r) +{ + return bkey_lt(r, l); +} + +static __always_inline bool bkey_ge(struct bpos l, struct bpos r) +{ + return bkey_le(r, l); +} + +static __always_inline int bkey_cmp(struct bpos l, struct bpos r) +{ + return cmp_int(l.inode, r.inode) ?: + cmp_int(l.offset, r.offset); +} + +static inline struct bpos bkey_min(struct bpos l, struct bpos r) +{ + return bkey_lt(l, r) ? l : r; +} + +static inline struct bpos bkey_max(struct bpos l, struct bpos r) +{ + return bkey_gt(l, r) ? 
l : r; +} + +void bch2_bpos_swab(struct bpos *); +void bch2_bkey_swab_key(const struct bkey_format *, struct bkey_packed *); + +static __always_inline int bversion_cmp(struct bversion l, struct bversion r) +{ + return cmp_int(l.hi, r.hi) ?: + cmp_int(l.lo, r.lo); +} + +#define ZERO_VERSION ((struct bversion) { .hi = 0, .lo = 0 }) +#define MAX_VERSION ((struct bversion) { .hi = ~0, .lo = ~0ULL }) + +static __always_inline int bversion_zero(struct bversion v) +{ + return !bversion_cmp(v, ZERO_VERSION); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +/* statement expressions confusing unlikely()? */ +#define bkey_packed(_k) \ + ({ EBUG_ON((_k)->format > KEY_FORMAT_CURRENT); \ + (_k)->format != KEY_FORMAT_CURRENT; }) +#else +#define bkey_packed(_k) ((_k)->format != KEY_FORMAT_CURRENT) +#endif + +/* + * It's safe to treat an unpacked bkey as a packed one, but not the reverse + */ +static inline struct bkey_packed *bkey_to_packed(struct bkey_i *k) +{ + return (struct bkey_packed *) k; +} + +static inline const struct bkey_packed *bkey_to_packed_c(const struct bkey_i *k) +{ + return (const struct bkey_packed *) k; +} + +static inline struct bkey_i *packed_to_bkey(struct bkey_packed *k) +{ + return bkey_packed(k) ? NULL : (struct bkey_i *) k; +} + +static inline const struct bkey *packed_to_bkey_c(const struct bkey_packed *k) +{ + return bkey_packed(k) ? NULL : (const struct bkey *) k; +} + +static inline unsigned bkey_format_key_bits(const struct bkey_format *format) +{ + return format->bits_per_field[BKEY_FIELD_INODE] + + format->bits_per_field[BKEY_FIELD_OFFSET] + + format->bits_per_field[BKEY_FIELD_SNAPSHOT]; +} + +static inline struct bpos bpos_successor(struct bpos p) +{ + if (!++p.snapshot && + !++p.offset && + !++p.inode) + BUG(); + + return p; +} + +static inline struct bpos bpos_predecessor(struct bpos p) +{ + if (!p.snapshot-- && + !p.offset-- && + !p.inode--) + BUG(); + + return p; +} + +static inline struct bpos bpos_nosnap_successor(struct bpos p) +{ + p.snapshot = 0; + + if (!++p.offset && + !++p.inode) + BUG(); + + return p; +} + +static inline struct bpos bpos_nosnap_predecessor(struct bpos p) +{ + p.snapshot = 0; + + if (!p.offset-- && + !p.inode--) + BUG(); + + return p; +} + +static inline u64 bkey_start_offset(const struct bkey *k) +{ + return k->p.offset - k->size; +} + +static inline struct bpos bkey_start_pos(const struct bkey *k) +{ + return (struct bpos) { + .inode = k->p.inode, + .offset = bkey_start_offset(k), + .snapshot = k->p.snapshot, + }; +} + +/* Packed helpers */ + +static inline unsigned bkeyp_key_u64s(const struct bkey_format *format, + const struct bkey_packed *k) +{ + unsigned ret = bkey_packed(k) ? 
format->key_u64s : BKEY_U64s; + + EBUG_ON(k->u64s < ret); + return ret; +} + +static inline unsigned bkeyp_key_bytes(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return bkeyp_key_u64s(format, k) * sizeof(u64); +} + +static inline unsigned bkeyp_val_u64s(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return k->u64s - bkeyp_key_u64s(format, k); +} + +static inline size_t bkeyp_val_bytes(const struct bkey_format *format, + const struct bkey_packed *k) +{ + return bkeyp_val_u64s(format, k) * sizeof(u64); +} + +static inline void set_bkeyp_val_u64s(const struct bkey_format *format, + struct bkey_packed *k, unsigned val_u64s) +{ + k->u64s = bkeyp_key_u64s(format, k) + val_u64s; +} + +#define bkeyp_val(_format, _k) \ + ((struct bch_val *) ((_k)->_data + bkeyp_key_u64s(_format, _k))) + +extern const struct bkey_format bch2_bkey_format_current; + +bool bch2_bkey_transform(const struct bkey_format *, + struct bkey_packed *, + const struct bkey_format *, + const struct bkey_packed *); + +struct bkey __bch2_bkey_unpack_key(const struct bkey_format *, + const struct bkey_packed *); + +#ifndef HAVE_BCACHEFS_COMPILED_UNPACK +struct bpos __bkey_unpack_pos(const struct bkey_format *, + const struct bkey_packed *); +#endif + +bool bch2_bkey_pack_key(struct bkey_packed *, const struct bkey *, + const struct bkey_format *); + +enum bkey_pack_pos_ret { + BKEY_PACK_POS_EXACT, + BKEY_PACK_POS_SMALLER, + BKEY_PACK_POS_FAIL, +}; + +enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *, struct bpos, + const struct btree *); + +static inline bool bkey_pack_pos(struct bkey_packed *out, struct bpos in, + const struct btree *b) +{ + return bch2_bkey_pack_pos_lossy(out, in, b) == BKEY_PACK_POS_EXACT; +} + +void bch2_bkey_unpack(const struct btree *, struct bkey_i *, + const struct bkey_packed *); +bool bch2_bkey_pack(struct bkey_packed *, const struct bkey_i *, + const struct bkey_format *); + +typedef void (*compiled_unpack_fn)(struct bkey *, const struct bkey_packed *); + +static inline void +__bkey_unpack_key_format_checked(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (IS_ENABLED(HAVE_BCACHEFS_COMPILED_UNPACK)) { + compiled_unpack_fn unpack_fn = b->aux_data; + unpack_fn(dst, src); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + bch2_expensive_debug_checks) { + struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); + + BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); + } + } else { + *dst = __bch2_bkey_unpack_key(&b->format, src); + } +} + +static inline struct bkey +bkey_unpack_key_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ + struct bkey dst; + + __bkey_unpack_key_format_checked(b, &dst, src); + return dst; +} + +static inline void __bkey_unpack_key(const struct btree *b, + struct bkey *dst, + const struct bkey_packed *src) +{ + if (likely(bkey_packed(src))) + __bkey_unpack_key_format_checked(b, dst, src); + else + *dst = *packed_to_bkey_c(src); +} + +/** + * bkey_unpack_key -- unpack just the key, not the value + */ +static inline struct bkey bkey_unpack_key(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? 
bkey_unpack_key_format_checked(b, src) + : *packed_to_bkey_c(src); +} + +static inline struct bpos +bkey_unpack_pos_format_checked(const struct btree *b, + const struct bkey_packed *src) +{ +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + return bkey_unpack_key_format_checked(b, src).p; +#else + return __bkey_unpack_pos(&b->format, src); +#endif +} + +static inline struct bpos bkey_unpack_pos(const struct btree *b, + const struct bkey_packed *src) +{ + return likely(bkey_packed(src)) + ? bkey_unpack_pos_format_checked(b, src) + : packed_to_bkey_c(src)->p; +} + +/* Disassembled bkeys */ + +static inline struct bkey_s_c bkey_disassemble(const struct btree *b, + const struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s_c) { u, bkeyp_val(&b->format, k), }; +} + +/* non const version: */ +static inline struct bkey_s __bkey_disassemble(const struct btree *b, + struct bkey_packed *k, + struct bkey *u) +{ + __bkey_unpack_key(b, u, k); + + return (struct bkey_s) { .k = u, .v = bkeyp_val(&b->format, k), }; +} + +static inline u64 bkey_field_max(const struct bkey_format *f, + enum bch_bkey_fields nr) +{ + return f->bits_per_field[nr] < 64 + ? (le64_to_cpu(f->field_offset[nr]) + + ~(~0ULL << f->bits_per_field[nr])) + : U64_MAX; +} + +#ifdef HAVE_BCACHEFS_COMPILED_UNPACK + +int bch2_compile_bkey_format(const struct bkey_format *, void *); + +#else + +static inline int bch2_compile_bkey_format(const struct bkey_format *format, + void *out) { return 0; } + +#endif + +static inline void bkey_reassemble(struct bkey_i *dst, + struct bkey_s_c src) +{ + dst->k = *src.k; + memcpy_u64s_small(&dst->v, src.v, bkey_val_u64s(src.k)); +} + +#define bkey_s_null ((struct bkey_s) { .k = NULL }) +#define bkey_s_c_null ((struct bkey_s_c) { .k = NULL }) + +#define bkey_s_err(err) ((struct bkey_s) { .k = ERR_PTR(err) }) +#define bkey_s_c_err(err) ((struct bkey_s_c) { .k = ERR_PTR(err) }) + +static inline struct bkey_s bkey_to_s(struct bkey *k) +{ + return (struct bkey_s) { .k = k, .v = NULL }; +} + +static inline struct bkey_s_c bkey_to_s_c(const struct bkey *k) +{ + return (struct bkey_s_c) { .k = k, .v = NULL }; +} + +static inline struct bkey_s bkey_i_to_s(struct bkey_i *k) +{ + return (struct bkey_s) { .k = &k->k, .v = &k->v }; +} + +static inline struct bkey_s_c bkey_i_to_s_c(const struct bkey_i *k) +{ + return (struct bkey_s_c) { .k = &k->k, .v = &k->v }; +} + +/* + * For a given type of value (e.g. struct bch_extent), generates the types for + * bkey + bch_extent - inline, split, split const - and also all the conversion + * functions, which also check that the value is of the correct type. + * + * We use anonymous unions for upcasting - e.g. converting from e.g. a + * bkey_i_extent to a bkey_i - since that's always safe, instead of conversion + * functions. + */ +#define x(name, ...) 
\ +struct bkey_i_##name { \ + union { \ + struct bkey k; \ + struct bkey_i k_i; \ + }; \ + struct bch_##name v; \ +}; \ + \ +struct bkey_s_c_##name { \ + union { \ + struct { \ + const struct bkey *k; \ + const struct bch_##name *v; \ + }; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +struct bkey_s_##name { \ + union { \ + struct { \ + struct bkey *k; \ + struct bch_##name *v; \ + }; \ + struct bkey_s_c_##name c; \ + struct bkey_s s; \ + struct bkey_s_c s_c; \ + }; \ +}; \ + \ +static inline struct bkey_i_##name *bkey_i_to_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline const struct bkey_i_##name * \ +bkey_i_to_##name##_c(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return container_of(&k->k, struct bkey_i_##name, k); \ +} \ + \ +static inline struct bkey_s_##name bkey_s_to_##name(struct bkey_s k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name bkey_s_c_to_##name(struct bkey_s_c k)\ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k.k) && k.k->type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = k.k, \ + .v = container_of(k.v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_##name name##_i_to_s(struct bkey_i_##name *k)\ +{ \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +name##_i_to_s_c(const struct bkey_i_##name *k) \ +{ \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = &k->v, \ + }; \ +} \ + \ +static inline struct bkey_s_##name bkey_i_to_s_##name(struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_s_c_##name \ +bkey_i_to_s_c_##name(const struct bkey_i *k) \ +{ \ + EBUG_ON(!IS_ERR_OR_NULL(k) && k->k.type != KEY_TYPE_##name); \ + return (struct bkey_s_c_##name) { \ + .k = &k->k, \ + .v = container_of(&k->v, struct bch_##name, v), \ + }; \ +} \ + \ +static inline struct bkey_i_##name *bkey_##name##_init(struct bkey_i *_k)\ +{ \ + struct bkey_i_##name *k = \ + container_of(&_k->k, struct bkey_i_##name, k); \ + \ + bkey_init(&k->k); \ + memset(&k->v, 0, sizeof(k->v)); \ + k->k.type = KEY_TYPE_##name; \ + set_bkey_val_bytes(&k->k, sizeof(k->v)); \ + \ + return k; \ +} + +BCH_BKEY_TYPES(); +#undef x + +/* byte order helpers */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + +static inline unsigned high_word_offset(const struct bkey_format *f) +{ + return f->key_u64s - 1; +} + +#define high_bit_offset 0 +#define nth_word(p, n) ((p) - (n)) + +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + +static inline unsigned high_word_offset(const struct bkey_format *f) +{ + return 0; +} + +#define high_bit_offset KEY_PACKED_BITS_START +#define nth_word(p, n) ((p) + (n)) + +#else +#error edit for your odd byteorder. 
+#endif + +#define high_word(f, k) ((k)->_data + high_word_offset(f)) +#define next_word(p) nth_word(p, 1) +#define prev_word(p) nth_word(p, -1) + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_bkey_pack_test(void); +#else +static inline void bch2_bkey_pack_test(void) {} +#endif + +#define bkey_fields() \ + x(BKEY_FIELD_INODE, p.inode) \ + x(BKEY_FIELD_OFFSET, p.offset) \ + x(BKEY_FIELD_SNAPSHOT, p.snapshot) \ + x(BKEY_FIELD_SIZE, size) \ + x(BKEY_FIELD_VERSION_HI, version.hi) \ + x(BKEY_FIELD_VERSION_LO, version.lo) + +struct bkey_format_state { + u64 field_min[BKEY_NR_FIELDS]; + u64 field_max[BKEY_NR_FIELDS]; +}; + +void bch2_bkey_format_init(struct bkey_format_state *); + +static inline void __bkey_format_add(struct bkey_format_state *s, unsigned field, u64 v) +{ + s->field_min[field] = min(s->field_min[field], v); + s->field_max[field] = max(s->field_max[field], v); +} + +/* + * Changes @format so that @k can be successfully packed with @format + */ +static inline void bch2_bkey_format_add_key(struct bkey_format_state *s, const struct bkey *k) +{ +#define x(id, field) __bkey_format_add(s, id, k->field); + bkey_fields() +#undef x +} + +void bch2_bkey_format_add_pos(struct bkey_format_state *, struct bpos); +struct bkey_format bch2_bkey_format_done(struct bkey_format_state *); +const char *bch2_bkey_format_validate(struct bkey_format *); + +#endif /* _BCACHEFS_BKEY_H */ diff --git a/fs/bcachefs/bkey_buf.h b/fs/bcachefs/bkey_buf.h new file mode 100644 index 000000000..a30c4ae8e --- /dev/null +++ b/fs/bcachefs/bkey_buf.h @@ -0,0 +1,61 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_BUF_H +#define _BCACHEFS_BKEY_BUF_H + +#include "bcachefs.h" +#include "bkey.h" + +struct bkey_buf { + struct bkey_i *k; + u64 onstack[12]; +}; + +static inline void bch2_bkey_buf_realloc(struct bkey_buf *s, + struct bch_fs *c, unsigned u64s) +{ + if (s->k == (void *) s->onstack && + u64s > ARRAY_SIZE(s->onstack)) { + s->k = mempool_alloc(&c->large_bkey_pool, GFP_NOFS); + memcpy(s->k, s->onstack, sizeof(s->onstack)); + } +} + +static inline void bch2_bkey_buf_reassemble(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_buf_realloc(s, c, k.k->u64s); + bkey_reassemble(s->k, k); +} + +static inline void bch2_bkey_buf_copy(struct bkey_buf *s, + struct bch_fs *c, + struct bkey_i *src) +{ + bch2_bkey_buf_realloc(s, c, src->k.u64s); + bkey_copy(s->k, src); +} + +static inline void bch2_bkey_buf_unpack(struct bkey_buf *s, + struct bch_fs *c, + struct btree *b, + struct bkey_packed *src) +{ + bch2_bkey_buf_realloc(s, c, BKEY_U64s + + bkeyp_val_u64s(&b->format, src)); + bch2_bkey_unpack(b, s->k, src); +} + +static inline void bch2_bkey_buf_init(struct bkey_buf *s) +{ + s->k = (void *) s->onstack; +} + +static inline void bch2_bkey_buf_exit(struct bkey_buf *s, struct bch_fs *c) +{ + if (s->k != (void *) s->onstack) + mempool_free(s->k, &c->large_bkey_pool); + s->k = NULL; +} + +#endif /* _BCACHEFS_BKEY_BUF_H */ diff --git a/fs/bcachefs/bkey_cmp.h b/fs/bcachefs/bkey_cmp.h new file mode 100644 index 000000000..5f42a6e69 --- /dev/null +++ b/fs/bcachefs/bkey_cmp.h @@ -0,0 +1,129 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_CMP_H +#define _BCACHEFS_BKEY_CMP_H + +#include "bkey.h" + +#ifdef CONFIG_X86_64 +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + long d0, d1, d2, d3; + int cmp; + + /* we shouldn't need asm for this, but gcc is being retarded: */ + + asm(".intel_syntax noprefix;" + "xor eax, eax;" + "xor edx, edx;" + 
"1:;" + "mov r8, [rdi];" + "mov r9, [rsi];" + "sub ecx, 64;" + "jl 2f;" + + "cmp r8, r9;" + "jnz 3f;" + + "lea rdi, [rdi - 8];" + "lea rsi, [rsi - 8];" + "jmp 1b;" + + "2:;" + "not ecx;" + "shr r8, 1;" + "shr r9, 1;" + "shr r8, cl;" + "shr r9, cl;" + "cmp r8, r9;" + + "3:\n" + "seta al;" + "setb dl;" + "sub eax, edx;" + ".att_syntax prefix;" + : "=&D" (d0), "=&S" (d1), "=&d" (d2), "=&c" (d3), "=&a" (cmp) + : "0" (l), "1" (r), "3" (nr_key_bits) + : "r8", "r9", "cc", "memory"); + + return cmp; +} +#else +static inline int __bkey_cmp_bits(const u64 *l, const u64 *r, + unsigned nr_key_bits) +{ + u64 l_v, r_v; + + if (!nr_key_bits) + return 0; + + /* for big endian, skip past header */ + nr_key_bits += high_bit_offset; + l_v = *l & (~0ULL >> high_bit_offset); + r_v = *r & (~0ULL >> high_bit_offset); + + while (1) { + if (nr_key_bits < 64) { + l_v >>= 64 - nr_key_bits; + r_v >>= 64 - nr_key_bits; + nr_key_bits = 0; + } else { + nr_key_bits -= 64; + } + + if (!nr_key_bits || l_v != r_v) + break; + + l = next_word(l); + r = next_word(r); + + l_v = *l; + r_v = *r; + } + + return cmp_int(l_v, r_v); +} +#endif + +static inline __pure __flatten +int __bch2_bkey_cmp_packed_format_checked_inlined(const struct bkey_packed *l, + const struct bkey_packed *r, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + int ret; + + EBUG_ON(!bkey_packed(l) || !bkey_packed(r)); + EBUG_ON(b->nr_key_bits != bkey_format_key_bits(f)); + + ret = __bkey_cmp_bits(high_word(f, l), + high_word(f, r), + b->nr_key_bits); + + EBUG_ON(ret != bpos_cmp(bkey_unpack_pos(b, l), + bkey_unpack_pos(b, r))); + return ret; +} + +static inline __pure __flatten +int bch2_bkey_cmp_packed_inlined(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + struct bkey unpacked; + + if (likely(bkey_packed(l) && bkey_packed(r))) + return __bch2_bkey_cmp_packed_format_checked_inlined(l, r, b); + + if (bkey_packed(l)) { + __bkey_unpack_key_format_checked(b, &unpacked, l); + l = (void *) &unpacked; + } else if (bkey_packed(r)) { + __bkey_unpack_key_format_checked(b, &unpacked, r); + r = (void *) &unpacked; + } + + return bpos_cmp(((struct bkey *) l)->p, ((struct bkey *) r)->p); +} + +#endif /* _BCACHEFS_BKEY_CMP_H */ diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c new file mode 100644 index 000000000..1381166bf --- /dev/null +++ b/fs/bcachefs/bkey_methods.c @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "backpointers.h" +#include "bkey_methods.h" +#include "btree_types.h" +#include "alloc_background.h" +#include "dirent.h" +#include "ec.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "lru.h" +#include "quota.h" +#include "reflink.h" +#include "subvolume.h" +#include "xattr.h" + +const char * const bch2_bkey_types[] = { +#define x(name, nr) #name, + BCH_BKEY_TYPES() +#undef x + NULL +}; + +static int deleted_key_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + return 0; +} + +#define bch2_bkey_ops_deleted ((struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +}) + +#define bch2_bkey_ops_whiteout ((struct bkey_ops) { \ + .key_invalid = deleted_key_invalid, \ +}) + +static int empty_val_key_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + if (bkey_val_bytes(k.k)) { + prt_printf(err, "incorrect value size (%zu != 0)", + bkey_val_bytes(k.k)); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +#define 
bch2_bkey_ops_error ((struct bkey_ops) { \ + .key_invalid = empty_val_key_invalid, \ +}) + +static int key_type_cookie_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + return 0; +} + +#define bch2_bkey_ops_cookie ((struct bkey_ops) { \ + .key_invalid = key_type_cookie_invalid, \ + .min_val_size = 8, \ +}) + +#define bch2_bkey_ops_hash_whiteout ((struct bkey_ops) {\ + .key_invalid = empty_val_key_invalid, \ +}) + +static int key_type_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + return 0; +} + +static void key_type_inline_data_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inline_data d = bkey_s_c_to_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + prt_printf(out, "datalen %u: %*phN", + datalen, min(datalen, 32U), d.v->data); +} + +#define bch2_bkey_ops_inline_data ((struct bkey_ops) { \ + .key_invalid = key_type_inline_data_invalid, \ + .val_to_text = key_type_inline_data_to_text, \ +}) + +static int key_type_set_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + if (bkey_val_bytes(k.k)) { + prt_printf(err, "incorrect value size (%zu != %zu)", + bkey_val_bytes(k.k), sizeof(struct bch_cookie)); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +#define bch2_bkey_ops_set ((struct bkey_ops) { \ + .key_invalid = key_type_set_invalid, \ + .key_merge = key_type_set_merge, \ +}) + +const struct bkey_ops bch2_bkey_ops[] = { +#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, + BCH_BKEY_TYPES() +#undef x +}; + +const struct bkey_ops bch2_bkey_null_ops = { + .min_val_size = U8_MAX, +}; + +int bch2_bkey_val_invalid(struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + if (bkey_val_bytes(k.k) < ops->min_val_size) { + prt_printf(err, "bad val size (%zu < %u)", + bkey_val_bytes(k.k), ops->min_val_size); + return -BCH_ERR_invalid_bkey; + } + + if (!ops->key_invalid) + return 0; + + return ops->key_invalid(c, k, flags, err); +} + +static unsigned bch2_key_types_allowed[] = { + [BKEY_TYPE_extents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_error)| + (1U << KEY_TYPE_cookie)| + (1U << KEY_TYPE_extent)| + (1U << KEY_TYPE_reservation)| + (1U << KEY_TYPE_reflink_p)| + (1U << KEY_TYPE_inline_data), + [BKEY_TYPE_inodes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_inode)| + (1U << KEY_TYPE_inode_v2)| + (1U << KEY_TYPE_inode_v3)| + (1U << KEY_TYPE_inode_generation), + [BKEY_TYPE_dirents] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_hash_whiteout)| + (1U << KEY_TYPE_dirent), + [BKEY_TYPE_xattrs] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_whiteout)| + (1U << KEY_TYPE_cookie)| + (1U << KEY_TYPE_hash_whiteout)| + (1U << KEY_TYPE_xattr), + [BKEY_TYPE_alloc] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_alloc)| + (1U << KEY_TYPE_alloc_v2)| + (1U << KEY_TYPE_alloc_v3)| + (1U << KEY_TYPE_alloc_v4), + [BKEY_TYPE_quotas] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_quota), + [BKEY_TYPE_stripes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_stripe), + [BKEY_TYPE_reflink] = + (1U << KEY_TYPE_deleted)| + (1U << 
KEY_TYPE_reflink_v)| + (1U << KEY_TYPE_indirect_inline_data), + [BKEY_TYPE_subvolumes] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_subvolume), + [BKEY_TYPE_snapshots] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot), + [BKEY_TYPE_lru] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), + [BKEY_TYPE_freespace] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), + [BKEY_TYPE_need_discard] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_set), + [BKEY_TYPE_backpointers] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_backpointer), + [BKEY_TYPE_bucket_gens] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_bucket_gens), + [BKEY_TYPE_snapshot_trees] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_snapshot_tree), + [BKEY_TYPE_btree] = + (1U << KEY_TYPE_deleted)| + (1U << KEY_TYPE_btree_ptr)| + (1U << KEY_TYPE_btree_ptr_v2), +}; + +int __bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (k.k->u64s < BKEY_U64s) { + prt_printf(err, "u64s too small (%u < %zu)", k.k->u64s, BKEY_U64s); + return -BCH_ERR_invalid_bkey; + } + + if (flags & BKEY_INVALID_COMMIT && + !(bch2_key_types_allowed[type] & (1U << k.k->type))) { + prt_printf(err, "invalid key type for btree %s (%s)", + bch2_btree_ids[type], bch2_bkey_types[k.k->type]); + return -BCH_ERR_invalid_bkey; + } + + if (btree_node_type_is_extents(type) && !bkey_whiteout(k.k)) { + if (k.k->size == 0) { + prt_printf(err, "size == 0"); + return -BCH_ERR_invalid_bkey; + } + + if (k.k->size > k.k->p.offset) { + prt_printf(err, "size greater than offset (%u > %llu)", + k.k->size, k.k->p.offset); + return -BCH_ERR_invalid_bkey; + } + } else { + if (k.k->size) { + prt_printf(err, "size != 0"); + return -BCH_ERR_invalid_bkey; + } + } + + if (type != BKEY_TYPE_btree) { + if (!btree_type_has_snapshots((enum btree_id) type) && + k.k->p.snapshot) { + prt_printf(err, "nonzero snapshot"); + return -BCH_ERR_invalid_bkey; + } + + if (btree_type_has_snapshots((enum btree_id) type) && + !k.k->p.snapshot) { + prt_printf(err, "snapshot == 0"); + return -BCH_ERR_invalid_bkey; + } + + if (bkey_eq(k.k->p, POS_MAX)) { + prt_printf(err, "key at POS_MAX"); + return -BCH_ERR_invalid_bkey; + } + } + + return 0; +} + +int bch2_bkey_invalid(struct bch_fs *c, struct bkey_s_c k, + enum btree_node_type type, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + return __bch2_bkey_invalid(c, k, type, flags, err) ?: + bch2_bkey_val_invalid(c, k, flags, err); +} + +int bch2_bkey_in_btree_node(struct btree *b, struct bkey_s_c k, + struct printbuf *err) +{ + if (bpos_lt(k.k->p, b->data->min_key)) { + prt_printf(err, "key before start of btree node"); + return -BCH_ERR_invalid_bkey; + } + + if (bpos_gt(k.k->p, b->data->max_key)) { + prt_printf(err, "key past end of btree node"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_bpos_to_text(struct printbuf *out, struct bpos pos) +{ + if (bpos_eq(pos, POS_MIN)) + prt_printf(out, "POS_MIN"); + else if (bpos_eq(pos, POS_MAX)) + prt_printf(out, "POS_MAX"); + else if (bpos_eq(pos, SPOS_MAX)) + prt_printf(out, "SPOS_MAX"); + else { + if (pos.inode == U64_MAX) + prt_printf(out, "U64_MAX"); + else + prt_printf(out, "%llu", pos.inode); + prt_printf(out, ":"); + if (pos.offset == U64_MAX) + prt_printf(out, "U64_MAX"); + else + prt_printf(out, "%llu", pos.offset); + prt_printf(out, ":"); + if (pos.snapshot == U32_MAX) + prt_printf(out, "U32_MAX"); + else + prt_printf(out, "%u", pos.snapshot); + } +} + +void 
bch2_bkey_to_text(struct printbuf *out, const struct bkey *k) +{ + if (k) { + prt_printf(out, "u64s %u type ", k->u64s); + + if (k->type < KEY_TYPE_MAX) + prt_printf(out, "%s ", bch2_bkey_types[k->type]); + else + prt_printf(out, "%u ", k->type); + + bch2_bpos_to_text(out, k->p); + + prt_printf(out, " len %u ver %llu", k->size, k->version.lo); + } else { + prt_printf(out, "(null)"); + } +} + +void bch2_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + if (likely(ops->val_to_text)) + ops->val_to_text(out, c, k); +} + +void bch2_bkey_val_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_to_text(out, k.k); + + if (bkey_val_bytes(k.k)) { + prt_printf(out, ": "); + bch2_val_to_text(out, c, k); + } +} + +void bch2_bkey_swab_val(struct bkey_s k) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + if (ops->swab) + ops->swab(k); +} + +bool bch2_bkey_normalize(struct bch_fs *c, struct bkey_s k) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(k.k->type); + + return ops->key_normalize + ? ops->key_normalize(c, k) + : false; +} + +bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(l.k->type); + + return ops->key_merge && + bch2_bkey_maybe_mergable(l.k, r.k) && + (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && + !bch2_key_merging_disabled && + ops->key_merge(c, l, r); +} + +static const struct old_bkey_type { + u8 btree_node_type; + u8 old; + u8 new; +} bkey_renumber_table[] = { + {BKEY_TYPE_btree, 128, KEY_TYPE_btree_ptr }, + {BKEY_TYPE_extents, 128, KEY_TYPE_extent }, + {BKEY_TYPE_extents, 129, KEY_TYPE_extent }, + {BKEY_TYPE_extents, 130, KEY_TYPE_reservation }, + {BKEY_TYPE_inodes, 128, KEY_TYPE_inode }, + {BKEY_TYPE_inodes, 130, KEY_TYPE_inode_generation }, + {BKEY_TYPE_dirents, 128, KEY_TYPE_dirent }, + {BKEY_TYPE_dirents, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_xattrs, 128, KEY_TYPE_xattr }, + {BKEY_TYPE_xattrs, 129, KEY_TYPE_hash_whiteout }, + {BKEY_TYPE_alloc, 128, KEY_TYPE_alloc }, + {BKEY_TYPE_quotas, 128, KEY_TYPE_quota }, +}; + +void bch2_bkey_renumber(enum btree_node_type btree_node_type, + struct bkey_packed *k, + int write) +{ + const struct old_bkey_type *i; + + for (i = bkey_renumber_table; + i < bkey_renumber_table + ARRAY_SIZE(bkey_renumber_table); + i++) + if (btree_node_type == i->btree_node_type && + k->type == (write ? i->new : i->old)) { + k->type = write ? i->old : i->new; + break; + } +} + +void __bch2_bkey_compat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct bkey_format *f, + struct bkey_packed *k) +{ + const struct bkey_ops *ops; + struct bkey uk; + struct bkey_s u; + unsigned nr_compat = 5; + int i; + + /* + * Do these operations in reverse order in the write path: + */ + + for (i = 0; i < nr_compat; i++) + switch (!write ? 
i : nr_compat - 1 - i) { + case 0: + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_key(f, k); + break; + case 1: + if (version < bcachefs_metadata_version_bkey_renumber) + bch2_bkey_renumber(__btree_node_type(level, btree_id), k, write); + break; + case 2: + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_inodes) { + if (!bkey_packed(k)) { + struct bkey_i *u = packed_to_bkey(k); + + swap(u->k.p.inode, u->k.p.offset); + } else if (f->bits_per_field[BKEY_FIELD_INODE] && + f->bits_per_field[BKEY_FIELD_OFFSET]) { + struct bkey_format tmp = *f, *in = f, *out = &tmp; + + swap(tmp.bits_per_field[BKEY_FIELD_INODE], + tmp.bits_per_field[BKEY_FIELD_OFFSET]); + swap(tmp.field_offset[BKEY_FIELD_INODE], + tmp.field_offset[BKEY_FIELD_OFFSET]); + + if (!write) + swap(in, out); + + uk = __bch2_bkey_unpack_key(in, k); + swap(uk.p.inode, uk.p.offset); + BUG_ON(!bch2_bkey_pack_key(k, &uk, out)); + } + } + break; + case 3: + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + struct bkey_i *u = packed_to_bkey(k); + + if (u) { + u->k.p.snapshot = write + ? 0 : U32_MAX; + } else { + u64 min_packed = le64_to_cpu(f->field_offset[BKEY_FIELD_SNAPSHOT]); + u64 max_packed = min_packed + + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + uk = __bch2_bkey_unpack_key(f, k); + uk.p.snapshot = write + ? min_packed : min_t(u64, U32_MAX, max_packed); + + BUG_ON(!bch2_bkey_pack_key(k, &uk, f)); + } + } + + break; + case 4: + if (!bkey_packed(k)) { + u = bkey_i_to_s(packed_to_bkey(k)); + } else { + uk = __bch2_bkey_unpack_key(f, k); + u.k = &uk; + u.v = bkeyp_val(f, k); + } + + if (big_endian != CPU_BIG_ENDIAN) + bch2_bkey_swab_val(u); + + ops = bch2_bkey_type_ops(k->type); + + if (ops->compat) + ops->compat(btree_id, version, big_endian, write, u); + break; + default: + BUG(); + } +} diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h new file mode 100644 index 000000000..0f3dc156a --- /dev/null +++ b/fs/bcachefs/bkey_methods.h @@ -0,0 +1,191 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_METHODS_H +#define _BCACHEFS_BKEY_METHODS_H + +#include "bkey.h" + +struct bch_fs; +struct btree; +struct btree_trans; +struct bkey; +enum btree_node_type; + +extern const char * const bch2_bkey_types[]; +extern const struct bkey_ops bch2_bkey_null_ops; + +enum bkey_invalid_flags { + BKEY_INVALID_WRITE = (1U << 0), + BKEY_INVALID_COMMIT = (1U << 1), + BKEY_INVALID_JOURNAL = (1U << 2), +}; + +/* + * key_invalid: checks validity of @k, returns 0 if good or -EINVAL if bad. If + * invalid, entire key will be deleted. + * + * When invalid, error string is returned via @err. @rw indicates whether key is + * being read or written; more aggressive checks can be enabled when rw == WRITE. 
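+ *
+ * Note: in this version the read vs write distinction is carried by @flags
+ * (enum bkey_invalid_flags) rather than a separate @rw argument. The common
+ * size check also happens before ->key_invalid() is called:
+ * bch2_bkey_val_invalid() first rejects values smaller than ->min_val_size,
+ * and unknown key types dispatch to bch2_bkey_null_ops, whose min_val_size
+ * of U8_MAX makes that check fail for any ordinary value.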
+ */ +struct bkey_ops { + int (*key_invalid)(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, struct printbuf *err); + void (*val_to_text)(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + void (*swab)(struct bkey_s); + bool (*key_normalize)(struct bch_fs *, struct bkey_s); + bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c); + int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); + int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + void (*compat)(enum btree_id id, unsigned version, + unsigned big_endian, int write, + struct bkey_s); + + /* Size of value type when first created: */ + unsigned min_val_size; +}; + +extern const struct bkey_ops bch2_bkey_ops[]; + +static inline const struct bkey_ops *bch2_bkey_type_ops(enum bch_bkey_type type) +{ + return likely(type < KEY_TYPE_MAX) + ? &bch2_bkey_ops[type] + : &bch2_bkey_null_ops; +} + +int bch2_bkey_val_invalid(struct bch_fs *, struct bkey_s_c, unsigned, struct printbuf *); +int __bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type, unsigned, struct printbuf *); +int bch2_bkey_invalid(struct bch_fs *, struct bkey_s_c, + enum btree_node_type, unsigned, struct printbuf *); +int bch2_bkey_in_btree_node(struct btree *, struct bkey_s_c, struct printbuf *); + +void bch2_bpos_to_text(struct printbuf *, struct bpos); +void bch2_bkey_to_text(struct printbuf *, const struct bkey *); +void bch2_val_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +void bch2_bkey_val_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +void bch2_bkey_swab_val(struct bkey_s); + +bool bch2_bkey_normalize(struct bch_fs *, struct bkey_s); + +static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct bkey *r) +{ + return l->type == r->type && + !bversion_cmp(l->version, r->version) && + bpos_eq(l->p, bkey_start_pos(r)); +} + +bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +static inline int bch2_mark_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type); + + return ops->atomic_trigger + ? 
ops->atomic_trigger(trans, btree, level, old, new, flags) + : 0; +} + +enum btree_update_flags { + __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END, + __BTREE_UPDATE_NOJOURNAL, + __BTREE_UPDATE_KEY_CACHE_RECLAIM, + + __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */ + + __BTREE_TRIGGER_INSERT, + __BTREE_TRIGGER_OVERWRITE, + + __BTREE_TRIGGER_GC, + __BTREE_TRIGGER_BUCKET_INVALIDATE, + __BTREE_TRIGGER_NOATOMIC, +}; + +#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) +#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL) +#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM) + +#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN) + +#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT) +#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE) + +#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC) +#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE) +#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC) + +#define BTREE_TRIGGER_WANTS_OLD_AND_NEW \ + ((1U << KEY_TYPE_alloc)| \ + (1U << KEY_TYPE_alloc_v2)| \ + (1U << KEY_TYPE_alloc_v3)| \ + (1U << KEY_TYPE_alloc_v4)| \ + (1U << KEY_TYPE_stripe)| \ + (1U << KEY_TYPE_inode)| \ + (1U << KEY_TYPE_inode_v2)| \ + (1U << KEY_TYPE_snapshot)) + +static inline int bch2_trans_mark_key(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type); + + return ops->trans_trigger + ? ops->trans_trigger(trans, btree_id, level, old, new, flags) + : 0; +} + +static inline int bch2_trans_mark_old(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = old.k->p; + + return bch2_trans_mark_key(trans, btree_id, level, old, &deleted, + BTREE_TRIGGER_OVERWRITE|flags); +} + +static inline int bch2_trans_mark_new(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_i *new, unsigned flags) +{ + struct bkey_i deleted; + + bkey_init(&deleted.k); + deleted.k.p = new->k.p; + + return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new, + BTREE_TRIGGER_INSERT|flags); +} + +void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int); + +void __bch2_bkey_compat(unsigned, enum btree_id, unsigned, unsigned, + int, struct bkey_format *, struct bkey_packed *); + +static inline void bch2_bkey_compat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct bkey_format *f, + struct bkey_packed *k) +{ + if (version < bcachefs_metadata_version_current || + big_endian != CPU_BIG_ENDIAN) + __bch2_bkey_compat(level, btree_id, version, + big_endian, write, f, k); + +} + +#endif /* _BCACHEFS_BKEY_METHODS_H */ diff --git a/fs/bcachefs/bkey_sort.c b/fs/bcachefs/bkey_sort.c new file mode 100644 index 000000000..b9aa027c8 --- /dev/null +++ b/fs/bcachefs/bkey_sort.c @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" +#include "bkey_cmp.h" +#include "bkey_sort.h" +#include "bset.h" +#include "extents.h" + +typedef int (*sort_cmp_fn)(struct btree *, + struct bkey_packed *, + struct bkey_packed *); + +static inline bool sort_iter_end(struct sort_iter *iter) +{ + return !iter->used; +} + +static inline void 
sort_iter_sift(struct sort_iter *iter, unsigned from, + sort_cmp_fn cmp) +{ + unsigned i; + + for (i = from; + i + 1 < iter->used && + cmp(iter->b, iter->data[i].k, iter->data[i + 1].k) > 0; + i++) + swap(iter->data[i], iter->data[i + 1]); +} + +static inline void sort_iter_sort(struct sort_iter *iter, sort_cmp_fn cmp) +{ + unsigned i = iter->used; + + while (i--) + sort_iter_sift(iter, i, cmp); +} + +static inline struct bkey_packed *sort_iter_peek(struct sort_iter *iter) +{ + return !sort_iter_end(iter) ? iter->data->k : NULL; +} + +static inline void sort_iter_advance(struct sort_iter *iter, sort_cmp_fn cmp) +{ + struct sort_iter_set *i = iter->data; + + BUG_ON(!iter->used); + + i->k = bkey_p_next(i->k); + + BUG_ON(i->k > i->end); + + if (i->k == i->end) + array_remove_item(iter->data, iter->used, 0); + else + sort_iter_sift(iter, 0, cmp); +} + +static inline struct bkey_packed *sort_iter_next(struct sort_iter *iter, + sort_cmp_fn cmp) +{ + struct bkey_packed *ret = sort_iter_peek(iter); + + if (ret) + sort_iter_advance(iter, cmp); + + return ret; +} + +/* + * If keys compare equal, compare by pointer order: + */ +static inline int key_sort_fix_overlapping_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed(b, l, r) ?: + cmp_int((unsigned long) l, (unsigned long) r); +} + +static inline bool should_drop_next_key(struct sort_iter *iter) +{ + /* + * key_sort_cmp() ensures that when keys compare equal the older key + * comes first; so if l->k compares equal to r->k then l->k is older + * and should be dropped. + */ + return iter->used >= 2 && + !bch2_bkey_cmp_packed(iter->b, + iter->data[0].k, + iter->data[1].k); +} + +struct btree_nr_keys +bch2_key_sort_fix_overlapping(struct bch_fs *c, struct bset *dst, + struct sort_iter *iter) +{ + struct bkey_packed *out = dst->start; + struct bkey_packed *k; + struct btree_nr_keys nr; + + memset(&nr, 0, sizeof(nr)); + + sort_iter_sort(iter, key_sort_fix_overlapping_cmp); + + while ((k = sort_iter_peek(iter))) { + if (!bkey_deleted(k) && + !should_drop_next_key(iter)) { + bkey_copy(out, k); + btree_keys_account_key_add(&nr, 0, out); + out = bkey_p_next(out); + } + + sort_iter_advance(iter, key_sort_fix_overlapping_cmp); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +/* Sort + repack in a new format: */ +struct btree_nr_keys +bch2_sort_repack(struct bset *dst, struct btree *src, + struct btree_node_iter *src_iter, + struct bkey_format *out_f, + bool filter_whiteouts) +{ + struct bkey_format *in_f = &src->format; + struct bkey_packed *in, *out = vstruct_last(dst); + struct btree_nr_keys nr; + bool transform = memcmp(out_f, &src->format, sizeof(*out_f)); + + memset(&nr, 0, sizeof(nr)); + + while ((in = bch2_btree_node_iter_next_all(src_iter, src))) { + if (filter_whiteouts && bkey_deleted(in)) + continue; + + if (!transform) + bkey_copy(out, in); + else if (bch2_bkey_transform(out_f, out, bkey_packed(in) + ? 
in_f : &bch2_bkey_format_current, in)) + out->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(src, (void *) out, in); + + out->needs_whiteout = false; + + btree_keys_account_key_add(&nr, 0, out); + out = bkey_p_next(out); + } + + dst->u64s = cpu_to_le16((u64 *) out - dst->_data); + return nr; +} + +static inline int sort_keys_cmp(struct btree *b, + struct bkey_packed *l, + struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed_inlined(b, l, r) ?: + (int) bkey_deleted(r) - (int) bkey_deleted(l) ?: + (int) l->needs_whiteout - (int) r->needs_whiteout; +} + +unsigned bch2_sort_keys(struct bkey_packed *dst, + struct sort_iter *iter, + bool filter_whiteouts) +{ + const struct bkey_format *f = &iter->b->format; + struct bkey_packed *in, *next, *out = dst; + + sort_iter_sort(iter, sort_keys_cmp); + + while ((in = sort_iter_next(iter, sort_keys_cmp))) { + bool needs_whiteout = false; + + if (bkey_deleted(in) && + (filter_whiteouts || !in->needs_whiteout)) + continue; + + while ((next = sort_iter_peek(iter)) && + !bch2_bkey_cmp_packed_inlined(iter->b, in, next)) { + BUG_ON(in->needs_whiteout && + next->needs_whiteout); + needs_whiteout |= in->needs_whiteout; + in = sort_iter_next(iter, sort_keys_cmp); + } + + if (bkey_deleted(in)) { + memcpy_u64s_small(out, in, bkeyp_key_u64s(f, in)); + set_bkeyp_val_u64s(f, out, 0); + } else { + bkey_copy(out, in); + } + out->needs_whiteout |= needs_whiteout; + out = bkey_p_next(out); + } + + return (u64 *) out - (u64 *) dst; +} diff --git a/fs/bcachefs/bkey_sort.h b/fs/bcachefs/bkey_sort.h new file mode 100644 index 000000000..79cf11d1b --- /dev/null +++ b/fs/bcachefs/bkey_sort.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BKEY_SORT_H +#define _BCACHEFS_BKEY_SORT_H + +struct sort_iter { + struct btree *b; + unsigned used; + unsigned size; + + struct sort_iter_set { + struct bkey_packed *k, *end; + } data[MAX_BSETS + 1]; +}; + +static inline void sort_iter_init(struct sort_iter *iter, struct btree *b) +{ + iter->b = b; + iter->used = 0; + iter->size = ARRAY_SIZE(iter->data); +} + +static inline void sort_iter_add(struct sort_iter *iter, + struct bkey_packed *k, + struct bkey_packed *end) +{ + BUG_ON(iter->used >= iter->size); + + if (k != end) + iter->data[iter->used++] = (struct sort_iter_set) { k, end }; +} + +struct btree_nr_keys +bch2_key_sort_fix_overlapping(struct bch_fs *, struct bset *, + struct sort_iter *); + +struct btree_nr_keys +bch2_sort_repack(struct bset *, struct btree *, + struct btree_node_iter *, + struct bkey_format *, bool); + +unsigned bch2_sort_keys(struct bkey_packed *, + struct sort_iter *, bool); + +#endif /* _BCACHEFS_BKEY_SORT_H */ diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c new file mode 100644 index 000000000..bcdf28f39 --- /dev/null +++ b/fs/bcachefs/bset.c @@ -0,0 +1,1587 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for working with individual keys, and sorted sets of keys with in a + * btree node + * + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcachefs.h" +#include "btree_cache.h" +#include "bset.h" +#include "eytzinger.h" +#include "trace.h" +#include "util.h" + +#include +#include +#include +#include + +static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *, + struct btree *); + +static inline unsigned __btree_node_iter_used(struct btree_node_iter *iter) +{ + unsigned n = ARRAY_SIZE(iter->data); + + while (n && __btree_node_iter_set_end(iter, n - 1)) + --n; + + return n; +} + +struct bset_tree *bch2_bkey_to_bset(struct btree *b, struct bkey_packed *k) +{ + return bch2_bkey_to_bset_inlined(b, k); +} + +/* + * There are never duplicate live keys in the btree - but including keys that + * have been flagged as deleted (and will be cleaned up later) we _will_ see + * duplicates. + * + * Thus the sort order is: usual key comparison first, but for keys that compare + * equal the deleted key(s) come first, and the (at most one) live version comes + * last. + * + * The main reason for this is insertion: to handle overwrites, we first iterate + * over keys that compare equal to our insert key, and then insert immediately + * prior to the first key greater than the key we're inserting - our insert + * position will be after all keys that compare equal to our insert key, which + * by the time we actually do the insert will all be deleted. + */ + +void bch2_dump_bset(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned set) +{ + struct bkey_packed *_k, *_n; + struct bkey uk, n; + struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + + if (!i->u64s) + return; + + for (_k = i->start; + _k < vstruct_last(i); + _k = _n) { + _n = bkey_p_next(_k); + + k = bkey_disassemble(b, _k, &uk); + + printbuf_reset(&buf); + if (c) + bch2_bkey_val_to_text(&buf, c, k); + else + bch2_bkey_to_text(&buf, k.k); + printk(KERN_ERR "block %u key %5zu: %s\n", set, + _k->_data - i->_data, buf.buf); + + if (_n == vstruct_last(i)) + continue; + + n = bkey_unpack_key(b, _n); + + if (bpos_lt(n.p, k.k->p)) { + printk(KERN_ERR "Key skipped backwards\n"); + continue; + } + + if (!bkey_deleted(k.k) && bpos_eq(n.p, k.k->p)) + printk(KERN_ERR "Duplicate keys\n"); + } + + printbuf_exit(&buf); +} + +void bch2_dump_btree_node(struct bch_fs *c, struct btree *b) +{ + struct bset_tree *t; + + console_lock(); + for_each_bset(b, t) + bch2_dump_bset(c, b, bset(b, t), t - b->set); + console_unlock(); +} + +void bch2_dump_btree_node_iter(struct btree *b, + struct btree_node_iter *iter) +{ + struct btree_node_iter_set *set; + struct printbuf buf = PRINTBUF; + + printk(KERN_ERR "btree node iter with %u/%u sets:\n", + __btree_node_iter_used(iter), b->nsets); + + btree_node_iter_for_each(iter, set) { + struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey uk = bkey_unpack_key(b, k); + + printbuf_reset(&buf); + bch2_bkey_to_text(&buf, &uk); + printk(KERN_ERR "set %zu key %u: %s\n", + t - b->set, set->k, buf.buf); + } + + printbuf_exit(&buf); +} + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *b) +{ + struct bset_tree *t; + struct bkey_packed *k; + struct btree_nr_keys nr = { 0 }; + + for_each_bset(b, t) + bset_tree_for_each_key(b, t, k) + if (!bkey_deleted(k)) + btree_keys_account_key_add(&nr, t - b->set, k); + + BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); +} + +static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, + struct btree *b) +{ + struct btree_node_iter iter = *_iter; + const struct bkey_packed *k, *n; + + k = 
bch2_btree_node_iter_peek_all(&iter, b); + __bch2_btree_node_iter_advance(&iter, b); + n = bch2_btree_node_iter_peek_all(&iter, b); + + bkey_unpack_key(b, k); + + if (n && + bkey_iter_cmp(b, k, n) > 0) { + struct btree_node_iter_set *set; + struct bkey ku = bkey_unpack_key(b, k); + struct bkey nu = bkey_unpack_key(b, n); + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + + bch2_dump_btree_node(NULL, b); + bch2_bkey_to_text(&buf1, &ku); + bch2_bkey_to_text(&buf2, &nu); + printk(KERN_ERR "out of order/overlapping:\n%s\n%s\n", + buf1.buf, buf2.buf); + printk(KERN_ERR "iter was:"); + + btree_node_iter_for_each(_iter, set) { + struct bkey_packed *k = __btree_node_offset_to_key(b, set->k); + struct bset_tree *t = bch2_bkey_to_bset(b, k); + printk(" [%zi %zi]", t - b->set, + k->_data - bset(b, t)->_data); + } + panic("\n"); + } +} + +void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) +{ + struct btree_node_iter_set *set, *s2; + struct bkey_packed *k, *p; + struct bset_tree *t; + + if (bch2_btree_node_iter_end(iter)) + return; + + /* Verify no duplicates: */ + btree_node_iter_for_each(iter, set) { + BUG_ON(set->k > set->end); + btree_node_iter_for_each(iter, s2) + BUG_ON(set != s2 && set->end == s2->end); + } + + /* Verify that set->end is correct: */ + btree_node_iter_for_each(iter, set) { + for_each_bset(b, t) + if (set->end == t->end_offset) + goto found; + BUG(); +found: + BUG_ON(set->k < btree_bkey_first_offset(t) || + set->k >= t->end_offset); + } + + /* Verify iterator is sorted: */ + btree_node_iter_for_each(iter, set) + BUG_ON(set != iter->data && + btree_node_iter_cmp(b, set[-1], set[0]) > 0); + + k = bch2_btree_node_iter_peek_all(iter, b); + + for_each_bset(b, t) { + if (iter->data[0].end == t->end_offset) + continue; + + p = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); + + BUG_ON(p && bkey_iter_cmp(b, k, p) < 0); + } +} + +void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bkey_packed *insert, unsigned clobber_u64s) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, where); + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); + struct bkey_packed *next = (void *) (where->_data + clobber_u64s); + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; +#if 0 + BUG_ON(prev && + bkey_iter_cmp(b, prev, insert) > 0); +#else + if (prev && + bkey_iter_cmp(b, prev, insert) > 0) { + struct bkey k1 = bkey_unpack_key(b, prev); + struct bkey k2 = bkey_unpack_key(b, insert); + + bch2_dump_btree_node(NULL, b); + bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); + + panic("prev > insert:\n" + "prev key %s\n" + "insert key %s\n", + buf1.buf, buf2.buf); + } +#endif +#if 0 + BUG_ON(next != btree_bkey_last(b, t) && + bkey_iter_cmp(b, insert, next) > 0); +#else + if (next != btree_bkey_last(b, t) && + bkey_iter_cmp(b, insert, next) > 0) { + struct bkey k1 = bkey_unpack_key(b, insert); + struct bkey k2 = bkey_unpack_key(b, next); + + bch2_dump_btree_node(NULL, b); + bch2_bkey_to_text(&buf1, &k1); + bch2_bkey_to_text(&buf2, &k2); + + panic("insert > next:\n" + "insert key %s\n" + "next key %s\n", + buf1.buf, buf2.buf); + } +#endif +} + +#else + +static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, + struct btree *b) {} + +#endif + +/* Auxiliary search trees */ + +#define BFLOAT_FAILED_UNPACKED U8_MAX +#define BFLOAT_FAILED U8_MAX + +struct bkey_float { + u8 exponent; + u8 key_offset; + u16 mantissa; +}; +#define BKEY_MANTISSA_BITS 16 + 
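+/*
+ * A bkey_float is a 4 byte summary of one tree node's key: a 16 bit window
+ * (the mantissa) cut out of the packed key at bit offset ->exponent, plus the
+ * key's offset within its cacheline. bset_search_tree() compares mantissas on
+ * the fast path and only falls back to comparing real keys when the bfloat
+ * failed or the 16 bits weren't enough to order the keys.
+ *
+ * Roughly, on little endian (see bkey_mantissa() below for the real thing):
+ *
+ *	v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3)));
+ *	mantissa = (u16) (v >> (f->exponent & 7));
+ *
+ * Together with the one byte per node prev table this costs 5 bytes per
+ * cacheline of keys, which is what bset_ro_tree_capacity() divides by.
+ */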
+static unsigned bkey_float_byte_offset(unsigned idx) +{ + return idx * sizeof(struct bkey_float); +} + +struct ro_aux_tree { + struct bkey_float f[0]; +}; + +struct rw_aux_tree { + u16 offset; + struct bpos k; +}; + +static unsigned bset_aux_tree_buf_end(const struct bset_tree *t) +{ + BUG_ON(t->aux_data_offset == U16_MAX); + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + return t->aux_data_offset; + case BSET_RO_AUX_TREE: + return t->aux_data_offset + + DIV_ROUND_UP(t->size * sizeof(struct bkey_float) + + t->size * sizeof(u8), 8); + case BSET_RW_AUX_TREE: + return t->aux_data_offset + + DIV_ROUND_UP(sizeof(struct rw_aux_tree) * t->size, 8); + default: + BUG(); + } +} + +static unsigned bset_aux_tree_buf_start(const struct btree *b, + const struct bset_tree *t) +{ + return t == b->set + ? DIV_ROUND_UP(b->unpack_fn_len, 8) + : bset_aux_tree_buf_end(t - 1); +} + +static void *__aux_tree_base(const struct btree *b, + const struct bset_tree *t) +{ + return b->aux_data + t->aux_data_offset * 8; +} + +static struct ro_aux_tree *ro_aux_tree_base(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + return __aux_tree_base(b, t); +} + +static u8 *ro_aux_tree_prev(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RO_AUX_TREE); + + return __aux_tree_base(b, t) + bkey_float_byte_offset(t->size); +} + +static struct bkey_float *bkey_float(const struct btree *b, + const struct bset_tree *t, + unsigned idx) +{ + return ro_aux_tree_base(b, t)->f + idx; +} + +static void bset_aux_tree_verify(const struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + const struct bset_tree *t; + + for_each_bset(b, t) { + if (t->aux_data_offset == U16_MAX) + continue; + + BUG_ON(t != b->set && + t[-1].aux_data_offset == U16_MAX); + + BUG_ON(t->aux_data_offset < bset_aux_tree_buf_start(b, t)); + BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); + BUG_ON(bset_aux_tree_buf_end(t) > btree_aux_data_u64s(b)); + } +#endif +} + +void bch2_btree_keys_init(struct btree *b) +{ + unsigned i; + + b->nsets = 0; + memset(&b->nr, 0, sizeof(b->nr)); + + for (i = 0; i < MAX_BSETS; i++) + b->set[i].data_offset = U16_MAX; + + bch2_bset_set_no_aux_tree(b, b->set); +} + +/* Binary tree stuff for auxiliary search trees */ + +/* + * Cacheline/offset <-> bkey pointer arithmetic: + * + * t->tree is a binary search tree in an array; each node corresponds to a key + * in one cacheline in t->set (BSET_CACHELINE bytes). + * + * This means we don't have to store the full index of the key that a node in + * the binary tree points to; eytzinger1_to_inorder() gives us the cacheline, and + * then bkey_float->m gives us the offset within that cacheline, in units of 8 + * bytes. + * + * cacheline_to_bkey() and friends abstract out all the pointer arithmetic to + * make this work. + * + * To construct the bfloat for an arbitrary key we need to know what the key + * immediately preceding it is: we have to check if the two keys differ in the + * bits we're going to store in bkey_float->mantissa. t->prev[j] stores the size + * of the previous key so we can walk backwards to it from t->tree[j]'s key. 
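+ *
+ * The bkey_floats themselves are stored in eytzinger (breadth first) order:
+ * node j's children are 2j and 2j + 1, so the descent in bset_search_tree()
+ * is just n = n * 2 + (comparison result), and the top levels of the tree
+ * stay packed into a few adjacent cachelines - which is also why the search
+ * prefetches around f[n << 4], the descendants four levels down.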
+ */ + +static inline void *bset_cacheline(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline) +{ + return (void *) round_down((unsigned long) btree_bkey_first(b, t), + L1_CACHE_BYTES) + + cacheline * BSET_CACHELINE; +} + +static struct bkey_packed *cacheline_to_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + unsigned offset) +{ + return bset_cacheline(b, t, cacheline) + offset * 8; +} + +static unsigned bkey_to_cacheline(const struct btree *b, + const struct bset_tree *t, + const struct bkey_packed *k) +{ + return ((void *) k - bset_cacheline(b, t, 0)) / BSET_CACHELINE; +} + +static ssize_t __bkey_to_cacheline_offset(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + const struct bkey_packed *k) +{ + return (u64 *) k - (u64 *) bset_cacheline(b, t, cacheline); +} + +static unsigned bkey_to_cacheline_offset(const struct btree *b, + const struct bset_tree *t, + unsigned cacheline, + const struct bkey_packed *k) +{ + size_t m = __bkey_to_cacheline_offset(b, t, cacheline, k); + + EBUG_ON(m > U8_MAX); + return m; +} + +static inline struct bkey_packed *tree_to_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned j) +{ + return cacheline_to_bkey(b, t, + __eytzinger1_to_inorder(j, t->size - 1, t->extra), + bkey_float(b, t, j)->key_offset); +} + +static struct bkey_packed *tree_to_prev_bkey(const struct btree *b, + const struct bset_tree *t, + unsigned j) +{ + unsigned prev_u64s = ro_aux_tree_prev(b, t)[j]; + + return (void *) (tree_to_bkey(b, t, j)->_data - prev_u64s); +} + +static struct rw_aux_tree *rw_aux_tree(const struct btree *b, + const struct bset_tree *t) +{ + EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + + return __aux_tree_base(b, t); +} + +/* + * For the write set - the one we're currently inserting keys into - we don't + * maintain a full search tree, we just keep a simple lookup table in t->prev. + */ +static struct bkey_packed *rw_aux_to_bkey(const struct btree *b, + struct bset_tree *t, + unsigned j) +{ + return __btree_node_offset_to_key(b, rw_aux_tree(b, t)[j].offset); +} + +static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, + unsigned j, struct bkey_packed *k) +{ + EBUG_ON(k >= btree_bkey_last(b, t)); + + rw_aux_tree(b, t)[j] = (struct rw_aux_tree) { + .offset = __btree_node_key_to_offset(b, k), + .k = bkey_unpack_pos(b, k), + }; +} + +static void bch2_bset_verify_rw_aux_tree(struct btree *b, + struct bset_tree *t) +{ + struct bkey_packed *k = btree_bkey_first(b, t); + unsigned j = 0; + + if (!bch2_expensive_debug_checks) + return; + + BUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + BUG_ON(t->size < 1); + BUG_ON(rw_aux_to_bkey(b, t, j) != k); + + goto start; + while (1) { + if (rw_aux_to_bkey(b, t, j) == k) { + BUG_ON(!bpos_eq(rw_aux_tree(b, t)[j].k, + bkey_unpack_pos(b, k))); +start: + if (++j == t->size) + break; + + BUG_ON(rw_aux_tree(b, t)[j].offset <= + rw_aux_tree(b, t)[j - 1].offset); + } + + k = bkey_p_next(k); + BUG_ON(k >= btree_bkey_last(b, t)); + } +} + +/* returns idx of first entry >= offset: */ +static unsigned rw_aux_tree_bsearch(struct btree *b, + struct bset_tree *t, + unsigned offset) +{ + unsigned bset_offs = offset - btree_bkey_first_offset(t); + unsigned bset_u64s = t->end_offset - btree_bkey_first_offset(t); + unsigned idx = bset_u64s ? 
bset_offs * t->size / bset_u64s : 0; + + EBUG_ON(bset_aux_tree_type(t) != BSET_RW_AUX_TREE); + EBUG_ON(!t->size); + EBUG_ON(idx > t->size); + + while (idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset) + idx++; + + while (idx && + rw_aux_tree(b, t)[idx - 1].offset >= offset) + idx--; + + EBUG_ON(idx < t->size && + rw_aux_tree(b, t)[idx].offset < offset); + EBUG_ON(idx && rw_aux_tree(b, t)[idx - 1].offset >= offset); + EBUG_ON(idx + 1 < t->size && + rw_aux_tree(b, t)[idx].offset == + rw_aux_tree(b, t)[idx + 1].offset); + + return idx; +} + +static inline unsigned bkey_mantissa(const struct bkey_packed *k, + const struct bkey_float *f, + unsigned idx) +{ + u64 v; + + EBUG_ON(!bkey_packed(k)); + + v = get_unaligned((u64 *) (((u8 *) k->_data) + (f->exponent >> 3))); + + /* + * In little endian, we're shifting off low bits (and then the bits we + * want are at the low end), in big endian we're shifting off high bits + * (and then the bits we want are at the high end, so we shift them + * back down): + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + v >>= f->exponent & 7; +#else + v >>= 64 - (f->exponent & 7) - BKEY_MANTISSA_BITS; +#endif + return (u16) v; +} + +static __always_inline void make_bfloat(struct btree *b, struct bset_tree *t, + unsigned j, + struct bkey_packed *min_key, + struct bkey_packed *max_key) +{ + struct bkey_float *f = bkey_float(b, t, j); + struct bkey_packed *m = tree_to_bkey(b, t, j); + struct bkey_packed *l = is_power_of_2(j) + ? min_key + : tree_to_prev_bkey(b, t, j >> ffs(j)); + struct bkey_packed *r = is_power_of_2(j + 1) + ? max_key + : tree_to_bkey(b, t, j >> (ffz(j) + 1)); + unsigned mantissa; + int shift, exponent, high_bit; + + /* + * for failed bfloats, the lookup code falls back to comparing against + * the original key. 
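+ *
+ * That happens below when either neighbouring key (or the key itself) isn't
+ * packed, or when the format has no key bits at all: then there's no packed
+ * representation to take a 16 bit mantissa from.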
+ */ + + if (!bkey_packed(l) || !bkey_packed(r) || !bkey_packed(m) || + !b->nr_key_bits) { + f->exponent = BFLOAT_FAILED_UNPACKED; + return; + } + + /* + * The greatest differing bit of l and r is the first bit we must + * include in the bfloat mantissa we're creating in order to do + * comparisons - that bit always becomes the high bit of + * bfloat->mantissa, and thus the exponent we're calculating here is + * the position of what will become the low bit in bfloat->mantissa: + * + * Note that this may be negative - we may be running off the low end + * of the key: we handle this later: + */ + high_bit = max(bch2_bkey_greatest_differing_bit(b, l, r), + min_t(unsigned, BKEY_MANTISSA_BITS, b->nr_key_bits) - 1); + exponent = high_bit - (BKEY_MANTISSA_BITS - 1); + + /* + * Then we calculate the actual shift value, from the start of the key + * (k->_data), to get the key bits starting at exponent: + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + shift = (int) (b->format.key_u64s * 64 - b->nr_key_bits) + exponent; + + EBUG_ON(shift + BKEY_MANTISSA_BITS > b->format.key_u64s * 64); +#else + shift = high_bit_offset + + b->nr_key_bits - + exponent - + BKEY_MANTISSA_BITS; + + EBUG_ON(shift < KEY_PACKED_BITS_START); +#endif + EBUG_ON(shift < 0 || shift >= BFLOAT_FAILED); + + f->exponent = shift; + mantissa = bkey_mantissa(m, f, j); + + /* + * If we've got garbage bits, set them to all 1s - it's legal for the + * bfloat to compare larger than the original key, but not smaller: + */ + if (exponent < 0) + mantissa |= ~(~0U << -exponent); + + f->mantissa = mantissa; +} + +/* bytes remaining - only valid for last bset: */ +static unsigned __bset_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ + bset_aux_tree_verify(b); + + return btree_aux_data_bytes(b) - t->aux_data_offset * sizeof(u64); +} + +static unsigned bset_ro_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ + return __bset_tree_capacity(b, t) / + (sizeof(struct bkey_float) + sizeof(u8)); +} + +static unsigned bset_rw_tree_capacity(const struct btree *b, const struct bset_tree *t) +{ + return __bset_tree_capacity(b, t) / sizeof(struct rw_aux_tree); +} + +static noinline void __build_rw_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *k; + + t->size = 1; + t->extra = BSET_RW_AUX_TREE_VAL; + rw_aux_tree(b, t)[0].offset = + __btree_node_key_to_offset(b, btree_bkey_first(b, t)); + + bset_tree_for_each_key(b, t, k) { + if (t->size == bset_rw_tree_capacity(b, t)) + break; + + if ((void *) k - (void *) rw_aux_to_bkey(b, t, t->size - 1) > + L1_CACHE_BYTES) + rw_aux_tree_set(b, t, t->size++, k); + } +} + +static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t) +{ + struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t); + struct bkey_i min_key, max_key; + unsigned j, cacheline = 1; + + t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)), + bset_ro_tree_capacity(b, t)); +retry: + if (t->size < 2) { + t->size = 0; + t->extra = BSET_NO_AUX_TREE_VAL; + return; + } + + t->extra = (t->size - rounddown_pow_of_two(t->size - 1)) << 1; + + /* First we figure out where the first key in each cacheline is */ + eytzinger1_for_each(j, t->size - 1) { + while (bkey_to_cacheline(b, t, k) < cacheline) + prev = k, k = bkey_p_next(k); + + if (k >= btree_bkey_last(b, t)) { + /* XXX: this path sucks */ + t->size--; + goto retry; + } + + ro_aux_tree_prev(b, t)[j] = prev->u64s; + bkey_float(b, t, j)->key_offset = + bkey_to_cacheline_offset(b, t, cacheline++, k); + + 
EBUG_ON(tree_to_prev_bkey(b, t, j) != prev); + EBUG_ON(tree_to_bkey(b, t, j) != k); + } + + while (k != btree_bkey_last(b, t)) + prev = k, k = bkey_p_next(k); + + if (!bkey_pack_pos(bkey_to_packed(&min_key), b->data->min_key, b)) { + bkey_init(&min_key.k); + min_key.k.p = b->data->min_key; + } + + if (!bkey_pack_pos(bkey_to_packed(&max_key), b->data->max_key, b)) { + bkey_init(&max_key.k); + max_key.k.p = b->data->max_key; + } + + /* Then we build the tree */ + eytzinger1_for_each(j, t->size - 1) + make_bfloat(b, t, j, + bkey_to_packed(&min_key), + bkey_to_packed(&max_key)); +} + +static void bset_alloc_tree(struct btree *b, struct bset_tree *t) +{ + struct bset_tree *i; + + for (i = b->set; i != t; i++) + BUG_ON(bset_has_rw_aux_tree(i)); + + bch2_bset_set_no_aux_tree(b, t); + + /* round up to next cacheline: */ + t->aux_data_offset = round_up(bset_aux_tree_buf_start(b, t), + SMP_CACHE_BYTES / sizeof(u64)); + + bset_aux_tree_verify(b); +} + +void bch2_bset_build_aux_tree(struct btree *b, struct bset_tree *t, + bool writeable) +{ + if (writeable + ? bset_has_rw_aux_tree(t) + : bset_has_ro_aux_tree(t)) + return; + + bset_alloc_tree(b, t); + + if (!__bset_tree_capacity(b, t)) + return; + + if (writeable) + __build_rw_aux_tree(b, t); + else + __build_ro_aux_tree(b, t); + + bset_aux_tree_verify(b); +} + +void bch2_bset_init_first(struct btree *b, struct bset *i) +{ + struct bset_tree *t; + + BUG_ON(b->nsets); + + memset(i, 0, sizeof(*i)); + get_random_bytes(&i->seq, sizeof(i->seq)); + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + t = &b->set[b->nsets++]; + set_btree_bset(b, t, i); +} + +void bch2_bset_init_next(struct bch_fs *c, struct btree *b, + struct btree_node_entry *bne) +{ + struct bset *i = &bne->keys; + struct bset_tree *t; + + BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c)); + BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b))); + BUG_ON(b->nsets >= MAX_BSETS); + + memset(i, 0, sizeof(*i)); + i->seq = btree_bset_first(b)->seq; + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + t = &b->set[b->nsets++]; + set_btree_bset(b, t, i); +} + +/* + * find _some_ key in the same bset as @k that precedes @k - not necessarily the + * immediate predecessor: + */ +static struct bkey_packed *__bkey_prev(struct btree *b, struct bset_tree *t, + struct bkey_packed *k) +{ + struct bkey_packed *p; + unsigned offset; + int j; + + EBUG_ON(k < btree_bkey_first(b, t) || + k > btree_bkey_last(b, t)); + + if (k == btree_bkey_first(b, t)) + return NULL; + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + p = btree_bkey_first(b, t); + break; + case BSET_RO_AUX_TREE: + j = min_t(unsigned, t->size - 1, bkey_to_cacheline(b, t, k)); + + do { + p = j ? tree_to_bkey(b, t, + __inorder_to_eytzinger1(j--, + t->size - 1, t->extra)) + : btree_bkey_first(b, t); + } while (p >= k); + break; + case BSET_RW_AUX_TREE: + offset = __btree_node_key_to_offset(b, k); + j = rw_aux_tree_bsearch(b, t, offset); + p = j ? rw_aux_to_bkey(b, t, j - 1) + : btree_bkey_first(b, t); + break; + } + + return p; +} + +struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, + struct bset_tree *t, + struct bkey_packed *k, + unsigned min_key_type) +{ + struct bkey_packed *p, *i, *ret = NULL, *orig_k = k; + + while ((p = __bkey_prev(b, t, k)) && !ret) { + for (i = p; i != k; i = bkey_p_next(i)) + if (i->type >= min_key_type) + ret = i; + + k = p; + } + + if (bch2_expensive_debug_checks) { + BUG_ON(ret >= orig_k); + + for (i = ret + ? 
bkey_p_next(ret) + : btree_bkey_first(b, t); + i != orig_k; + i = bkey_p_next(i)) + BUG_ON(i->type >= min_key_type); + } + + return ret; +} + +/* Insert */ + +static void bch2_bset_fix_lookup_table(struct btree *b, + struct bset_tree *t, + struct bkey_packed *_where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + int shift = new_u64s - clobber_u64s; + unsigned l, j, where = __btree_node_key_to_offset(b, _where); + + EBUG_ON(bset_has_ro_aux_tree(t)); + + if (!bset_has_rw_aux_tree(t)) + return; + + /* returns first entry >= where */ + l = rw_aux_tree_bsearch(b, t, where); + + if (!l) /* never delete first entry */ + l++; + else if (l < t->size && + where < t->end_offset && + rw_aux_tree(b, t)[l].offset == where) + rw_aux_tree_set(b, t, l++, _where); + + /* l now > where */ + + for (j = l; + j < t->size && + rw_aux_tree(b, t)[j].offset < where + clobber_u64s; + j++) + ; + + if (j < t->size && + rw_aux_tree(b, t)[j].offset + shift == + rw_aux_tree(b, t)[l - 1].offset) + j++; + + memmove(&rw_aux_tree(b, t)[l], + &rw_aux_tree(b, t)[j], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[j]); + t->size -= j - l; + + for (j = l; j < t->size; j++) + rw_aux_tree(b, t)[j].offset += shift; + + EBUG_ON(l < t->size && + rw_aux_tree(b, t)[l].offset == + rw_aux_tree(b, t)[l - 1].offset); + + if (t->size < bset_rw_tree_capacity(b, t) && + (l < t->size + ? rw_aux_tree(b, t)[l].offset + : t->end_offset) - + rw_aux_tree(b, t)[l - 1].offset > + L1_CACHE_BYTES / sizeof(u64)) { + struct bkey_packed *start = rw_aux_to_bkey(b, t, l - 1); + struct bkey_packed *end = l < t->size + ? rw_aux_to_bkey(b, t, l) + : btree_bkey_last(b, t); + struct bkey_packed *k = start; + + while (1) { + k = bkey_p_next(k); + if (k == end) + break; + + if ((void *) k - (void *) start >= L1_CACHE_BYTES) { + memmove(&rw_aux_tree(b, t)[l + 1], + &rw_aux_tree(b, t)[l], + (void *) &rw_aux_tree(b, t)[t->size] - + (void *) &rw_aux_tree(b, t)[l]); + t->size++; + rw_aux_tree_set(b, t, l, k); + break; + } + } + } + + bch2_bset_verify_rw_aux_tree(b, t); + bset_aux_tree_verify(b); +} + +void bch2_bset_insert(struct btree *b, + struct btree_node_iter *iter, + struct bkey_packed *where, + struct bkey_i *insert, + unsigned clobber_u64s) +{ + struct bkey_format *f = &b->format; + struct bset_tree *t = bset_tree_last(b); + struct bkey_packed packed, *src = bkey_to_packed(insert); + + bch2_bset_verify_rw_aux_tree(b, t); + bch2_verify_insert_pos(b, where, bkey_to_packed(insert), clobber_u64s); + + if (bch2_bkey_pack_key(&packed, &insert->k, f)) + src = &packed; + + if (!bkey_deleted(&insert->k)) + btree_keys_account_key_add(&b->nr, t - b->set, src); + + if (src->u64s != clobber_u64s) { + u64 *src_p = where->_data + clobber_u64s; + u64 *dst_p = where->_data + src->u64s; + + EBUG_ON((int) le16_to_cpu(bset(b, t)->u64s) < + (int) clobber_u64s - src->u64s); + + memmove_u64s(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); + le16_add_cpu(&bset(b, t)->u64s, src->u64s - clobber_u64s); + set_btree_bset_end(b, t); + } + + memcpy_u64s_small(where, src, + bkeyp_key_u64s(f, src)); + memcpy_u64s(bkeyp_val(f, where), &insert->v, + bkeyp_val_u64s(f, src)); + + if (src->u64s != clobber_u64s) + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, src->u64s); + + bch2_verify_btree_nr_keys(b); +} + +void bch2_bset_delete(struct btree *b, + struct bkey_packed *where, + unsigned clobber_u64s) +{ + struct bset_tree *t = bset_tree_last(b); + u64 *src_p = where->_data + clobber_u64s; + u64 *dst_p = where->_data; + + bch2_bset_verify_rw_aux_tree(b, t); + 
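+	/*
+	 * Unlike bch2_bset_insert(), nothing new is copied in here: the
+	 * EBUG_ON below checks we aren't removing more u64s than the bset
+	 * holds, the memmove shifts the following keys down over the gap, and
+	 * the rw aux tree offsets are fixed up with new_u64s == 0.
+	 */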
+ EBUG_ON(le16_to_cpu(bset(b, t)->u64s) < clobber_u64s); + + memmove_u64s_down(dst_p, src_p, btree_bkey_last(b, t)->_data - src_p); + le16_add_cpu(&bset(b, t)->u64s, -clobber_u64s); + set_btree_bset_end(b, t); + + bch2_bset_fix_lookup_table(b, t, where, clobber_u64s, 0); +} + +/* Lookup */ + +__flatten +static struct bkey_packed *bset_search_write_set(const struct btree *b, + struct bset_tree *t, + struct bpos *search) +{ + unsigned l = 0, r = t->size; + + while (l + 1 != r) { + unsigned m = (l + r) >> 1; + + if (bpos_lt(rw_aux_tree(b, t)[m].k, *search)) + l = m; + else + r = m; + } + + return rw_aux_to_bkey(b, t, l); +} + +static inline void prefetch_four_cachelines(void *p) +{ +#ifdef CONFIG_X86_64 + asm("prefetcht0 (-127 + 64 * 0)(%0);" + "prefetcht0 (-127 + 64 * 1)(%0);" + "prefetcht0 (-127 + 64 * 2)(%0);" + "prefetcht0 (-127 + 64 * 3)(%0);" + : + : "r" (p + 127)); +#else + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + prefetch(p + L1_CACHE_BYTES * 3); +#endif +} + +static inline bool bkey_mantissa_bits_dropped(const struct btree *b, + const struct bkey_float *f, + unsigned idx) +{ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + unsigned key_bits_start = b->format.key_u64s * 64 - b->nr_key_bits; + + return f->exponent > key_bits_start; +#else + unsigned key_bits_end = high_bit_offset + b->nr_key_bits; + + return f->exponent + BKEY_MANTISSA_BITS < key_bits_end; +#endif +} + +__flatten +static struct bkey_packed *bset_search_tree(const struct btree *b, + const struct bset_tree *t, + const struct bpos *search, + const struct bkey_packed *packed_search) +{ + struct ro_aux_tree *base = ro_aux_tree_base(b, t); + struct bkey_float *f; + struct bkey_packed *k; + unsigned inorder, n = 1, l, r; + int cmp; + + do { + if (likely(n << 4 < t->size)) + prefetch(&base->f[n << 4]); + + f = &base->f[n]; + if (unlikely(f->exponent >= BFLOAT_FAILED)) + goto slowpath; + + l = f->mantissa; + r = bkey_mantissa(packed_search, f, n); + + if (unlikely(l == r) && bkey_mantissa_bits_dropped(b, f, n)) + goto slowpath; + + n = n * 2 + (l < r); + continue; +slowpath: + k = tree_to_bkey(b, t, n); + cmp = bkey_cmp_p_or_unp(b, k, packed_search, search); + if (!cmp) + return k; + + n = n * 2 + (cmp < 0); + } while (n < t->size); + + inorder = __eytzinger1_to_inorder(n >> 1, t->size - 1, t->extra); + + /* + * n would have been the node we recursed to - the low bit tells us if + * we recursed left or recursed right. + */ + if (likely(!(n & 1))) { + --inorder; + if (unlikely(!inorder)) + return btree_bkey_first(b, t); + + f = &base->f[eytzinger1_prev(n >> 1, t->size - 1)]; + } + + return cacheline_to_bkey(b, t, inorder, f->key_offset); +} + +static __always_inline __flatten +struct bkey_packed *__bch2_bset_search(struct btree *b, + struct bset_tree *t, + struct bpos *search, + const struct bkey_packed *lossy_packed_search) +{ + + /* + * First, we search for a cacheline, then lastly we do a linear search + * within that cacheline. + * + * To search for the cacheline, there's three different possibilities: + * * The set is too small to have a search tree, so we just do a linear + * search over the whole set. + * * The set is the one we're currently inserting into; keeping a full + * auxiliary search tree up to date would be too expensive, so we + * use a much simpler lookup table to do a binary search - + * bset_search_write_set(). 
+ * * Or we use the auxiliary search tree we constructed earlier - + * bset_search_tree() + */ + + switch (bset_aux_tree_type(t)) { + case BSET_NO_AUX_TREE: + return btree_bkey_first(b, t); + case BSET_RW_AUX_TREE: + return bset_search_write_set(b, t, search); + case BSET_RO_AUX_TREE: + return bset_search_tree(b, t, search, lossy_packed_search); + default: + unreachable(); + } +} + +static __always_inline __flatten +struct bkey_packed *bch2_bset_search_linear(struct btree *b, + struct bset_tree *t, + struct bpos *search, + struct bkey_packed *packed_search, + const struct bkey_packed *lossy_packed_search, + struct bkey_packed *m) +{ + if (lossy_packed_search) + while (m != btree_bkey_last(b, t) && + bkey_iter_cmp_p_or_unp(b, m, + lossy_packed_search, search) < 0) + m = bkey_p_next(m); + + if (!packed_search) + while (m != btree_bkey_last(b, t) && + bkey_iter_pos_cmp(b, m, search) < 0) + m = bkey_p_next(m); + + if (bch2_expensive_debug_checks) { + struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); + + BUG_ON(prev && + bkey_iter_cmp_p_or_unp(b, prev, + packed_search, search) >= 0); + } + + return m; +} + +/* Btree node iterator */ + +static inline void __bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + if (k != end) { + struct btree_node_iter_set *pos; + + btree_node_iter_for_each(iter, pos) + ; + + BUG_ON(pos >= iter->data + ARRAY_SIZE(iter->data)); + *pos = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k), + __btree_node_key_to_offset(b, end) + }; + } +} + +void bch2_btree_node_iter_push(struct btree_node_iter *iter, + struct btree *b, + const struct bkey_packed *k, + const struct bkey_packed *end) +{ + __bch2_btree_node_iter_push(iter, b, k, end); + bch2_btree_node_iter_sort(iter, b); +} + +noinline __flatten __cold +static void btree_node_iter_init_pack_failed(struct btree_node_iter *iter, + struct btree *b, struct bpos *search) +{ + struct bkey_packed *k; + + trace_bkey_pack_pos_fail(search); + + bch2_btree_node_iter_init_from_start(iter, b); + + while ((k = bch2_btree_node_iter_peek(iter, b)) && + bkey_iter_pos_cmp(b, k, search) < 0) + bch2_btree_node_iter_advance(iter, b); +} + +/** + * bch_btree_node_iter_init - initialize a btree node iterator, starting from a + * given position + * + * Main entry point to the lookup code for individual btree nodes: + * + * NOTE: + * + * When you don't filter out deleted keys, btree nodes _do_ contain duplicate + * keys. This doesn't matter for most code, but it does matter for lookups. + * + * Some adjacent keys with a string of equal keys: + * i j k k k k l m + * + * If you search for k, the lookup code isn't guaranteed to return you any + * specific k. The lookup code is conceptually doing a binary search and + * iterating backwards is very expensive so if the pivot happens to land at the + * last k that's what you'll get. + * + * This works out ok, but it's something to be aware of: + * + * - For non extents, we guarantee that the live key comes last - see + * btree_node_iter_cmp(), keys_out_of_order(). So the duplicates you don't + * see will only be deleted keys you don't care about. + * + * - For extents, deleted keys sort last (see the comment at the top of this + * file). But when you're searching for extents, you actually want the first + * key strictly greater than your search key - an extent that compares equal + * to the search key is going to have 0 sectors after the search key. 
+ * + * But this does mean that we can't just search for + * bpos_successor(start_of_range) to get the first extent that overlaps with + * the range we want - if we're unlucky and there's an extent that ends + * exactly where we searched, then there could be a deleted key at the same + * position and we'd get that when we search instead of the preceding extent + * we needed. + * + * So we've got to search for start_of_range, then after the lookup iterate + * past any extents that compare equal to the position we searched for. + */ +__flatten +void bch2_btree_node_iter_init(struct btree_node_iter *iter, + struct btree *b, struct bpos *search) +{ + struct bkey_packed p, *packed_search = NULL; + struct btree_node_iter_set *pos = iter->data; + struct bkey_packed *k[MAX_BSETS]; + unsigned i; + + EBUG_ON(bpos_lt(*search, b->data->min_key)); + EBUG_ON(bpos_gt(*search, b->data->max_key)); + bset_aux_tree_verify(b); + + memset(iter, 0, sizeof(*iter)); + + switch (bch2_bkey_pack_pos_lossy(&p, *search, b)) { + case BKEY_PACK_POS_EXACT: + packed_search = &p; + break; + case BKEY_PACK_POS_SMALLER: + packed_search = NULL; + break; + case BKEY_PACK_POS_FAIL: + btree_node_iter_init_pack_failed(iter, b, search); + return; + } + + for (i = 0; i < b->nsets; i++) { + k[i] = __bch2_bset_search(b, b->set + i, search, &p); + prefetch_four_cachelines(k[i]); + } + + for (i = 0; i < b->nsets; i++) { + struct bset_tree *t = b->set + i; + struct bkey_packed *end = btree_bkey_last(b, t); + + k[i] = bch2_bset_search_linear(b, t, search, + packed_search, &p, k[i]); + if (k[i] != end) + *pos++ = (struct btree_node_iter_set) { + __btree_node_key_to_offset(b, k[i]), + __btree_node_key_to_offset(b, end) + }; + } + + bch2_btree_node_iter_sort(iter, b); +} + +void bch2_btree_node_iter_init_from_start(struct btree_node_iter *iter, + struct btree *b) +{ + struct bset_tree *t; + + memset(iter, 0, sizeof(*iter)); + + for_each_bset(b, t) + __bch2_btree_node_iter_push(iter, b, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + bch2_btree_node_iter_sort(iter, b); +} + +struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t) +{ + struct btree_node_iter_set *set; + + btree_node_iter_for_each(iter, set) + if (set->end == t->end_offset) + return __btree_node_offset_to_key(b, set->k); + + return btree_bkey_last(b, t); +} + +static inline bool btree_node_iter_sort_two(struct btree_node_iter *iter, + struct btree *b, + unsigned first) +{ + bool ret; + + if ((ret = (btree_node_iter_cmp(b, + iter->data[first], + iter->data[first + 1]) > 0))) + swap(iter->data[first], iter->data[first + 1]); + return ret; +} + +void bch2_btree_node_iter_sort(struct btree_node_iter *iter, + struct btree *b) +{ + /* unrolled bubble sort: */ + + if (!__btree_node_iter_set_end(iter, 2)) { + btree_node_iter_sort_two(iter, b, 0); + btree_node_iter_sort_two(iter, b, 1); + } + + if (!__btree_node_iter_set_end(iter, 1)) + btree_node_iter_sort_two(iter, b, 0); +} + +void bch2_btree_node_iter_set_drop(struct btree_node_iter *iter, + struct btree_node_iter_set *set) +{ + struct btree_node_iter_set *last = + iter->data + ARRAY_SIZE(iter->data) - 1; + + memmove(&set[0], &set[1], (void *) last - (void *) set); + *last = (struct btree_node_iter_set) { 0, 0 }; +} + +static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) +{ + iter->data->k += __bch2_btree_node_iter_peek_all(iter, b)->u64s; + + EBUG_ON(iter->data->k > iter->data->end); + + if 
(unlikely(__btree_node_iter_set_end(iter, 0))) { + /* avoid an expensive memmove call: */ + iter->data[0] = iter->data[1]; + iter->data[1] = iter->data[2]; + iter->data[2] = (struct btree_node_iter_set) { 0, 0 }; + return; + } + + if (__btree_node_iter_set_end(iter, 1)) + return; + + if (!btree_node_iter_sort_two(iter, b, 0)) + return; + + if (__btree_node_iter_set_end(iter, 2)) + return; + + btree_node_iter_sort_two(iter, b, 1); +} + +void bch2_btree_node_iter_advance(struct btree_node_iter *iter, + struct btree *b) +{ + if (bch2_expensive_debug_checks) { + bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_next_check(iter, b); + } + + __bch2_btree_node_iter_advance(iter, b); +} + +/* + * Expensive: + */ +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, + struct btree *b) +{ + struct bkey_packed *k, *prev = NULL; + struct btree_node_iter_set *set; + struct bset_tree *t; + unsigned end = 0; + + if (bch2_expensive_debug_checks) + bch2_btree_node_iter_verify(iter, b); + + for_each_bset(b, t) { + k = bch2_bkey_prev_all(b, t, + bch2_btree_node_iter_bset_pos(iter, b, t)); + if (k && + (!prev || bkey_iter_cmp(b, k, prev) > 0)) { + prev = k; + end = t->end_offset; + } + } + + if (!prev) + return NULL; + + /* + * We're manually memmoving instead of just calling sort() to ensure the + * prev we picked ends up in slot 0 - sort won't necessarily put it + * there because of duplicate deleted keys: + */ + btree_node_iter_for_each(iter, set) + if (set->end == end) + goto found; + + BUG_ON(set != &iter->data[__btree_node_iter_used(iter)]); +found: + BUG_ON(set >= iter->data + ARRAY_SIZE(iter->data)); + + memmove(&iter->data[1], + &iter->data[0], + (void *) set - (void *) &iter->data[0]); + + iter->data[0].k = __btree_node_key_to_offset(b, prev); + iter->data[0].end = end; + + if (bch2_expensive_debug_checks) + bch2_btree_node_iter_verify(iter, b); + return prev; +} + +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *iter, + struct btree *b) +{ + struct bkey_packed *prev; + + do { + prev = bch2_btree_node_iter_prev_all(iter, b); + } while (prev && bkey_deleted(prev)); + + return prev; +} + +struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *iter, + struct btree *b, + struct bkey *u) +{ + struct bkey_packed *k = bch2_btree_node_iter_peek(iter, b); + + return k ? 
bkey_disassemble(b, k, u) : bkey_s_c_null; +} + +/* Mergesort */ + +void bch2_btree_keys_stats(const struct btree *b, struct bset_stats *stats) +{ + const struct bset_tree *t; + + for_each_bset(b, t) { + enum bset_aux_tree_type type = bset_aux_tree_type(t); + size_t j; + + stats->sets[type].nr++; + stats->sets[type].bytes += le16_to_cpu(bset(b, t)->u64s) * + sizeof(u64); + + if (bset_has_ro_aux_tree(t)) { + stats->floats += t->size - 1; + + for (j = 1; j < t->size; j++) + stats->failed += + bkey_float(b, t, j)->exponent == + BFLOAT_FAILED; + } + } +} + +void bch2_bfloat_to_text(struct printbuf *out, struct btree *b, + struct bkey_packed *k) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, k); + struct bkey uk; + unsigned j, inorder; + + if (!bset_has_ro_aux_tree(t)) + return; + + inorder = bkey_to_cacheline(b, t, k); + if (!inorder || inorder >= t->size) + return; + + j = __inorder_to_eytzinger1(inorder, t->size - 1, t->extra); + if (k != tree_to_bkey(b, t, j)) + return; + + switch (bkey_float(b, t, j)->exponent) { + case BFLOAT_FAILED: + uk = bkey_unpack_key(b, k); + prt_printf(out, + " failed unpacked at depth %u\n" + "\t", + ilog2(j)); + bch2_bpos_to_text(out, uk.p); + prt_printf(out, "\n"); + break; + } +} diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h new file mode 100644 index 000000000..632c2b8c5 --- /dev/null +++ b/fs/bcachefs/bset.h @@ -0,0 +1,541 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BSET_H +#define _BCACHEFS_BSET_H + +#include +#include + +#include "bcachefs.h" +#include "bkey.h" +#include "bkey_methods.h" +#include "btree_types.h" +#include "util.h" /* for time_stats */ +#include "vstructs.h" + +/* + * BKEYS: + * + * A bkey contains a key, a size field, a variable number of pointers, and some + * ancillary flag bits. + * + * We use two different functions for validating bkeys, bkey_invalid and + * bkey_deleted(). + * + * The one exception to the rule that ptr_invalid() filters out invalid keys is + * that it also filters out keys of size 0 - these are keys that have been + * completely overwritten. It'd be safe to delete these in memory while leaving + * them on disk, just unnecessary work - so we filter them out when resorting + * instead. + * + * We can't filter out stale keys when we're resorting, because garbage + * collection needs to find them to ensure bucket gens don't wrap around - + * unless we're rewriting the btree node those stale keys still exist on disk. + * + * We also implement functions here for removing some number of sectors from the + * front or the back of a bkey - this is mainly used for fixing overlapping + * extents, by removing the overlapping sectors from the older key. + * + * BSETS: + * + * A bset is an array of bkeys laid out contiguously in memory in sorted order, + * along with a header. A btree node is made up of a number of these, written at + * different times. + * + * There could be many of them on disk, but we never allow there to be more than + * 4 in memory - we lazily resort as needed. + * + * We implement code here for creating and maintaining auxiliary search trees + * (described below) for searching an individial bset, and on top of that we + * implement a btree iterator. + * + * BTREE ITERATOR: + * + * Most of the code in bcache doesn't care about an individual bset - it needs + * to search entire btree nodes and iterate over them in sorted order. 
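+ *
+ * As a quick taste of that interface (an editor's sketch, not part of this
+ * patch - it uses only the iterator helpers declared further down in this
+ * header, so it is shown out of order purely for illustration), counting the
+ * live keys of a node looks like:
+ */
+
+/* Illustrative sketch only -- not from the original patch: */
+static inline unsigned example_count_live_keys(struct btree *b)
+{
+	struct btree_node_iter iter;
+	struct bkey_packed *k;
+	unsigned nr = 0;
+
+	/* for_each_btree_node_key() skips deleted keys */
+	for_each_btree_node_key(b, k, &iter)
+		nr++;
+
+	return nr;
+}
+
+/*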
+ * + * The btree iterator code serves both functions; it iterates through the keys + * in a btree node in sorted order, starting from either keys after a specific + * point (if you pass it a search key) or the start of the btree node. + * + * AUXILIARY SEARCH TREES: + * + * Since keys are variable length, we can't use a binary search on a bset - we + * wouldn't be able to find the start of the next key. But binary searches are + * slow anyways, due to terrible cache behaviour; bcache originally used binary + * searches and that code topped out at under 50k lookups/second. + * + * So we need to construct some sort of lookup table. Since we only insert keys + * into the last (unwritten) set, most of the keys within a given btree node are + * usually in sets that are mostly constant. We use two different types of + * lookup tables to take advantage of this. + * + * Both lookup tables share in common that they don't index every key in the + * set; they index one key every BSET_CACHELINE bytes, and then a linear search + * is used for the rest. + * + * For sets that have been written to disk and are no longer being inserted + * into, we construct a binary search tree in an array - traversing a binary + * search tree in an array gives excellent locality of reference and is very + * fast, since both children of any node are adjacent to each other in memory + * (and their grandchildren, and great grandchildren...) - this means + * prefetching can be used to great effect. + * + * It's quite useful performance wise to keep these nodes small - not just + * because they're more likely to be in L2, but also because we can prefetch + * more nodes on a single cacheline and thus prefetch more iterations in advance + * when traversing this tree. + * + * Nodes in the auxiliary search tree must contain both a key to compare against + * (we don't want to fetch the key from the set, that would defeat the purpose), + * and a pointer to the key. We use a few tricks to compress both of these. + * + * To compress the pointer, we take advantage of the fact that one node in the + * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have + * a function (to_inorder()) that takes the index of a node in a binary tree and + * returns what its index would be in an inorder traversal, so we only have to + * store the low bits of the offset. + * + * The key is 84 bits (KEY_DEV + key->key, the offset on the device). To + * compress that, we take advantage of the fact that when we're traversing the + * search tree at every iteration we know that both our search key and the key + * we're looking for lie within some range - bounded by our previous + * comparisons. (We special case the start of a search so that this is true even + * at the root of the tree). + * + * So we know the key we're looking for is between a and b, and a and b don't + * differ higher than bit 50, we don't need to check anything higher than bit + * 50. + * + * We don't usually need the rest of the bits, either; we only need enough bits + * to partition the key range we're currently checking. Consider key n - the + * key our auxiliary search tree node corresponds to, and key p, the key + * immediately preceding n. The lowest bit we need to store in the auxiliary + * search tree is the highest bit that differs between n and p. + * + * Note that this could be bit 0 - we might sometimes need all 80 bits to do the + * comparison. But we'd really like our nodes in the auxiliary search tree to be + * of fixed size. 
+ *
+ * The solution is to make them fixed size, and when we're constructing a node
+ * check if p and n differed in the bits we needed them to. If they don't we
+ * flag that node, and when doing lookups we fall back to comparing against the
+ * real key. As long as this doesn't happen too often (and it seems to reliably
+ * happen a bit less than 1% of the time), we win - even on failures, that key
+ * is then more likely to be in cache than if we were doing binary searches all
+ * the way, since we're touching so much less memory.
+ *
+ * The keys in the auxiliary search tree are stored in (software) floating
+ * point, with an exponent and a mantissa. The exponent needs to be big enough
+ * to address all the bits in the original key, but the number of bits in the
+ * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
+ *
+ * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
+ * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
+ * We need one node per 128 bytes in the btree node, which means the auxiliary
+ * search trees take up 3% as much memory as the btree itself.
+ *
+ * Constructing these auxiliary search trees is moderately expensive, and we
+ * don't want to be constantly rebuilding the search tree for the last set
+ * whenever we insert another key into it. For the unwritten set, we use a much
+ * simpler lookup table - it's just a flat array, so index i in the lookup table
+ * corresponds to the i-th range of BSET_CACHELINE bytes in the set. Indexing
+ * within each byte range works the same as with the auxiliary search trees.
+ *
+ * These are much easier to keep up to date when we insert a key - we do it
+ * somewhat lazily; when we shift a key up we usually just increment the pointer
+ * to it, only when it would overflow do we go to the trouble of finding the
+ * first key in that range of bytes again.
+ */
+
+enum bset_aux_tree_type {
+	BSET_NO_AUX_TREE,
+	BSET_RO_AUX_TREE,
+	BSET_RW_AUX_TREE,
+};
+
+#define BSET_TREE_NR_TYPES 3
+
+#define BSET_NO_AUX_TREE_VAL (U16_MAX)
+#define BSET_RW_AUX_TREE_VAL (U16_MAX - 1)
+
+static inline enum bset_aux_tree_type bset_aux_tree_type(const struct bset_tree *t)
+{
+	switch (t->extra) {
+	case BSET_NO_AUX_TREE_VAL:
+		EBUG_ON(t->size);
+		return BSET_NO_AUX_TREE;
+	case BSET_RW_AUX_TREE_VAL:
+		EBUG_ON(!t->size);
+		return BSET_RW_AUX_TREE;
+	default:
+		EBUG_ON(!t->size);
+		return BSET_RO_AUX_TREE;
+	}
+}
+
+/*
+ * BSET_CACHELINE was originally intended to match the hardware cacheline size -
+ * it used to be 64, but I realized the lookup code would touch slightly less
+ * memory if it was 128.
+ *
+ * It defines the number of bytes (in struct bset) per struct bkey_float in
+ * the auxiliary search tree - when we're done searching the bset_float tree we
+ * have this many bytes left that we do a linear search over.
+ *
+ * Since (after level 5) every level of the bset_tree is on a new cacheline,
+ * we're touching one fewer cacheline in the bset tree in exchange for one more
+ * cacheline in the linear search - but the linear search might stop before it
+ * gets to the second cacheline.
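+ *
+ * To make the eytzinger-array descent described above concrete, here is a
+ * minimal editor's illustration (not part of this patch, and deliberately
+ * ignoring the bkey_float compression): children of node j live at indices
+ * 2*j and 2*j + 1, so the lookup only ever moves to adjacent memory and can
+ * prefetch both children with a single call.
+ */
+
+/* Illustrative sketch only -- not from the original patch: */
+static inline unsigned eytzinger1_lower_bound_example(const unsigned *tree,
+						      unsigned n, unsigned search)
+{
+	unsigned j = 1;
+
+	while (j <= n) {
+		/* both children are adjacent; one prefetch covers them */
+		__builtin_prefetch(&tree[2 * j]);
+
+		/* go right if this key is too small, else left */
+		j = 2 * j + (tree[j] < search);
+	}
+
+	/*
+	 * j encodes the search path in binary (left = 0, right = 1):
+	 * shifting off the trailing run of right-turns and the left-turn
+	 * above it lands on the last node whose key was >= search - i.e.
+	 * the lower bound - or on 0 if every key was smaller.
+	 */
+	return j >> __builtin_ffs(~j);
+}
+
+/*
+ * The real read-only aux tree search (bset_search_tree() in bset.c) follows
+ * the same shape, but compares against packed bkey_float mantissas and falls
+ * back to the full key on BFLOAT_FAILED nodes, as described above.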
+ */ + +#define BSET_CACHELINE 256 + +static inline size_t btree_keys_cachelines(const struct btree *b) +{ + return (1U << b->byte_order) / BSET_CACHELINE; +} + +static inline size_t btree_aux_data_bytes(const struct btree *b) +{ + return btree_keys_cachelines(b) * 8; +} + +static inline size_t btree_aux_data_u64s(const struct btree *b) +{ + return btree_aux_data_bytes(b) / sizeof(u64); +} + +#define for_each_bset(_b, _t) \ + for (_t = (_b)->set; _t < (_b)->set + (_b)->nsets; _t++) + +#define bset_tree_for_each_key(_b, _t, _k) \ + for (_k = btree_bkey_first(_b, _t); \ + _k != btree_bkey_last(_b, _t); \ + _k = bkey_p_next(_k)) + +static inline bool bset_has_ro_aux_tree(const struct bset_tree *t) +{ + return bset_aux_tree_type(t) == BSET_RO_AUX_TREE; +} + +static inline bool bset_has_rw_aux_tree(struct bset_tree *t) +{ + return bset_aux_tree_type(t) == BSET_RW_AUX_TREE; +} + +static inline void bch2_bset_set_no_aux_tree(struct btree *b, + struct bset_tree *t) +{ + BUG_ON(t < b->set); + + for (; t < b->set + ARRAY_SIZE(b->set); t++) { + t->size = 0; + t->extra = BSET_NO_AUX_TREE_VAL; + t->aux_data_offset = U16_MAX; + } +} + +static inline void btree_node_set_format(struct btree *b, + struct bkey_format f) +{ + int len; + + b->format = f; + b->nr_key_bits = bkey_format_key_bits(&f); + + len = bch2_compile_bkey_format(&b->format, b->aux_data); + BUG_ON(len < 0 || len > U8_MAX); + + b->unpack_fn_len = len; + + bch2_bset_set_no_aux_tree(b, b->set); +} + +static inline struct bset *bset_next_set(struct btree *b, + unsigned block_bytes) +{ + struct bset *i = btree_bset_last(b); + + EBUG_ON(!is_power_of_2(block_bytes)); + + return ((void *) i) + round_up(vstruct_bytes(i), block_bytes); +} + +void bch2_btree_keys_init(struct btree *); + +void bch2_bset_init_first(struct btree *, struct bset *); +void bch2_bset_init_next(struct bch_fs *, struct btree *, + struct btree_node_entry *); +void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool); + +void bch2_bset_insert(struct btree *, struct btree_node_iter *, + struct bkey_packed *, struct bkey_i *, unsigned); +void bch2_bset_delete(struct btree *, struct bkey_packed *, unsigned); + +/* Bkey utility code */ + +/* packed or unpacked */ +static inline int bkey_cmp_p_or_unp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r_packed, + const struct bpos *r) +{ + EBUG_ON(r_packed && !bkey_packed(r_packed)); + + if (unlikely(!bkey_packed(l))) + return bpos_cmp(packed_to_bkey_c(l)->p, *r); + + if (likely(r_packed)) + return __bch2_bkey_cmp_packed_format_checked(l, r_packed, b); + + return __bch2_bkey_cmp_left_packed_format_checked(b, l, r); +} + +static inline struct bset_tree * +bch2_bkey_to_bset_inlined(struct btree *b, struct bkey_packed *k) +{ + unsigned offset = __btree_node_key_to_offset(b, k); + struct bset_tree *t; + + for_each_bset(b, t) + if (offset <= t->end_offset) { + EBUG_ON(offset < btree_bkey_first_offset(t)); + return t; + } + + BUG(); +} + +struct bset_tree *bch2_bkey_to_bset(struct btree *, struct bkey_packed *); + +struct bkey_packed *bch2_bkey_prev_filter(struct btree *, struct bset_tree *, + struct bkey_packed *, unsigned); + +static inline struct bkey_packed * +bch2_bkey_prev_all(struct btree *b, struct bset_tree *t, struct bkey_packed *k) +{ + return bch2_bkey_prev_filter(b, t, k, 0); +} + +static inline struct bkey_packed * +bch2_bkey_prev(struct btree *b, struct bset_tree *t, struct bkey_packed *k) +{ + return bch2_bkey_prev_filter(b, t, k, 1); +} + +/* Btree key iteration */ + +void 
bch2_btree_node_iter_push(struct btree_node_iter *, struct btree *, + const struct bkey_packed *, + const struct bkey_packed *); +void bch2_btree_node_iter_init(struct btree_node_iter *, struct btree *, + struct bpos *); +void bch2_btree_node_iter_init_from_start(struct btree_node_iter *, + struct btree *); +struct bkey_packed *bch2_btree_node_iter_bset_pos(struct btree_node_iter *, + struct btree *, + struct bset_tree *); + +void bch2_btree_node_iter_sort(struct btree_node_iter *, struct btree *); +void bch2_btree_node_iter_set_drop(struct btree_node_iter *, + struct btree_node_iter_set *); +void bch2_btree_node_iter_advance(struct btree_node_iter *, struct btree *); + +#define btree_node_iter_for_each(_iter, _set) \ + for (_set = (_iter)->data; \ + _set < (_iter)->data + ARRAY_SIZE((_iter)->data) && \ + (_set)->k != (_set)->end; \ + _set++) + +static inline bool __btree_node_iter_set_end(struct btree_node_iter *iter, + unsigned i) +{ + return iter->data[i].k == iter->data[i].end; +} + +static inline bool bch2_btree_node_iter_end(struct btree_node_iter *iter) +{ + return __btree_node_iter_set_end(iter, 0); +} + +/* + * When keys compare equal, deleted keys compare first: + * + * XXX: only need to compare pointers for keys that are both within a + * btree_node_iterator - we need to break ties for prev() to work correctly + */ +static inline int bkey_iter_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r) +{ + return bch2_bkey_cmp_packed(b, l, r) + ?: (int) bkey_deleted(r) - (int) bkey_deleted(l) + ?: cmp_int(l, r); +} + +static inline int btree_node_iter_cmp(const struct btree *b, + struct btree_node_iter_set l, + struct btree_node_iter_set r) +{ + return bkey_iter_cmp(b, + __btree_node_offset_to_key(b, l.k), + __btree_node_offset_to_key(b, r.k)); +} + +/* These assume r (the search key) is not a deleted key: */ +static inline int bkey_iter_pos_cmp(const struct btree *b, + const struct bkey_packed *l, + const struct bpos *r) +{ + return bkey_cmp_left_packed(b, l, r) + ?: -((int) bkey_deleted(l)); +} + +static inline int bkey_iter_cmp_p_or_unp(const struct btree *b, + const struct bkey_packed *l, + const struct bkey_packed *r_packed, + const struct bpos *r) +{ + return bkey_cmp_p_or_unp(b, l, r_packed, r) + ?: -((int) bkey_deleted(l)); +} + +static inline struct bkey_packed * +__bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, + struct btree *b) +{ + return __btree_node_offset_to_key(b, iter->data->k); +} + +static inline struct bkey_packed * +bch2_btree_node_iter_peek_all(struct btree_node_iter *iter, struct btree *b) +{ + return !bch2_btree_node_iter_end(iter) + ? 
__btree_node_offset_to_key(b, iter->data->k) + : NULL; +} + +static inline struct bkey_packed * +bch2_btree_node_iter_peek(struct btree_node_iter *iter, struct btree *b) +{ + struct bkey_packed *k; + + while ((k = bch2_btree_node_iter_peek_all(iter, b)) && + bkey_deleted(k)) + bch2_btree_node_iter_advance(iter, b); + + return k; +} + +static inline struct bkey_packed * +bch2_btree_node_iter_next_all(struct btree_node_iter *iter, struct btree *b) +{ + struct bkey_packed *ret = bch2_btree_node_iter_peek_all(iter, b); + + if (ret) + bch2_btree_node_iter_advance(iter, b); + + return ret; +} + +struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *, + struct btree *); +struct bkey_packed *bch2_btree_node_iter_prev(struct btree_node_iter *, + struct btree *); + +struct bkey_s_c bch2_btree_node_iter_peek_unpack(struct btree_node_iter *, + struct btree *, + struct bkey *); + +#define for_each_btree_node_key(b, k, iter) \ + for (bch2_btree_node_iter_init_from_start((iter), (b)); \ + (k = bch2_btree_node_iter_peek((iter), (b))); \ + bch2_btree_node_iter_advance(iter, b)) + +#define for_each_btree_node_key_unpack(b, k, iter, unpacked) \ + for (bch2_btree_node_iter_init_from_start((iter), (b)); \ + (k = bch2_btree_node_iter_peek_unpack((iter), (b), (unpacked))).k;\ + bch2_btree_node_iter_advance(iter, b)) + +/* Accounting: */ + +static inline void btree_keys_account_key(struct btree_nr_keys *n, + unsigned bset, + struct bkey_packed *k, + int sign) +{ + n->live_u64s += k->u64s * sign; + n->bset_u64s[bset] += k->u64s * sign; + + if (bkey_packed(k)) + n->packed_keys += sign; + else + n->unpacked_keys += sign; +} + +static inline void btree_keys_account_val_delta(struct btree *b, + struct bkey_packed *k, + int delta) +{ + struct bset_tree *t = bch2_bkey_to_bset(b, k); + + b->nr.live_u64s += delta; + b->nr.bset_u64s[t - b->set] += delta; +} + +#define btree_keys_account_key_add(_nr, _bset_idx, _k) \ + btree_keys_account_key(_nr, _bset_idx, _k, 1) +#define btree_keys_account_key_drop(_nr, _bset_idx, _k) \ + btree_keys_account_key(_nr, _bset_idx, _k, -1) + +#define btree_account_key_add(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, 1) +#define btree_account_key_drop(_b, _k) \ + btree_keys_account_key(&(_b)->nr, \ + bch2_bkey_to_bset(_b, _k) - (_b)->set, _k, -1) + +struct bset_stats { + struct { + size_t nr, bytes; + } sets[BSET_TREE_NR_TYPES]; + + size_t floats; + size_t failed; +}; + +void bch2_btree_keys_stats(const struct btree *, struct bset_stats *); +void bch2_bfloat_to_text(struct printbuf *, struct btree *, + struct bkey_packed *); + +/* Debug stuff */ + +void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); +void bch2_dump_btree_node(struct bch_fs *, struct btree *); +void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); + +#ifdef CONFIG_BCACHEFS_DEBUG + +void __bch2_verify_btree_nr_keys(struct btree *); +void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); +void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, + struct bkey_packed *, unsigned); + +#else + +static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} +static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) {} +static inline void bch2_verify_insert_pos(struct btree *b, + struct bkey_packed *where, + struct bkey_packed *insert, + unsigned clobber_u64s) {} +#endif + +static inline void bch2_verify_btree_nr_keys(struct btree *b) +{ + if 
(bch2_debug_check_btree_accounting) + __bch2_verify_btree_nr_keys(b); +} + +#endif /* _BCACHEFS_BSET_H */ diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c new file mode 100644 index 000000000..13c88d953 --- /dev/null +++ b/fs/bcachefs/btree_cache.c @@ -0,0 +1,1277 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "debug.h" +#include "errcode.h" +#include "error.h" +#include "trace.h" + +#include +#include +#include + +#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ +do { \ + if (shrinker_counter) \ + bc->not_freed_##counter++; \ +} while (0) + +const char * const bch2_btree_node_flags[] = { +#define x(f) #f, + BTREE_FLAGS() +#undef x + NULL +}; + +void bch2_recalc_btree_reserve(struct bch_fs *c) +{ + unsigned i, reserve = 16; + + if (!c->btree_roots_known[0].b) + reserve += 8; + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->b) + reserve += min_t(unsigned, 1, r->b->c.level) * 8; + } + + c->btree_cache.reserve = reserve; +} + +static inline unsigned btree_cache_can_free(struct btree_cache *bc) +{ + return max_t(int, 0, bc->used - bc->reserve); +} + +static void btree_node_to_freedlist(struct btree_cache *bc, struct btree *b) +{ + if (b->c.lock.readers) + list_move(&b->list, &bc->freed_pcpu); + else + list_move(&b->list, &bc->freed_nonpcpu); +} + +static void btree_node_data_free(struct bch_fs *c, struct btree *b) +{ + struct btree_cache *bc = &c->btree_cache; + + EBUG_ON(btree_node_write_in_flight(b)); + + clear_btree_node_just_written(b); + + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; +#ifdef __KERNEL__ + kvfree(b->aux_data); +#else + munmap(b->aux_data, btree_aux_data_bytes(b)); +#endif + b->aux_data = NULL; + + bc->used--; + + btree_node_to_freedlist(bc, b); +} + +static int bch2_btree_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct btree *b = obj; + const u64 *v = arg->key; + + return b->hash_val == *v ? 
0 : 1; +} + +static const struct rhashtable_params bch_btree_cache_params = { + .head_offset = offsetof(struct btree, hash), + .key_offset = offsetof(struct btree, hash_val), + .key_len = sizeof(u64), + .obj_cmpfn = bch2_btree_cache_cmp_fn, +}; + +static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) +{ + BUG_ON(b->data || b->aux_data); + + b->data = kvpmalloc(btree_bytes(c), gfp); + if (!b->data) + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; +#ifdef __KERNEL__ + b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); +#else + b->aux_data = mmap(NULL, btree_aux_data_bytes(b), + PROT_READ|PROT_WRITE|PROT_EXEC, + MAP_PRIVATE|MAP_ANONYMOUS, 0, 0); + if (b->aux_data == MAP_FAILED) + b->aux_data = NULL; +#endif + if (!b->aux_data) { + kvpfree(b->data, btree_bytes(c)); + b->data = NULL; + return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + } + + return 0; +} + +static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp) +{ + struct btree *b; + + b = kzalloc(sizeof(struct btree), gfp); + if (!b) + return NULL; + + bkey_btree_ptr_init(&b->key); + INIT_LIST_HEAD(&b->list); + INIT_LIST_HEAD(&b->write_blocked); + b->byte_order = ilog2(btree_bytes(c)); + return b; +} + +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + b = __btree_node_mem_alloc(c, GFP_KERNEL); + if (!b) + return NULL; + + if (btree_node_data_alloc(c, b, GFP_KERNEL)) { + kfree(b); + return NULL; + } + + bch2_btree_lock_init(&b->c, 0); + + bc->used++; + list_add(&b->list, &bc->freeable); + return b; +} + +/* Btree in memory cache - hash table */ + +void bch2_btree_node_hash_remove(struct btree_cache *bc, struct btree *b) +{ + int ret = rhashtable_remove_fast(&bc->table, &b->hash, bch_btree_cache_params); + + BUG_ON(ret); + + /* Cause future lookups for this node to fail: */ + b->hash_val = 0; +} + +int __bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b) +{ + BUG_ON(b->hash_val); + b->hash_val = btree_ptr_hash_val(&b->key); + + return rhashtable_lookup_insert_fast(&bc->table, &b->hash, + bch_btree_cache_params); +} + +int bch2_btree_node_hash_insert(struct btree_cache *bc, struct btree *b, + unsigned level, enum btree_id id) +{ + int ret; + + b->c.level = level; + b->c.btree_id = id; + + mutex_lock(&bc->lock); + ret = __bch2_btree_node_hash_insert(bc, b); + if (!ret) + list_add_tail(&b->list, &bc->live); + mutex_unlock(&bc->lock); + + return ret; +} + +__flatten +static inline struct btree *btree_cache_find(struct btree_cache *bc, + const struct bkey_i *k) +{ + u64 v = btree_ptr_hash_val(k); + + return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); +} + +/* + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) +{ + struct btree_cache *bc = &c->btree_cache; + int ret = 0; + + lockdep_assert_held(&bc->lock); +wait_on_io: + if (b->flags & ((1U << BTREE_NODE_dirty)| + (1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) { + if (btree_node_dirty(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + else if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + + /* XXX: waiting on IO with btree cache lock held */ + 
bch2_btree_node_wait_on_read(b); + bch2_btree_node_wait_on_write(b); + } + + if (!six_trylock_intent(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + + if (!six_trylock_write(&b->c.lock)) { + BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); + goto out_unlock_intent; + } + + /* recheck under lock */ + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| + (1U << BTREE_NODE_write_in_flight))) { + if (!flush) { + if (btree_node_read_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + else if (btree_node_write_in_flight(b)) + BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + goto out_unlock; + } + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + + if (btree_node_noevict(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(noevict); + goto out_unlock; + } + if (btree_node_write_blocked(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); + goto out_unlock; + } + if (btree_node_will_make_reachable(b)) { + BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); + goto out_unlock; + } + + if (btree_node_dirty(b)) { + if (!flush) { + BTREE_CACHE_NOT_FREED_INCREMENT(dirty); + goto out_unlock; + } + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ + if (bch2_verify_btree_ondisk) + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); + else + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } +out: + if (b->hash_val && !ret) + trace_and_count(c, btree_cache_reap, c, b); + return ret; +out_unlock: + six_unlock_write(&b->c.lock); +out_unlock_intent: + six_unlock_intent(&b->c.lock); + ret = -BCH_ERR_ENOMEM_btree_node_reclaim; + goto out; +} + +static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) +{ + return __btree_node_reclaim(c, b, false, shrinker_counter); +} + +static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) +{ + return __btree_node_reclaim(c, b, true, false); +} + +static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache.shrink); + struct btree_cache *bc = &c->btree_cache; + struct btree *b, *t; + unsigned long nr = sc->nr_to_scan; + unsigned long can_free = 0; + unsigned long freed = 0; + unsigned long touched = 0; + unsigned i, flags; + unsigned long ret = SHRINK_STOP; + bool trigger_writes = atomic_read(&bc->dirty) + nr >= + bc->used * 3 / 4; + + if (bch2_btree_shrinker_disabled) + return SHRINK_STOP; + + mutex_lock(&bc->lock); + flags = memalloc_nofs_save(); + + /* + * It's _really_ critical that we don't free too many btree nodes - we + * have to always leave ourselves a reserve. 
The reserve is how we + * guarantee that allocating memory for a new btree node can always + * succeed, so that inserting keys into the btree can always succeed and + * IO can always make forward progress: + */ + can_free = btree_cache_can_free(bc); + nr = min_t(unsigned long, nr, can_free); + + i = 0; + list_for_each_entry_safe(b, t, &bc->freeable, list) { + /* + * Leave a few nodes on the freeable list, so that a btree split + * won't have to hit the system allocator: + */ + if (++i <= 3) + continue; + + touched++; + + if (touched >= nr) + goto out; + + if (!btree_node_reclaim(c, b, true)) { + btree_node_data_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + freed++; + bc->freed++; + } + } +restart: + list_for_each_entry_safe(b, t, &bc->live, list) { + touched++; + + if (btree_node_accessed(b)) { + clear_btree_node_accessed(b); + bc->not_freed_access_bit++; + } else if (!btree_node_reclaim(c, b, true)) { + freed++; + btree_node_data_free(c, b); + bc->freed++; + + bch2_btree_node_hash_remove(bc, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + + if (freed == nr) + goto out_rotate; + } else if (trigger_writes && + btree_node_dirty(b) && + !btree_node_will_make_reachable(b) && + !btree_node_write_blocked(b) && + six_trylock_read(&b->c.lock)) { + list_move(&bc->live, &b->list); + mutex_unlock(&bc->lock); + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); + six_unlock_read(&b->c.lock); + if (touched >= nr) + goto out_nounlock; + mutex_lock(&bc->lock); + goto restart; + } + + if (touched >= nr) + break; + } +out_rotate: + if (&t->list != &bc->live) + list_move_tail(&bc->live, &t->list); +out: + mutex_unlock(&bc->lock); +out_nounlock: + ret = freed; + memalloc_nofs_restore(flags); + trace_and_count(c, btree_cache_scan, sc->nr_to_scan, can_free, ret); + return ret; +} + +static unsigned long bch2_btree_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache.shrink); + struct btree_cache *bc = &c->btree_cache; + + if (bch2_btree_shrinker_disabled) + return 0; + + return btree_cache_can_free(bc); +} + +static void bch2_btree_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_cache.shrink); + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + bch2_btree_cache_to_text(&out, &c->btree_cache); + seq_buf_commit(s, out.pos); +} + +void bch2_fs_btree_cache_exit(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + unsigned i, flags; + + unregister_shrinker(&bc->shrink); + + /* vfree() can allocate memory: */ + flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + + if (c->verify_data) + list_move(&c->verify_data->list, &bc->live); + + kvpfree(c->verify_ondisk, btree_bytes(c)); + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (r->b) + list_add(&r->b->list, &bc->live); + } + + list_splice(&bc->freeable, &bc->live); + + while (!list_empty(&bc->live)) { + b = list_first_entry(&bc->live, struct btree, list); + + BUG_ON(btree_node_read_in_flight(b) || + btree_node_write_in_flight(b)); + + if (btree_node_dirty(b)) + bch2_btree_complete_write(c, b, btree_current_write(b)); + clear_btree_node_dirty_acct(c, b); + + btree_node_data_free(c, b); + } + + BUG_ON(atomic_read(&c->btree_cache.dirty)); + + list_splice(&bc->freed_pcpu, 
&bc->freed_nonpcpu); + + while (!list_empty(&bc->freed_nonpcpu)) { + b = list_first_entry(&bc->freed_nonpcpu, struct btree, list); + list_del(&b->list); + six_lock_exit(&b->c.lock); + kfree(b); + } + + mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); + + if (bc->table_init_done) + rhashtable_destroy(&bc->table); +} + +int bch2_fs_btree_cache_init(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + unsigned i; + int ret = 0; + + ret = rhashtable_init(&bc->table, &bch_btree_cache_params); + if (ret) + goto err; + + bc->table_init_done = true; + + bch2_recalc_btree_reserve(c); + + for (i = 0; i < bc->reserve; i++) + if (!__bch2_btree_node_mem_alloc(c)) + goto err; + + list_splice_init(&bc->live, &bc->freeable); + + mutex_init(&c->verify_lock); + + bc->shrink.count_objects = bch2_btree_cache_count; + bc->shrink.scan_objects = bch2_btree_cache_scan; + bc->shrink.to_text = bch2_btree_cache_shrinker_to_text; + bc->shrink.seeks = 4; + ret = register_shrinker(&bc->shrink, "%s/btree_cache", c->name); + if (ret) + goto err; + + return 0; +err: + return -BCH_ERR_ENOMEM_fs_btree_cache_init; +} + +void bch2_fs_btree_cache_init_early(struct btree_cache *bc) +{ + mutex_init(&bc->lock); + INIT_LIST_HEAD(&bc->live); + INIT_LIST_HEAD(&bc->freeable); + INIT_LIST_HEAD(&bc->freed_pcpu); + INIT_LIST_HEAD(&bc->freed_nonpcpu); +} + +/* + * We can only have one thread cannibalizing other cached btree nodes at a time, + * or we'll deadlock. We use an open coded mutex to ensure that, which a + * cannibalize_bucket() will take. This means every time we unlock the root of + * the btree, we need to release this lock if we have it held. + */ +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + + if (bc->alloc_lock == current) { + trace_and_count(c, btree_cache_cannibalize_unlock, c); + bc->alloc_lock = NULL; + closure_wake_up(&bc->alloc_wait); + } +} + +int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl) +{ + struct btree_cache *bc = &c->btree_cache; + struct task_struct *old; + + old = cmpxchg(&bc->alloc_lock, NULL, current); + if (old == NULL || old == current) + goto success; + + if (!cl) { + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; + } + + closure_wait(&bc->alloc_wait, cl); + + /* Try again, after adding ourselves to waitlist */ + old = cmpxchg(&bc->alloc_lock, NULL, current); + if (old == NULL || old == current) { + /* We raced */ + closure_wake_up(&bc->alloc_wait); + goto success; + } + + trace_and_count(c, btree_cache_cannibalize_lock_fail, c); + return -BCH_ERR_btree_cache_cannibalize_lock_blocked; + +success: + trace_and_count(c, btree_cache_cannibalize_lock, c); + return 0; +} + +static struct btree *btree_node_cannibalize(struct bch_fs *c) +{ + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + list_for_each_entry_reverse(b, &bc->live, list) + if (!btree_node_reclaim(c, b, false)) + return b; + + while (1) { + list_for_each_entry_reverse(b, &bc->live, list) + if (!btree_node_write_and_reclaim(c, b)) + return b; + + /* + * Rare case: all nodes were intent-locked. + * Just busy-wait. + */ + WARN_ONCE(1, "btree cache cannibalize failed\n"); + cond_resched(); + } +} + +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_read_locks) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct list_head *freed = pcpu_read_locks + ? 
&bc->freed_pcpu + : &bc->freed_nonpcpu; + struct btree *b, *b2; + u64 start_time = local_clock(); + unsigned flags; + + flags = memalloc_nofs_save(); + mutex_lock(&bc->lock); + + /* + * We never free struct btree itself, just the memory that holds the on + * disk node. Check the freed list before allocating a new one: + */ + list_for_each_entry(b, freed, list) + if (!btree_node_reclaim(c, b, false)) { + list_del_init(&b->list); + goto got_node; + } + + b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); + if (!b) { + mutex_unlock(&bc->lock); + bch2_trans_unlock(trans); + b = __btree_node_mem_alloc(c, GFP_KERNEL); + if (!b) + goto err; + mutex_lock(&bc->lock); + } + + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); + + BUG_ON(!six_trylock_intent(&b->c.lock)); + BUG_ON(!six_trylock_write(&b->c.lock)); +got_node: + + /* + * btree_free() doesn't free memory; it sticks the node on the end of + * the list. Check if there's any freed nodes there: + */ + list_for_each_entry(b2, &bc->freeable, list) + if (!btree_node_reclaim(c, b2, false)) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + goto got_mem; + } + + mutex_unlock(&bc->lock); + + if (btree_node_data_alloc(c, b, GFP_NOWAIT|__GFP_NOWARN)) { + bch2_trans_unlock(trans); + if (btree_node_data_alloc(c, b, GFP_KERNEL|__GFP_NOWARN)) + goto err; + } + + mutex_lock(&bc->lock); + bc->used++; +got_mem: + mutex_unlock(&bc->lock); + + BUG_ON(btree_node_hashed(b)); + BUG_ON(btree_node_dirty(b)); + BUG_ON(btree_node_write_in_flight(b)); +out: + b->flags = 0; + b->written = 0; + b->nsets = 0; + b->sib_u64s[0] = 0; + b->sib_u64s[1] = 0; + b->whiteout_u64s = 0; + bch2_btree_keys_init(b); + set_btree_node_accessed(b); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], + start_time); + + memalloc_nofs_restore(flags); + return b; +err: + mutex_lock(&bc->lock); + + /* Try to cannibalize another cached btree node: */ + if (bc->alloc_lock == current) { + b2 = btree_node_cannibalize(c); + clear_btree_node_just_written(b2); + bch2_btree_node_hash_remove(bc, b2); + + if (b) { + swap(b->data, b2->data); + swap(b->aux_data, b2->aux_data); + btree_node_to_freedlist(bc, b2); + six_unlock_write(&b2->c.lock); + six_unlock_intent(&b2->c.lock); + } else { + b = b2; + list_del_init(&b->list); + } + + mutex_unlock(&bc->lock); + + trace_and_count(c, btree_cache_cannibalize, c); + goto out; + } + + mutex_unlock(&bc->lock); + memalloc_nofs_restore(flags); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_node_mem_alloc); +} + +/* Slowpath, don't want it inlined into btree_iter_traverse() */ +static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, + struct btree_path *path, + const struct bkey_i *k, + enum btree_id btree_id, + unsigned level, + enum six_lock_type lock_type, + bool sync) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + u32 seq; + + BUG_ON(level + 1 >= BTREE_MAX_DEPTH); + /* + * Parent node must be locked, else we could read in a btree node that's + * been freed: + */ + if (path && !bch2_btree_node_relock(trans, path, level + 1)) { + trace_and_count(c, trans_restart_relock_parent_for_fill, trans, _THIS_IP_, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_relock)); + } + + b = bch2_btree_node_mem_alloc(trans, level != 0); + + if (bch2_err_matches(PTR_ERR_OR_ZERO(b), ENOMEM)) { + trans->memory_allocation_failure 
= true; + trace_and_count(c, trans_restart_memory_allocation_failure, trans, _THIS_IP_, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_fill_mem_alloc_fail)); + } + + if (IS_ERR(b)) + return b; + + /* + * Btree nodes read in from disk should not have the accessed bit set + * initially, so that linear scans don't thrash the cache: + */ + clear_btree_node_accessed(b); + + bkey_copy(&b->key, k); + if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) { + /* raced with another fill: */ + + /* mark as unhashed... */ + b->hash_val = 0; + + mutex_lock(&bc->lock); + list_add(&b->list, &bc->freeable); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + return NULL; + } + + set_btree_node_read_in_flight(b); + + six_unlock_write(&b->c.lock); + seq = six_lock_seq(&b->c.lock); + six_unlock_intent(&b->c.lock); + + /* Unlock before doing IO: */ + if (trans && sync) + bch2_trans_unlock_noassert(trans); + + bch2_btree_node_read(c, b, sync); + + if (!sync) + return NULL; + + if (path) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + BUG_ON(!trans->restarted); + return ERR_PTR(ret); + } + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) { + if (path) + trace_and_count(c, trans_restart_relock_after_fill, trans, _THIS_IP_, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_after_fill)); + } + + return b; +} + +static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) +{ + struct printbuf buf = PRINTBUF; + + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + return; + + prt_printf(&buf, + "btree node header doesn't match ptr\n" + "btree %s level %u\n" + "ptr: ", + bch2_btree_ids[b->c.btree_id], b->c.level); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + prt_printf(&buf, "\nheader: btree %s level %llu\n" + "min ", + bch2_btree_ids[BTREE_NODE_ID(b->data)], + BTREE_NODE_LEVEL(b->data)); + bch2_bpos_to_text(&buf, b->data->min_key); + + prt_printf(&buf, "\nmax "); + bch2_bpos_to_text(&buf, b->data->max_key); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); +} + +static inline void btree_check_header(struct bch_fs *c, struct btree *b) +{ + if (b->c.btree_id != BTREE_NODE_ID(b->data) || + b->c.level != BTREE_NODE_LEVEL(b->data) || + !bpos_eq(b->data->max_key, b->key.k.p) || + (b->key.k.type == KEY_TYPE_btree_ptr_v2 && + !bpos_eq(b->data->min_key, + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key))) + btree_bad_header(c, b); +} + +static struct btree *__bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; + bool need_relock = false; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { + /* + * We must have the parent locked to call bch2_btree_node_fill(), + * else we could read in a btree node from disk that's been + * freed: + */ + b = bch2_btree_node_fill(trans, path, k, path->btree_id, + level, lock_type, true); + need_relock = true; + + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; + + if (IS_ERR(b)) + return b; + } else { + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(trans, path, level + 1); + + ret = btree_node_lock(trans, path, &b->c, 
level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.level != level || + race_fault())) { + six_unlock_type(&b->c.lock, lock_type); + if (bch2_btree_node_relock(trans, path, level + 1)) + goto retry; + + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + } + + if (unlikely(btree_node_read_in_flight(b))) { + u32 seq = six_lock_seq(&b->c.lock); + + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(trans); + need_relock = true; + + bch2_btree_node_wait_on_read(b); + + /* + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: + */ + if (!six_relock_type(&b->c.lock, lock_type, seq)) + goto retry; + } + + if (unlikely(need_relock)) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(ret); + } + } + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->c.btree_id != path->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); + + return b; +} + +/** + * bch_btree_node_get - find a btree node in the cache and lock it, reading it + * in from disk if necessary. + * + * The btree node will have either a read or a write lock held, depending on + * the @write parameter. 
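+ * (Here the lock type is selected by the @lock_type argument.)
+ *
+ * A schematic caller, as an editor's sketch rather than part of this patch
+ * (error handling trimmed to the transaction-restart case):
+ */
+
+/* Illustrative sketch only -- not from the original patch: */
+static int example_read_lock_node(struct btree_trans *trans,
+				  struct btree_path *path,
+				  const struct bkey_i *k, unsigned level)
+{
+	struct btree *b = bch2_btree_node_get(trans, path, k, level,
+					      SIX_LOCK_read, _THIS_IP_);
+
+	if (IS_ERR(b))
+		return PTR_ERR(b);	/* may be a transaction restart */
+
+	/* ... use b ... */
+
+	six_unlock_read(&b->c.lock);
+	return 0;
+}
+
+/*
+ * bch2_btree_node_get() itself: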
+ */ +struct btree *bch2_btree_node_get(struct btree_trans *trans, struct btree_path *path, + const struct bkey_i *k, unsigned level, + enum six_lock_type lock_type, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bset_tree *t; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_node_mem_ptr(k); + + /* + * Check b->hash_val _before_ calling btree_node_lock() - this might not + * be the node we want anymore, and trying to lock the wrong node could + * cause an unneccessary transaction restart: + */ + if (unlikely(!c->opts.btree_node_mem_ptr_optimization || + !b || + b->hash_val != btree_ptr_hash_val(k))) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(trans, path, level + 1); + + ret = btree_node_lock(trans, path, &b->c, level, lock_type, trace_ip); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.level != level || + race_fault())) { + six_unlock_type(&b->c.lock, lock_type); + if (bch2_btree_node_relock(trans, path, level + 1)) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + + trace_and_count(c, trans_restart_btree_node_reused, trans, trace_ip, path); + return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_lock_node_reused)); + } + + if (unlikely(btree_node_read_in_flight(b))) { + u32 seq = six_lock_seq(&b->c.lock); + + six_unlock_type(&b->c.lock, lock_type); + bch2_trans_unlock(trans); + + bch2_btree_node_wait_on_read(b); + + /* + * should_be_locked is not set on this path yet, so we need to + * relock it specifically: + */ + if (trans) { + int ret = bch2_trans_relock(trans) ?: + bch2_btree_path_relock_intent(trans, path); + if (ret) { + BUG_ON(!trans->restarted); + return ERR_PTR(ret); + } + } + + if (!six_relock_type(&b->c.lock, lock_type, seq)) + return __bch2_btree_node_get(trans, path, k, level, lock_type, trace_ip); + } + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_type(&b->c.lock, lock_type); + return ERR_PTR(-EIO); + } + + EBUG_ON(b->c.btree_id != path->btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); + + return b; +} + +struct btree *bch2_btree_node_get_noiter(struct btree_trans *trans, + const struct bkey_i *k, + enum btree_id btree_id, + unsigned level, + bool nofill) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + struct bset_tree *t; + int ret; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + if (c->opts.btree_node_mem_ptr_optimization) { + b = btree_node_mem_ptr(k); + if (b) + goto lock_node; + } +retry: + b = btree_cache_find(bc, k); + if (unlikely(!b)) { + if (nofill) + goto out; + + b = bch2_btree_node_fill(trans, NULL, k, btree_id, + level, SIX_LOCK_read, true); + + /* We raced and found the btree node in the cache */ + if (!b) + goto retry; + + if (IS_ERR(b) && + !bch2_btree_cache_cannibalize_lock(c, NULL)) + goto retry; + + if (IS_ERR(b)) + goto out; + } else { +lock_node: + ret = btree_node_lock_nopath(trans, &b->c, SIX_LOCK_read, _THIS_IP_); + if 
(bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ERR_PTR(ret); + + BUG_ON(ret); + + if (unlikely(b->hash_val != btree_ptr_hash_val(k) || + b->c.btree_id != btree_id || + b->c.level != level)) { + six_unlock_read(&b->c.lock); + goto retry; + } + } + + /* XXX: waiting on IO with btree locks held: */ + __bch2_btree_node_wait_on_read(b); + + prefetch(b->aux_data); + + for_each_bset(b, t) { + void *p = (u64 *) b->aux_data + t->aux_data_offset; + + prefetch(p + L1_CACHE_BYTES * 0); + prefetch(p + L1_CACHE_BYTES * 1); + prefetch(p + L1_CACHE_BYTES * 2); + } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); + + if (unlikely(btree_node_read_error(b))) { + six_unlock_read(&b->c.lock); + b = ERR_PTR(-EIO); + goto out; + } + + EBUG_ON(b->c.btree_id != btree_id); + EBUG_ON(BTREE_NODE_LEVEL(b->data) != level); + btree_check_header(c, b); +out: + bch2_btree_cache_cannibalize_unlock(c); + return b; +} + +int bch2_btree_node_prefetch(struct btree_trans *trans, + struct btree_path *path, + const struct bkey_i *k, + enum btree_id btree_id, unsigned level) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + BUG_ON(trans && !btree_node_locked(path, level + 1)); + BUG_ON(level >= BTREE_MAX_DEPTH); + + b = btree_cache_find(bc, k); + if (b) + return 0; + + b = bch2_btree_node_fill(trans, path, k, btree_id, + level, SIX_LOCK_read, false); + return PTR_ERR_OR_ZERO(b); +} + +void bch2_btree_node_evict(struct btree_trans *trans, const struct bkey_i *k) +{ + struct bch_fs *c = trans->c; + struct btree_cache *bc = &c->btree_cache; + struct btree *b; + + b = btree_cache_find(bc, k); + if (!b) + return; +wait_on_io: + /* not allowed to wait on io with btree locks held: */ + + /* XXX we're called from btree_gc which will be holding other btree + * nodes locked + */ + __bch2_btree_node_wait_on_read(b); + __bch2_btree_node_wait_on_write(b); + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + + if (btree_node_dirty(b)) { + __bch2_btree_node_write(c, b, BTREE_WRITE_cache_reclaim); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + goto wait_on_io; + } + + BUG_ON(btree_node_dirty(b)); + + mutex_lock(&bc->lock); + btree_node_data_free(c, b); + bch2_btree_node_hash_remove(bc, b); + mutex_unlock(&bc->lock); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); +} + +void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + const struct btree *b) +{ + const struct bkey_format *f = &b->format; + struct bset_stats stats; + + memset(&stats, 0, sizeof(stats)); + + bch2_btree_keys_stats(b, &stats); + + prt_printf(out, "l %u ", b->c.level); + bch2_bpos_to_text(out, b->data->min_key); + prt_printf(out, " - "); + bch2_bpos_to_text(out, b->data->max_key); + prt_printf(out, ":\n" + " ptrs: "); + bch2_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + + prt_printf(out, "\n" + " format: u64s %u fields %u %u %u %u %u\n" + " unpack fn len: %u\n" + " bytes used %zu/%zu (%zu%% full)\n" + " sib u64s: %u, %u (merge threshold %u)\n" + " nr packed keys %u\n" + " nr unpacked keys %u\n" + " floats %zu\n" + " failed unpacked %zu\n", + f->key_u64s, + f->bits_per_field[0], + f->bits_per_field[1], + f->bits_per_field[2], + f->bits_per_field[3], + f->bits_per_field[4], + b->unpack_fn_len, + b->nr.live_u64s * sizeof(u64), + btree_bytes(c) - sizeof(struct btree_node), + b->nr.live_u64s * 100 / btree_max_u64s(c), + 
b->sib_u64s[0], + b->sib_u64s[1], + c->btree_foreground_merge_threshold, + b->nr.packed_keys, + b->nr.unpacked_keys, + stats.floats, + stats.failed); +} + +void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc) +{ + prt_printf(out, "nr nodes:\t\t%u\n", bc->used); + prt_printf(out, "nr dirty:\t\t%u\n", atomic_read(&bc->dirty)); + prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); + + prt_printf(out, "freed:\t\t\t\t%u\n", bc->freed); + prt_printf(out, "not freed, dirty:\t\t%u\n", bc->not_freed_dirty); + prt_printf(out, "not freed, write in flight:\t%u\n", bc->not_freed_write_in_flight); + prt_printf(out, "not freed, read in flight:\t%u\n", bc->not_freed_read_in_flight); + prt_printf(out, "not freed, lock intent failed:\t%u\n", bc->not_freed_lock_intent); + prt_printf(out, "not freed, lock write failed:\t%u\n", bc->not_freed_lock_write); + prt_printf(out, "not freed, access bit:\t\t%u\n", bc->not_freed_access_bit); + prt_printf(out, "not freed, no evict failed:\t%u\n", bc->not_freed_noevict); + prt_printf(out, "not freed, write blocked:\t%u\n", bc->not_freed_write_blocked); + prt_printf(out, "not freed, will make reachable:\t%u\n", bc->not_freed_will_make_reachable); + +} diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h new file mode 100644 index 000000000..00c9b9218 --- /dev/null +++ b/fs/bcachefs/btree_cache.h @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_CACHE_H +#define _BCACHEFS_BTREE_CACHE_H + +#include "bcachefs.h" +#include "btree_types.h" +#include "bkey_methods.h" + +extern const char * const bch2_btree_node_flags[]; + +struct btree_iter; + +void bch2_recalc_btree_reserve(struct bch_fs *); + +void bch2_btree_node_hash_remove(struct btree_cache *, struct btree *); +int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *); +int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *, + unsigned, enum btree_id); + +void bch2_btree_cache_cannibalize_unlock(struct bch_fs *); +int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *); + +struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *); +struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool); + +struct btree *bch2_btree_node_get(struct btree_trans *, struct btree_path *, + const struct bkey_i *, unsigned, + enum six_lock_type, unsigned long); + +struct btree *bch2_btree_node_get_noiter(struct btree_trans *, const struct bkey_i *, + enum btree_id, unsigned, bool); + +int bch2_btree_node_prefetch(struct btree_trans *, struct btree_path *, + const struct bkey_i *, enum btree_id, unsigned); + +void bch2_btree_node_evict(struct btree_trans *, const struct bkey_i *); + +void bch2_fs_btree_cache_exit(struct bch_fs *); +int bch2_fs_btree_cache_init(struct bch_fs *); +void bch2_fs_btree_cache_init_early(struct btree_cache *); + +static inline u64 btree_ptr_hash_val(const struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + return *((u64 *) bkey_i_to_btree_ptr_c(k)->v.start); + case KEY_TYPE_btree_ptr_v2: + /* + * The cast/deref is only necessary to avoid sparse endianness + * warnings: + */ + return *((u64 *) &bkey_i_to_btree_ptr_v2_c(k)->v.seq); + default: + return 0; + } +} + +static inline struct btree *btree_node_mem_ptr(const struct bkey_i *k) +{ + return k->k.type == KEY_TYPE_btree_ptr_v2 + ? (void *)(unsigned long)bkey_i_to_btree_ptr_v2_c(k)->v.mem_ptr + : NULL; +} + +/* is btree node in hash table? 
*/ +static inline bool btree_node_hashed(struct btree *b) +{ + return b->hash_val != 0; +} + +#define for_each_cached_btree(_b, _c, _tbl, _iter, _pos) \ + for ((_tbl) = rht_dereference_rcu((_c)->btree_cache.table.tbl, \ + &(_c)->btree_cache.table), \ + _iter = 0; _iter < (_tbl)->size; _iter++) \ + rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash) + +static inline size_t btree_bytes(struct bch_fs *c) +{ + return c->opts.btree_node_size; +} + +static inline size_t btree_max_u64s(struct bch_fs *c) +{ + return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64); +} + +static inline size_t btree_pages(struct bch_fs *c) +{ + return btree_bytes(c) / PAGE_SIZE; +} + +static inline unsigned btree_blocks(struct bch_fs *c) +{ + return btree_sectors(c) >> c->block_bits; +} + +#define BTREE_SPLIT_THRESHOLD(c) (btree_max_u64s(c) * 2 / 3) + +#define BTREE_FOREGROUND_MERGE_THRESHOLD(c) (btree_max_u64s(c) * 1 / 3) +#define BTREE_FOREGROUND_MERGE_HYSTERESIS(c) \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) + \ + (BTREE_FOREGROUND_MERGE_THRESHOLD(c) >> 2)) + +static inline unsigned btree_id_nr_alive(struct bch_fs *c) +{ + return BTREE_ID_NR + c->btree_roots_extra.nr; +} + +static inline struct btree_root *bch2_btree_id_root(struct bch_fs *c, unsigned id) +{ + if (likely(id < BTREE_ID_NR)) { + return &c->btree_roots_known[id]; + } else { + unsigned idx = id - BTREE_ID_NR; + + EBUG_ON(idx >= c->btree_roots_extra.nr); + return &c->btree_roots_extra.data[idx]; + } +} + +static inline struct btree *btree_node_root(struct bch_fs *c, struct btree *b) +{ + return bch2_btree_id_root(c, b->c.btree_id)->b; +} + +void bch2_btree_node_to_text(struct printbuf *, struct bch_fs *, + const struct btree *); +void bch2_btree_cache_to_text(struct printbuf *, const struct btree_cache *); + +#endif /* _BCACHEFS_BTREE_CACHE_H */ diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c new file mode 100644 index 000000000..be537b237 --- /dev/null +++ b/fs/bcachefs/btree_gc.c @@ -0,0 +1,2144 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet + * Copyright (C) 2014 Datera Inc. 
+ */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "bkey_buf.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "debug.h" +#include "ec.h" +#include "error.h" +#include "extents.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "recovery.h" +#include "reflink.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include +#include + +#define DROP_THIS_NODE 10 +#define DROP_PREV_NODE 11 + +static inline void __gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + preempt_disable(); + write_seqcount_begin(&c->gc_pos_lock); + c->gc_pos = new_pos; + write_seqcount_end(&c->gc_pos_lock); + preempt_enable(); +} + +static inline void gc_pos_set(struct bch_fs *c, struct gc_pos new_pos) +{ + BUG_ON(gc_pos_cmp(new_pos, c->gc_pos) <= 0); + __gc_pos_set(c, new_pos); +} + +/* + * Missing: if an interior btree node is empty, we need to do something - + * perhaps just kill it + */ +static int bch2_gc_check_topology(struct bch_fs *c, + struct btree *b, + struct bkey_buf *prev, + struct bkey_buf cur, + bool is_last) +{ + struct bpos node_start = b->data->min_key; + struct bpos node_end = b->data->max_key; + struct bpos expected_start = bkey_deleted(&prev->k->k) + ? node_start + : bpos_successor(prev->k->k.p); + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (cur.k->k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(cur.k); + + if (!bpos_eq(expected_start, bp->v.min_key)) { + bch2_topology_error(c); + + if (bkey_deleted(&prev->k->k)) { + prt_printf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, node_start); + } else { + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(prev->k)); + } + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(cur.k)); + + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " cur %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + ret = -BCH_ERR_need_topology_repair; + goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } + } + } + + if (is_last && !bpos_eq(cur.k->k.p, node_end)) { + bch2_topology_error(c); + + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(cur.k)); + bch2_bpos_to_text(&buf2, node_end); + + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1.buf, buf2.buf) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + bch_info(c, "Halting mark and sweep to start topology repair pass"); + ret = -BCH_ERR_need_topology_repair; + goto err; + } else { + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + } + } + + bch2_bkey_buf_copy(prev, c, cur.k); +err: +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +static void btree_ptr_to_v2(struct btree *b, struct bkey_i_btree_ptr_v2 *dst) +{ + switch (b->key.k.type) { + case KEY_TYPE_btree_ptr: { + struct bkey_i_btree_ptr *src = 
bkey_i_to_btree_ptr(&b->key); + + dst->k.p = src->k.p; + dst->v.mem_ptr = 0; + dst->v.seq = b->data->keys.seq; + dst->v.sectors_written = 0; + dst->v.flags = 0; + dst->v.min_key = b->data->min_key; + set_bkey_val_bytes(&dst->k, sizeof(dst->v) + bkey_val_bytes(&src->k)); + memcpy(dst->v.start, src->v.start, bkey_val_bytes(&src->k)); + break; + } + case KEY_TYPE_btree_ptr_v2: + bkey_copy(&dst->k_i, &b->key); + break; + default: + BUG(); + } +} + +static void bch2_btree_node_update_key_early(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_i *new) +{ + struct bch_fs *c = trans->c; + struct btree *b; + struct bkey_buf tmp; + int ret; + + bch2_bkey_buf_init(&tmp); + bch2_bkey_buf_reassemble(&tmp, c, old); + + b = bch2_btree_node_get_noiter(trans, tmp.k, btree, level, true); + if (!IS_ERR_OR_NULL(b)) { + mutex_lock(&c->btree_cache.lock); + + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + + mutex_unlock(&c->btree_cache.lock); + six_unlock_read(&b->c.lock); + } + + bch2_bkey_buf_exit(&tmp, c); +} + +static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) +{ + struct bkey_i_btree_ptr_v2 *new; + int ret; + + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) + return -BCH_ERR_ENOMEM_gc_repair_key; + + btree_ptr_to_v2(b, new); + b->data->min_key = new_min; + new->v.min_key = new_min; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + + ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; + } + + bch2_btree_node_drop_keys_outside_node(b); + bkey_copy(&b->key, &new->k_i); + return 0; +} + +static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) +{ + struct bkey_i_btree_ptr_v2 *new; + int ret; + + ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level + 1, b->key.k.p); + if (ret) + return ret; + + new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); + if (!new) + return -BCH_ERR_ENOMEM_gc_repair_key; + + btree_ptr_to_v2(b, new); + b->data->max_key = new_max; + new->k.p = new_max; + SET_BTREE_PTR_RANGE_UPDATED(&new->v, true); + + ret = bch2_journal_key_insert_take(c, b->c.btree_id, b->c.level + 1, &new->k_i); + if (ret) { + kfree(new); + return ret; + } + + bch2_btree_node_drop_keys_outside_node(b); + + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, &new->k_i); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + return 0; +} + +static int btree_repair_node_boundaries(struct bch_fs *c, struct btree *b, + struct btree *prev, struct btree *cur) +{ + struct bpos expected_start = !prev + ? 
b->data->min_key + : bpos_successor(prev->key.k.p); + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + if (!prev) { + prt_printf(&buf1, "start of node: "); + bch2_bpos_to_text(&buf1, b->data->min_key); + } else { + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&prev->key)); + } + + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&cur->key)); + + if (prev && + bpos_gt(expected_start, cur->data->min_key) && + BTREE_NODE_SEQ(cur->data) > BTREE_NODE_SEQ(prev->data)) { + /* cur overwrites prev: */ + + if (mustfix_fsck_err_on(bpos_ge(prev->data->min_key, + cur->data->min_key), c, + "btree node overwritten by next node at btree %s level %u:\n" + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1.buf, buf2.buf)) { + ret = DROP_PREV_NODE; + goto out; + } + + if (mustfix_fsck_err_on(!bpos_eq(prev->key.k.p, + bpos_predecessor(cur->data->min_key)), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " node %s\n" + " next %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1.buf, buf2.buf)) + ret = set_node_max(c, prev, + bpos_predecessor(cur->data->min_key)); + } else { + /* prev overwrites cur: */ + + if (mustfix_fsck_err_on(bpos_ge(expected_start, + cur->data->max_key), c, + "btree node overwritten by prev node at btree %s level %u:\n" + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1.buf, buf2.buf)) { + ret = DROP_THIS_NODE; + goto out; + } + + if (mustfix_fsck_err_on(!bpos_eq(expected_start, cur->data->min_key), c, + "btree node with incorrect min_key at btree %s level %u:\n" + " prev %s\n" + " node %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1.buf, buf2.buf)) + ret = set_node_min(c, cur, expected_start); + } +out: +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +static int btree_repair_node_end(struct bch_fs *c, struct btree *b, + struct btree *child) +{ + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + int ret = 0; + + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(&child->key)); + bch2_bpos_to_text(&buf2, b->key.k.p); + + if (mustfix_fsck_err_on(!bpos_eq(child->key.k.p, b->key.k.p), c, + "btree node with incorrect max_key at btree %s level %u:\n" + " %s\n" + " expected %s", + bch2_btree_ids[b->c.btree_id], b->c.level, + buf1.buf, buf2.buf)) { + ret = set_node_max(c, child, b->key.k.p); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +static int bch2_btree_repair_topology_recurse(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf prev_k, cur_k; + struct btree *prev = NULL, *cur = NULL; + bool have_child, dropped_children = false; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!b->c.level) + return 0; +again: + prev = NULL; + have_child = dropped_children = false; + bch2_bkey_buf_init(&prev_k); + bch2_bkey_buf_init(&cur_k); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); + + bch2_btree_and_journal_iter_advance(&iter); + bch2_bkey_buf_reassemble(&cur_k, c, k); + + cur = bch2_btree_node_get_noiter(trans, cur_k.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(cur); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); + + if (mustfix_fsck_err_on(ret == -EIO, c, + "Topology 
repair: unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, + buf.buf)) { + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + cur = NULL; + if (ret) + break; + continue; + } + + if (ret) { + bch_err_msg(c, ret, "getting btree node"); + break; + } + + ret = btree_repair_node_boundaries(c, b, prev, cur); + + if (ret == DROP_THIS_NODE) { + six_unlock_read(&cur->c.lock); + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + cur = NULL; + if (ret) + break; + continue; + } + + if (prev) + six_unlock_read(&prev->c.lock); + prev = NULL; + + if (ret == DROP_PREV_NODE) { + bch2_btree_node_evict(trans, prev_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, prev_k.k->k.p); + if (ret) + break; + + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); + goto again; + } else if (ret) + break; + + prev = cur; + cur = NULL; + bch2_bkey_buf_copy(&prev_k, c, cur_k.k); + } + + if (!ret && !IS_ERR_OR_NULL(prev)) { + BUG_ON(cur); + ret = btree_repair_node_end(c, b, prev); + } + + if (!IS_ERR_OR_NULL(prev)) + six_unlock_read(&prev->c.lock); + prev = NULL; + if (!IS_ERR_OR_NULL(cur)) + six_unlock_read(&cur->c.lock); + cur = NULL; + + if (ret) + goto err; + + bch2_btree_and_journal_iter_exit(&iter); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + bch2_bkey_buf_reassemble(&cur_k, c, k); + bch2_btree_and_journal_iter_advance(&iter); + + cur = bch2_btree_node_get_noiter(trans, cur_k.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(cur); + + if (ret) { + bch_err_msg(c, ret, "getting btree node"); + goto err; + } + + ret = bch2_btree_repair_topology_recurse(trans, cur); + six_unlock_read(&cur->c.lock); + cur = NULL; + + if (ret == DROP_THIS_NODE) { + bch2_btree_node_evict(trans, cur_k.k); + ret = bch2_journal_key_delete(c, b->c.btree_id, + b->c.level, cur_k.k->k.p); + dropped_children = true; + } + + if (ret) + goto err; + + have_child = true; + } + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + + if (mustfix_fsck_err_on(!have_child, c, + "empty interior btree node at btree %s level %u\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level, buf.buf)) + ret = DROP_THIS_NODE; +err: +fsck_err: + if (!IS_ERR_OR_NULL(prev)) + six_unlock_read(&prev->c.lock); + if (!IS_ERR_OR_NULL(cur)) + six_unlock_read(&cur->c.lock); + + bch2_btree_and_journal_iter_exit(&iter); + bch2_bkey_buf_exit(&prev_k, c); + bch2_bkey_buf_exit(&cur_k, c); + + if (!ret && dropped_children) + goto again; + + printbuf_exit(&buf); + return ret; +} + +static int bch2_repair_topology(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree *b; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < btree_id_nr_alive(c)&& !ret; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->alive) + continue; + + b = r->b; + if (btree_node_fake(b)) + continue; + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + ret = bch2_btree_repair_topology_recurse(&trans, b); + six_unlock_read(&b->c.lock); + + if (ret == DROP_THIS_NODE) { + bch_err(c, "empty btree root - repair unimplemented"); + ret = -BCH_ERR_fsck_repair_unimplemented; + } + } + + bch2_trans_exit(&trans); + + return ret; +} + +static int 
bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(*k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + bool do_update = false; + struct printbuf buf = PRINTBUF; + int ret = 0; + + /* + * XXX + * use check_bucket_ref here + */ + bkey_for_each_ptr_decode(k->k, ptrs, p, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, &entry->ptr); + + if (!g->gen_valid && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + } else { + do_update = true; + } + } + + if (gen_cmp(p.ptr.gen, g->gen) > 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) { + if (!p.ptr.cached) { + g->gen_valid = true; + g->gen = p.ptr.gen; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; + } + } + + if (gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen, + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + do_update = true; + + if (!p.ptr.cached && gen_cmp(p.ptr.gen, g->gen) < 0 && + (c->opts.reconstruct_alloc || + fsck_err(c, "bucket %u:%zu data type %s stale dirty ptr: %u < %u\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[ptr_data_type(k->k, &p.ptr)], + p.ptr.gen, g->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) + do_update = true; + + if (data_type != BCH_DATA_btree && p.ptr.gen != g->gen) + continue; + + if (fsck_err_on(bucket_data_type(g->data_type) && + bucket_data_type(g->data_type) != data_type, c, + "bucket %u:%zu different types of data in same bucket: %s, %s\n" + "while marking %s", + p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), + bch2_data_types[g->data_type], + bch2_data_types[data_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) { + if (data_type == BCH_DATA_btree) { + g->data_type = data_type; + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + } else { + do_update = true; + } + } + + if (p.has_ec) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, p.ec.idx); + + if (fsck_err_on(!m || !m->alive, c, + "pointer to nonexistent stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) + do_update = true; + + if (fsck_err_on(m && m->alive && !bch2_ptr_matches_stripe_m(m, p), c, + "pointer does not match stripe %llu\n" + "while marking %s", + (u64) p.ec.idx, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, *k), 
buf.buf))) + do_update = true; + } + } + + if (do_update) { + struct bkey_ptrs ptrs; + union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct bkey_i *new; + + if (is_root) { + bch_err(c, "cannot update btree roots yet"); + ret = -EINVAL; + goto err; + } + + new = kmalloc(bkey_bytes(k->k), GFP_KERNEL); + if (!new) { + bch_err_msg(c, ret, "allocating new key"); + ret = -BCH_ERR_ENOMEM_gc_repair_key; + goto err; + } + + bkey_reassemble(new, *k); + + if (level) { + /* + * We don't want to drop btree node pointers - if the + * btree node isn't there anymore, the read path will + * sort it out: + */ + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); + + ptr->gen = g->gen; + } + } else { + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, ({ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket *g = PTR_GC_BUCKET(ca, ptr); + enum bch_data_type data_type = bch2_bkey_ptr_data_type(*k, ptr); + + (ptr->cached && + (!g->gen_valid || gen_cmp(ptr->gen, g->gen) > 0)) || + (!ptr->cached && + gen_cmp(ptr->gen, g->gen) < 0) || + gen_cmp(g->gen, ptr->gen) > BUCKET_GC_GEN_MAX || + (g->data_type && + g->data_type != data_type); + })); +again: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_extent_entry_for_each(ptrs, entry) { + if (extent_entry_type(entry) == BCH_EXTENT_ENTRY_stripe_ptr) { + struct gc_stripe *m = genradix_ptr(&c->gc_stripes, + entry->stripe_ptr.idx); + union bch_extent_entry *next_ptr; + + bkey_extent_entry_for_each_from(ptrs, next_ptr, entry) + if (extent_entry_type(next_ptr) == BCH_EXTENT_ENTRY_ptr) + goto found; + next_ptr = NULL; +found: + if (!next_ptr) { + bch_err(c, "aieee, found stripe ptr with no data ptr"); + continue; + } + + if (!m || !m->alive || + !__bch2_ptr_matches_stripe(&m->ptrs[entry->stripe_ptr.block], + &next_ptr->ptr, + m->sectors)) { + bch2_bkey_extent_entry_drop(new, entry); + goto again; + } + } + } + } + + ret = bch2_journal_key_insert_take(c, btree_id, level, new); + if (ret) { + kfree(new); + goto err; + } + + if (level) + bch2_btree_node_update_key_early(trans, btree_id, level - 1, *k, new); + + if (0) { + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, *k); + bch_info(c, "updated %s", buf.buf); + + printbuf_reset(&buf); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(new)); + bch_info(c, "new key %s", buf.buf); + } + + *k = bkey_i_to_s_c(new); + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* marking of btree keys/nodes: */ + +static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, + unsigned level, bool is_root, + struct bkey_s_c *k, + bool initial) +{ + struct bch_fs *c = trans->c; + struct bkey deleted = KEY(0, 0, 0); + struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL }; + unsigned flags = + BTREE_TRIGGER_GC| + (initial ? 
BTREE_TRIGGER_NOATOMIC : 0); + int ret = 0; + + deleted.p = k->k->p; + + if (initial) { + BUG_ON(bch2_journal_seq_verify && + k->k->version.lo > atomic64_read(&c->journal.seq)); + + ret = bch2_check_fix_ptrs(trans, btree_id, level, is_root, k); + if (ret) + goto err; + + if (fsck_err_on(k->k->version.lo > atomic64_read(&c->key_version), c, + "key version number higher than recorded: %llu > %llu", + k->k->version.lo, + atomic64_read(&c->key_version))) + atomic64_set(&c->key_version, k->k->version.lo); + } + + ret = commit_do(trans, NULL, NULL, 0, + bch2_mark_key(trans, btree_id, level, old, *k, flags)); +fsck_err: +err: + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int btree_gc_mark_node(struct btree_trans *trans, struct btree *b, bool initial) +{ + struct bch_fs *c = trans->c; + struct btree_node_iter iter; + struct bkey unpacked; + struct bkey_s_c k; + struct bkey_buf prev, cur; + int ret = 0; + + if (!btree_node_type_needs_gc(btree_node_type(b))) + return 0; + + bch2_btree_node_iter_init_from_start(&iter, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); + + while ((k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked)).k) { + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, false, + &k, initial); + if (ret) + break; + + bch2_btree_node_iter_advance(&iter, b); + + if (b->c.level) { + bch2_bkey_buf_reassemble(&cur, c, k); + + ret = bch2_gc_check_topology(c, b, &prev, cur, + bch2_btree_node_iter_end(&iter)); + if (ret) + break; + } + } + + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + return ret; +} + +static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree_id, + bool initial, bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct btree *b; + unsigned depth = metadata_only ? 
1 : 0; + int ret = 0; + + gc_pos_set(c, gc_pos_btree(btree_id, POS_MIN, 0)); + + __for_each_btree_node(trans, iter, btree_id, POS_MIN, + 0, depth, BTREE_ITER_PREFETCH, b, ret) { + bch2_verify_btree_nr_keys(b); + + gc_pos_set(c, gc_pos_btree_node(b)); + + ret = btree_gc_mark_node(trans, b, initial); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + mutex_lock(&c->btree_root_lock); + b = bch2_btree_id_root(c, btree_id)->b; + if (!btree_node_fake(b)) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, + true, &k, initial); + } + gc_pos_set(c, gc_pos_btree_root(b->c.btree_id)); + mutex_unlock(&c->btree_root_lock); + + return ret; +} + +static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b, + unsigned target_depth) +{ + struct bch_fs *c = trans->c; + struct btree_and_journal_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; + struct printbuf buf = PRINTBUF; + int ret = 0; + + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur); + bkey_init(&prev.k->k); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); + + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level, + false, &k, true); + if (ret) + goto fsck_err; + + if (b->c.level) { + bch2_bkey_buf_reassemble(&cur, c, k); + k = bkey_i_to_s_c(cur.k); + + bch2_btree_and_journal_iter_advance(&iter); + + ret = bch2_gc_check_topology(c, b, + &prev, cur, + !bch2_btree_and_journal_iter_peek(&iter).k); + if (ret) + goto fsck_err; + } else { + bch2_btree_and_journal_iter_advance(&iter); + } + } + + if (b->c.level > target_depth) { + bch2_btree_and_journal_iter_exit(&iter); + bch2_btree_and_journal_iter_init_node_iter(&iter, c, b); + + while ((k = bch2_btree_and_journal_iter_peek(&iter)).k) { + struct btree *child; + + bch2_bkey_buf_reassemble(&cur, c, k); + bch2_btree_and_journal_iter_advance(&iter); + + child = bch2_btree_node_get_noiter(trans, cur.k, + b->c.btree_id, b->c.level - 1, + false); + ret = PTR_ERR_OR_ZERO(child); + + if (ret == -EIO) { + bch2_topology_error(c); + + if (__fsck_err(c, + FSCK_CAN_FIX| + FSCK_CAN_IGNORE| + FSCK_NO_RATELIMIT, + "Unreadable btree node at btree %s level %u:\n" + " %s", + bch2_btree_ids[b->c.btree_id], + b->c.level - 1, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur.k)), buf.buf)) && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) { + ret = -BCH_ERR_need_topology_repair; + bch_info(c, "Halting mark and sweep to start topology repair pass"); + goto fsck_err; + } else { + /* Continue marking when opted to not + * fix the error: */ + ret = 0; + set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags); + continue; + } + } else if (ret) { + bch_err_msg(c, ret, "getting btree node"); + break; + } + + ret = bch2_gc_btree_init_recurse(trans, child, + target_depth); + six_unlock_read(&child->c.lock); + + if (ret) + break; + } + } +fsck_err: + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + bch2_btree_and_journal_iter_exit(&iter); + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_btree_init(struct btree_trans *trans, + enum btree_id btree_id, + bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct btree *b; + unsigned target_depth = metadata_only ? 
1 : 0; + struct printbuf buf = PRINTBUF; + int ret = 0; + + b = bch2_btree_id_root(c, btree_id)->b; + + if (btree_node_fake(b)) + return 0; + + six_lock_read(&b->c.lock, NULL, NULL); + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->min_key); + if (mustfix_fsck_err_on(!bpos_eq(b->data->min_key, POS_MIN), c, + "btree root with incorrect min_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto fsck_err; + } + + printbuf_reset(&buf); + bch2_bpos_to_text(&buf, b->data->max_key); + if (mustfix_fsck_err_on(!bpos_eq(b->data->max_key, SPOS_MAX), c, + "btree root with incorrect max_key: %s", buf.buf)) { + bch_err(c, "repair unimplemented"); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto fsck_err; + } + + if (b->c.level >= target_depth) + ret = bch2_gc_btree_init_recurse(trans, b, target_depth); + + if (!ret) { + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + + ret = bch2_gc_mark_key(trans, b->c.btree_id, b->c.level + 1, true, + &k, true); + } +fsck_err: + six_unlock_read(&b->c.lock); + + if (ret < 0) + bch_err_fn(c, ret); + printbuf_exit(&buf); + return ret; +} + +static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) +{ + return (int) btree_id_to_gc_phase(l) - + (int) btree_id_to_gc_phase(r); +} + +static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only) +{ + struct btree_trans trans; + enum btree_id ids[BTREE_ID_NR]; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + if (initial) + trans.is_initial_gc = true; + + for (i = 0; i < BTREE_ID_NR; i++) + ids[i] = i; + bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); + + for (i = 0; i < BTREE_ID_NR && !ret; i++) + ret = initial + ? bch2_gc_btree_init(&trans, ids[i], metadata_only) + : bch2_gc_btree(&trans, ids[i], initial, metadata_only); + + for (i = BTREE_ID_NR; i < btree_id_nr_alive(c) && !ret; i++) { + if (!bch2_btree_id_root(c, i)->alive) + continue; + + ret = initial + ? 
bch2_gc_btree_init(&trans, i, metadata_only) + : bch2_gc_btree(&trans, i, initial, metadata_only); + } + + if (ret < 0) + bch_err_fn(c, ret); + + bch2_trans_exit(&trans); + return ret; +} + +static void mark_metadata_sectors(struct bch_fs *c, struct bch_dev *ca, + u64 start, u64 end, + enum bch_data_type type, + unsigned flags) +{ + u64 b = sector_to_bucket(ca, start); + + do { + unsigned sectors = + min_t(u64, bucket_to_sector(ca, b + 1), end) - start; + + bch2_mark_metadata_bucket(c, ca, b, type, sectors, + gc_phase(GC_PHASE_SB), flags); + b++; + start += sectors; + } while (start < end); +} + +static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca, + unsigned flags) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + unsigned i; + u64 b; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) + mark_metadata_sectors(c, ca, 0, BCH_SB_SECTOR, + BCH_DATA_sb, flags); + + mark_metadata_sectors(c, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_sb, flags); + } + + for (i = 0; i < ca->journal.nr; i++) { + b = ca->journal.buckets[i]; + bch2_mark_metadata_bucket(c, ca, b, BCH_DATA_journal, + ca->mi.bucket_size, + gc_phase(GC_PHASE_SB), flags); + } +} + +static void bch2_mark_superblocks(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + mutex_lock(&c->sb_lock); + gc_pos_set(c, gc_phase(GC_PHASE_SB)); + + for_each_online_member(ca, c, i) + bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC); + mutex_unlock(&c->sb_lock); +} + +#if 0 +/* Also see bch2_pending_btree_node_free_insert_done() */ +static void bch2_mark_pending_btree_node_frees(struct bch_fs *c) +{ + struct btree_update *as; + struct pending_btree_node_free *d; + + mutex_lock(&c->btree_interior_update_lock); + gc_pos_set(c, gc_phase(GC_PHASE_PENDING_DELETE)); + + for_each_pending_btree_node_free(c, as, d) + if (d->index_update_done) + bch2_mark_key(c, bkey_i_to_s_c(&d->key), BTREE_TRIGGER_GC); + + mutex_unlock(&c->btree_interior_update_lock); +} +#endif + +static void bch2_gc_free(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + genradix_free(&c->reflink_gc_table); + genradix_free(&c->gc_stripes); + + for_each_member_device(ca, c, i) { + kvpfree(rcu_dereference_protected(ca->buckets_gc, 1), + sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket)); + ca->buckets_gc = NULL; + + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; + } + + free_percpu(c->usage_gc); + c->usage_gc = NULL; +} + +static int bch2_gc_done(struct bch_fs *c, + bool initial, bool metadata_only) +{ + struct bch_dev *ca = NULL; + struct printbuf buf = PRINTBUF; + bool verify = !metadata_only && + !c->opts.reconstruct_alloc && + (!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info))); + unsigned i, dev; + int ret = 0; + + percpu_down_write(&c->mark_lock); + +#define copy_field(_f, _msg, ...) \ + if (dst->_f != src->_f && \ + (!verify || \ + fsck_err(c, _msg ": got %llu, should be %llu" \ + , ##__VA_ARGS__, dst->_f, src->_f))) \ + dst->_f = src->_f +#define copy_stripe_field(_f, _msg, ...) \ + if (dst->_f != src->_f && \ + (!verify || \ + fsck_err(c, "stripe %zu has wrong "_msg \ + ": got %u, should be %u", \ + iter.pos, ##__VA_ARGS__, \ + dst->_f, src->_f))) \ + dst->_f = src->_f +#define copy_dev_field(_f, _msg, ...) \ + copy_field(_f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__) +#define copy_fs_field(_f, _msg, ...) 
\ + copy_field(_f, "fs has wrong " _msg, ##__VA_ARGS__) + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for_each_member_device(ca, c, dev) { + struct bch_dev_usage *dst = ca->usage_base; + struct bch_dev_usage *src = (void *) + bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc, + dev_usage_u64s()); + + copy_dev_field(buckets_ec, "buckets_ec"); + + for (i = 0; i < BCH_DATA_NR; i++) { + copy_dev_field(d[i].buckets, "%s buckets", bch2_data_types[i]); + copy_dev_field(d[i].sectors, "%s sectors", bch2_data_types[i]); + copy_dev_field(d[i].fragmented, "%s fragmented", bch2_data_types[i]); + } + }; + + { + unsigned nr = fs_usage_u64s(c); + struct bch_fs_usage *dst = c->usage_base; + struct bch_fs_usage *src = (void *) + bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr); + + copy_fs_field(hidden, "hidden"); + copy_fs_field(btree, "btree"); + + if (!metadata_only) { + copy_fs_field(data, "data"); + copy_fs_field(cached, "cached"); + copy_fs_field(reserved, "reserved"); + copy_fs_field(nr_inodes,"nr_inodes"); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + copy_fs_field(persistent_reserved[i], + "persistent_reserved[%i]", i); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + if (metadata_only && + (e->data_type == BCH_DATA_user || + e->data_type == BCH_DATA_cached)) + continue; + + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, e); + + copy_fs_field(replicas[i], "%s", buf.buf); + } + } + +#undef copy_fs_field +#undef copy_dev_field +#undef copy_stripe_field +#undef copy_field +fsck_err: + if (ca) + percpu_ref_put(&ca->ref); + if (ret) + bch_err_fn(c, ret); + + percpu_up_write(&c->mark_lock); + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_start(struct bch_fs *c) +{ + struct bch_dev *ca = NULL; + unsigned i; + + BUG_ON(c->usage_gc); + + c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64), + sizeof(u64), GFP_KERNEL); + if (!c->usage_gc) { + bch_err(c, "error allocating c->usage_gc"); + return -BCH_ERR_ENOMEM_gc_start; + } + + for_each_member_device(ca, c, i) { + BUG_ON(ca->usage_gc); + + ca->usage_gc = alloc_percpu(struct bch_dev_usage); + if (!ca->usage_gc) { + bch_err(c, "error allocating ca->usage_gc"); + percpu_ref_put(&ca->ref); + return -BCH_ERR_ENOMEM_gc_start; + } + + this_cpu_write(ca->usage_gc->d[BCH_DATA_free].buckets, + ca->mi.nbuckets - ca->mi.first_bucket); + } + + return 0; +} + +static int bch2_gc_reset(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + free_percpu(ca->usage_gc); + ca->usage_gc = NULL; + } + + free_percpu(c->usage_gc); + c->usage_gc = NULL; + + return bch2_gc_start(c); +} + +/* returns true if not equal */ +static inline bool bch2_alloc_v4_cmp(struct bch_alloc_v4 l, + struct bch_alloc_v4 r) +{ + return l.gen != r.gen || + l.oldest_gen != r.oldest_gen || + l.data_type != r.data_type || + l.dirty_sectors != r.dirty_sectors || + l.cached_sectors != r.cached_sectors || + l.stripe_redundancy != r.stripe_redundancy || + l.stripe != r.stripe; +} + +static int bch2_alloc_write_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + bool metadata_only) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, iter->pos.inode); + struct bucket gc, *b; + struct bkey_i_alloc_v4 *a; + struct bch_alloc_v4 old_convert, new; + const struct bch_alloc_v4 *old; + enum bch_data_type type; + int ret; + + if (bkey_ge(iter->pos, POS(ca->dev_idx, 
ca->mi.nbuckets))) + return 1; + + old = bch2_alloc_to_v4(k, &old_convert); + new = *old; + + percpu_down_read(&c->mark_lock); + b = gc_bucket(ca, iter->pos.offset); + + /* + * b->data_type doesn't yet include need_discard & need_gc_gen states - + * fix that here: + */ + type = __alloc_data_type(b->dirty_sectors, + b->cached_sectors, + b->stripe, + *old, + b->data_type); + if (b->data_type != type) { + struct bch_dev_usage *u; + + preempt_disable(); + u = this_cpu_ptr(ca->usage_gc); + u->d[b->data_type].buckets--; + b->data_type = type; + u->d[b->data_type].buckets++; + preempt_enable(); + } + + gc = *b; + percpu_up_read(&c->mark_lock); + + if (metadata_only && + gc.data_type != BCH_DATA_sb && + gc.data_type != BCH_DATA_journal && + gc.data_type != BCH_DATA_btree) + return 0; + + if (gen_after(old->gen, gc.gen)) + return 0; + + if (c->opts.reconstruct_alloc || + fsck_err_on(new.data_type != gc.data_type, c, + "bucket %llu:%llu gen %u has wrong data_type" + ": got %s, should be %s", + iter->pos.inode, iter->pos.offset, + gc.gen, + bch2_data_types[new.data_type], + bch2_data_types[gc.data_type])) + new.data_type = gc.data_type; + +#define copy_bucket_field(_f) \ + if (c->opts.reconstruct_alloc || \ + fsck_err_on(new._f != gc._f, c, \ + "bucket %llu:%llu gen %u data type %s has wrong " #_f \ + ": got %u, should be %u", \ + iter->pos.inode, iter->pos.offset, \ + gc.gen, \ + bch2_data_types[gc.data_type], \ + new._f, gc._f)) \ + new._f = gc._f; \ + + copy_bucket_field(gen); + copy_bucket_field(dirty_sectors); + copy_bucket_field(cached_sectors); + copy_bucket_field(stripe_redundancy); + copy_bucket_field(stripe); +#undef copy_bucket_field + + if (!bch2_alloc_v4_cmp(*old, new)) + return 0; + + a = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; + + a->v = new; + + /* + * The trigger normally makes sure this is set, but we're not running + * triggers: + */ + if (a->v.data_type == BCH_DATA_cached && !a->v.io_time[READ]) + a->v.io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now)); + + ret = bch2_trans_update(trans, iter, &a->k_i, BTREE_TRIGGER_NORUN); +fsck_err: + return ret; +} + +static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + unsigned i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS(ca->dev_idx, ca->mi.first_bucket), + BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW, + bch2_alloc_write_key(&trans, &iter, k, metadata_only)); + + if (ret < 0) { + bch_err(c, "error writing alloc info: %s", bch2_err_str(ret)); + percpu_ref_put(&ca->ref); + break; + } + } + + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; +} + +static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only) +{ + struct bch_dev *ca; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bucket *g; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + unsigned i; + int ret; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) + + ca->mi.nbuckets * sizeof(struct bucket), + GFP_KERNEL|__GFP_ZERO); + if (!buckets) { + percpu_ref_put(&ca->ref); + bch_err(c, "error allocating ca->buckets[gc]"); + return -BCH_ERR_ENOMEM_gc_alloc_start; + } + + buckets->first_bucket = ca->mi.first_bucket; + buckets->nbuckets = ca->mi.nbuckets; + rcu_assign_pointer(ca->buckets_gc, buckets); + }; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_alloc, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + ca = bch_dev_bkey_exists(c, k.k->p.inode); + g = gc_bucket(ca, k.k->p.offset); + + a = bch2_alloc_to_v4(k, &a_convert); + + g->gen_valid = 1; + g->gen = a->gen; + + if (metadata_only && + (a->data_type == BCH_DATA_user || + a->data_type == BCH_DATA_cached || + a->data_type == BCH_DATA_parity)) { + g->data_type = a->data_type; + g->dirty_sectors = a->dirty_sectors; + g->cached_sectors = a->cached_sectors; + g->stripe = a->stripe; + g->stripe_redundancy = a->stripe_redundancy; + } + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error reading alloc info at gc start: %s", bch2_err_str(ret)); + + return ret; +} + +static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only) +{ + struct bch_dev *ca; + unsigned i; + + for_each_member_device(ca, c, i) { + struct bucket_array *buckets = gc_bucket_array(ca); + struct bucket *g; + + for_each_bucket(g, buckets) { + if (metadata_only && + (g->data_type == BCH_DATA_user || + g->data_type == BCH_DATA_cached || + g->data_type == BCH_DATA_parity)) + continue; + g->data_type = 0; + g->dirty_sectors = 0; + g->cached_sectors = 0; + } + }; +} + +static int bch2_gc_write_reflink_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + size_t *idx) +{ + struct bch_fs *c = trans->c; + const __le64 *refcount = bkey_refcount_c(k); + struct printbuf buf = PRINTBUF; + struct reflink_gc *r; + int ret = 0; + + if (!refcount) + return 0; + + while ((r = genradix_ptr(&c->reflink_gc_table, *idx)) && + r->offset < k.k->p.offset) + ++*idx; + + if (!r || + r->offset != k.k->p.offset || + r->size != k.k->size) { + bch_err(c, "unexpected inconsistency walking reflink table at gc finish"); + return -EINVAL; + } + + if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), c, + "reflink key has wrong refcount:\n" + " %s\n" + " should be %u", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf), + r->refcount)) { + struct bkey_i *new = bch2_bkey_make_mut(trans, iter, &k, 0); + + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + if (!r->refcount) + new->k.type = KEY_TYPE_deleted; + else + *bkey_refcount(new) = cpu_to_le64(r->refcount); + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + size_t idx = 0; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_gc_write_reflink_key(&trans, &iter, k, 
&idx)); + + c->reflink_gc_nr = 0; + bch2_trans_exit(&trans); + return ret; +} + +static int bch2_gc_reflink_start(struct bch_fs *c, + bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct reflink_gc *r; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + c->reflink_gc_nr = 0; + + for_each_btree_key(&trans, iter, BTREE_ID_reflink, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + const __le64 *refcount = bkey_refcount_c(k); + + if (!refcount) + continue; + + r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, + GFP_KERNEL); + if (!r) { + ret = -BCH_ERR_ENOMEM_gc_reflink_start; + break; + } + + r->offset = k.k->p.offset; + r->size = k.k->size; + r->refcount = 0; + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + return ret; +} + +static void bch2_gc_reflink_reset(struct bch_fs *c, bool metadata_only) +{ + struct genradix_iter iter; + struct reflink_gc *r; + + genradix_for_each(&c->reflink_gc_table, iter, r) + r->refcount = 0; +} + +static int bch2_gc_write_stripes_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + const struct bch_stripe *s; + struct gc_stripe *m; + bool bad = false; + unsigned i; + int ret = 0; + + if (k.k->type != KEY_TYPE_stripe) + return 0; + + s = bkey_s_c_to_stripe(k).v; + m = genradix_ptr(&c->gc_stripes, k.k->p.offset); + + for (i = 0; i < s->nr_blocks; i++) { + u32 old = stripe_blockcount_get(s, i); + u32 new = (m ? m->block_sectors[i] : 0); + + if (old != new) { + prt_printf(&buf, "stripe block %u has wrong sector count: got %u, should be %u\n", + i, old, new); + bad = true; + } + } + + if (bad) + bch2_bkey_val_to_text(&buf, c, k); + + if (fsck_err_on(bad, c, "%s", buf.buf)) { + struct bkey_i_stripe *new; + + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bkey_reassemble(&new->k_i, k); + + for (i = 0; i < new->v.nr_blocks; i++) + stripe_blockcount_set(&new->v, i, m ? m->block_sectors[i] : 0); + + ret = bch2_trans_update(trans, iter, &new->k_i, 0); + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + if (metadata_only) + return 0; + + bch2_trans_init(&trans, c, 0, 0); + + ret = for_each_btree_key_commit(&trans, iter, + BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_gc_write_stripes_key(&trans, &iter, k)); + + bch2_trans_exit(&trans); + return ret; +} + +static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only) +{ + genradix_free(&c->gc_stripes); +} + +/** + * bch2_gc - walk _all_ references to buckets, and recompute them: + * + * Order matters here: + * - Concurrent GC relies on the fact that we have a total ordering for + * everything that GC walks - see gc_will_visit_node(), + * gc_will_visit_root() + * + * - also, references move around in the course of index updates and + * various other crap: everything needs to agree on the ordering + * references are allowed to move around in - e.g., we're allowed to + * start with a reference owned by an open_bucket (the allocator) and + * move it to the btree, but not the reverse. 
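+ *
+ * (In code terms, "the ordering" referred to here is struct gc_pos as
+ * compared by gc_pos_cmp() in btree_gc.h: phase first, then btree
+ * position, then level.)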
+ * + * This is necessary to ensure that gc doesn't miss references that + * move around - if references move backwards in the ordering GC + * uses, GC could skip past them + */ +int bch2_gc(struct bch_fs *c, bool initial, bool metadata_only) +{ + unsigned iter = 0; + int ret; + + lockdep_assert_held(&c->state_lock); + + down_write(&c->gc_lock); + + bch2_btree_interior_updates_flush(c); + + ret = bch2_gc_start(c) ?: + bch2_gc_alloc_start(c, metadata_only) ?: + bch2_gc_reflink_start(c, metadata_only); + if (ret) + goto out; +again: + gc_pos_set(c, gc_phase(GC_PHASE_START)); + + bch2_mark_superblocks(c); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) || + (BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb) && + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations && + c->opts.fix_errors != FSCK_FIX_no)) { + bch_info(c, "Starting topology repair pass"); + ret = bch2_repair_topology(c); + if (ret) + goto out; + bch_info(c, "Topology repair pass done"); + + set_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags); + } + + ret = bch2_gc_btrees(c, initial, metadata_only); + + if (ret == -BCH_ERR_need_topology_repair && + !test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags) && + c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) { + set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, true); + ret = 0; + } + + if (ret == -BCH_ERR_need_topology_repair) + ret = -BCH_ERR_fsck_errors_not_fixed; + + if (ret) + goto out; + +#if 0 + bch2_mark_pending_btree_node_frees(c); +#endif + c->gc_count++; + + if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) || + (!iter && bch2_test_restart_gc)) { + if (iter++ > 2) { + bch_info(c, "Unable to fix bucket gens, looping"); + ret = -EINVAL; + goto out; + } + + /* + * XXX: make sure gens we fixed got saved + */ + bch_info(c, "Second GC pass needed, restarting:"); + clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags); + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + + bch2_gc_stripes_reset(c, metadata_only); + bch2_gc_alloc_reset(c, metadata_only); + bch2_gc_reflink_reset(c, metadata_only); + ret = bch2_gc_reset(c); + if (ret) + goto out; + + /* flush fsck errors, reset counters */ + bch2_flush_fsck_errs(c); + goto again; + } +out: + if (!ret) { + bch2_journal_block(&c->journal); + + ret = bch2_gc_stripes_done(c, metadata_only) ?: + bch2_gc_reflink_done(c, metadata_only) ?: + bch2_gc_alloc_done(c, metadata_only) ?: + bch2_gc_done(c, initial, metadata_only); + + bch2_journal_unblock(&c->journal); + } + + percpu_down_write(&c->mark_lock); + /* Indicates that gc is no longer in progress: */ + __gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING)); + + bch2_gc_free(c); + percpu_up_write(&c->mark_lock); + + up_write(&c->gc_lock); + + /* + * At startup, allocations can happen directly instead of via the + * allocator thread - issue wakeup in case they blocked on gc_lock: + */ + closure_wake_up(&c->freelist_wait); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int gc_btree_gens_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + struct bkey_i *u; + int ret; + + percpu_down_read(&c->mark_lock); + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ptr_stale(ca, ptr) > 16) { + percpu_up_read(&c->mark_lock); + goto update; + } + } + + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + u8 *gen = 
&ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; + } + percpu_up_read(&c->mark_lock); + return 0; +update: + u = bch2_bkey_make_mut(trans, iter, &k, 0); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bch2_extent_normalize(c, bkey_i_to_s(u)); + return 0; +} + +static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_dev *ca = bch_dev_bkey_exists(trans->c, iter->pos.inode); + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert); + struct bkey_i_alloc_v4 *a_mut; + int ret; + + if (a->oldest_gen == ca->oldest_gen[iter->pos.offset]) + return 0; + + a_mut = bch2_alloc_to_v4_mut(trans, k); + ret = PTR_ERR_OR_ZERO(a_mut); + if (ret) + return ret; + + a_mut->v.oldest_gen = ca->oldest_gen[iter->pos.offset]; + a_mut->v.data_type = alloc_data_type(a_mut->v, a_mut->v.data_type); + + return bch2_trans_update(trans, iter, &a_mut->k_i, 0); +} + +int bch2_gc_gens(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_dev *ca; + u64 b, start_time = local_clock(); + unsigned i; + int ret; + + /* + * Ideally we would be using state_lock and not gc_lock here, but that + * introduces a deadlock in the RO path - we currently take the state + * lock at the start of going RO, thus the gc thread may get stuck: + */ + if (!mutex_trylock(&c->gc_gens_lock)) + return 0; + + trace_and_count(c, gc_gens_start, c); + down_read(&c->gc_lock); + bch2_trans_init(&trans, c, 0, 0); + + for_each_member_device(ca, c, i) { + struct bucket_gens *gens; + + BUG_ON(ca->oldest_gen); + + ca->oldest_gen = kvmalloc(ca->mi.nbuckets, GFP_KERNEL); + if (!ca->oldest_gen) { + percpu_ref_put(&ca->ref); + ret = -BCH_ERR_ENOMEM_gc_gens; + goto err; + } + + gens = bucket_gens(ca); + + for (b = gens->first_bucket; + b < gens->nbuckets; b++) + ca->oldest_gen[b] = gens->b[b]; + } + + for (i = 0; i < BTREE_ID_NR; i++) + if (btree_type_has_ptrs(i)) { + struct btree_iter iter; + struct bkey_s_c k; + + c->gc_gens_btree = i; + c->gc_gens_pos = POS_MIN; + ret = for_each_btree_key_commit(&trans, iter, i, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_NOFAIL, + gc_btree_gens_key(&trans, &iter, k)); + if (ret && !bch2_err_matches(ret, EROFS)) + bch_err(c, "error recalculating oldest_gen: %s", bch2_err_str(ret)); + if (ret) + goto err; + } + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_alloc, + POS_MIN, + BTREE_ITER_PREFETCH, + k, + NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_alloc_write_oldest_gen(&trans, &iter, k)); + if (ret && !bch2_err_matches(ret, EROFS)) + bch_err(c, "error writing oldest_gen: %s", bch2_err_str(ret)); + if (ret) + goto err; + + c->gc_gens_btree = 0; + c->gc_gens_pos = POS_MIN; + + c->gc_count++; + + bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time); + trace_and_count(c, gc_gens_end, c); +err: + for_each_member_device(ca, c, i) { + kvfree(ca->oldest_gen); + ca->oldest_gen = NULL; + } + + bch2_trans_exit(&trans); + up_read(&c->gc_lock); + mutex_unlock(&c->gc_gens_lock); + return ret; +} + +static int bch2_gc_thread(void *arg) +{ + struct bch_fs *c = arg; + struct io_clock *clock = &c->io_clock[WRITE]; + unsigned long last = atomic64_read(&clock->now); + unsigned last_kick = atomic_read(&c->kick_gc); + int ret; + + set_freezable(); + + while (1) { + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (kthread_should_stop()) { + 
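/*
+ * set_current_state() above put us in TASK_INTERRUPTIBLE; go back to
+ * TASK_RUNNING before returning:
+ */ +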
__set_current_state(TASK_RUNNING); + return 0; + } + + if (atomic_read(&c->kick_gc) != last_kick) + break; + + if (c->btree_gc_periodic) { + unsigned long next = last + c->capacity / 16; + + if (atomic64_read(&clock->now) >= next) + break; + + bch2_io_clock_schedule_timeout(clock, next); + } else { + schedule(); + } + + try_to_freeze(); + } + __set_current_state(TASK_RUNNING); + + last = atomic64_read(&clock->now); + last_kick = atomic_read(&c->kick_gc); + + /* + * Full gc is currently incompatible with btree key cache: + */ +#if 0 + ret = bch2_gc(c, false, false); +#else + ret = bch2_gc_gens(c); +#endif + if (ret < 0) + bch_err(c, "btree gc failed: %s", bch2_err_str(ret)); + + debug_check_no_locks_held(); + } + + return 0; +} + +void bch2_gc_thread_stop(struct bch_fs *c) +{ + struct task_struct *p; + + p = c->gc_thread; + c->gc_thread = NULL; + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_gc_thread_start(struct bch_fs *c) +{ + struct task_struct *p; + + if (c->gc_thread) + return 0; + + p = kthread_create(bch2_gc_thread, c, "bch-gc/%s", c->name); + if (IS_ERR(p)) { + bch_err(c, "error creating gc thread: %s", bch2_err_str(PTR_ERR(p))); + return PTR_ERR(p); + } + + get_task_struct(p); + c->gc_thread = p; + wake_up_process(p); + return 0; +} diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h new file mode 100644 index 000000000..95d803b57 --- /dev/null +++ b/fs/bcachefs/btree_gc.h @@ -0,0 +1,112 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_GC_H +#define _BCACHEFS_BTREE_GC_H + +#include "btree_types.h" + +int bch2_gc(struct bch_fs *, bool, bool); +int bch2_gc_gens(struct bch_fs *); +void bch2_gc_thread_stop(struct bch_fs *); +int bch2_gc_thread_start(struct bch_fs *); + +/* + * For concurrent mark and sweep (with other index updates), we define a total + * ordering of _all_ references GC walks: + * + * Note that some references will have the same GC position as others - e.g. + * everything within the same btree node; in those cases we're relying on + * whatever locking exists for where those references live, i.e. the write lock + * on a btree node. + * + * That locking is also required to ensure GC doesn't pass the updater in + * between the updater adding/removing the reference and updating the GC marks; + * without that, we would at best double count sometimes. + * + * That part is important - whenever calling bch2_mark_pointers(), a lock _must_ + * be held that prevents GC from passing the position the updater is at. + * + * (What about the start of gc, when we're clearing all the marks? GC clears the + * mark with the gc pos seqlock held, and bch_mark_bucket checks against the gc + * position inside its cmpxchg loop, so crap magically works). 
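+ *
+ * Roughly, the update-side pattern the above describes is (pseudocode
+ * only - apart from gc_visited() below, these names are illustrative):
+ *
+ *	// with a lock held that prevents gc from passing our position:
+ *	do_index_update();
+ *	if (gc_visited(c, pos))		// gc already walked past pos
+ *		update_gc_marks_too();	// else gc marks it when it gets there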
+ */ + +/* Position of (the start of) a gc phase: */ +static inline struct gc_pos gc_phase(enum gc_phase phase) +{ + return (struct gc_pos) { + .phase = phase, + .pos = POS_MIN, + .level = 0, + }; +} + +static inline int gc_pos_cmp(struct gc_pos l, struct gc_pos r) +{ + return cmp_int(l.phase, r.phase) ?: + bpos_cmp(l.pos, r.pos) ?: + cmp_int(l.level, r.level); +} + +static inline enum gc_phase btree_id_to_gc_phase(enum btree_id id) +{ + switch (id) { +#define x(name, v) case BTREE_ID_##name: return GC_PHASE_BTREE_##name; + BCH_BTREE_IDS() +#undef x + default: + BUG(); + } +} + +static inline struct gc_pos gc_pos_btree(enum btree_id id, + struct bpos pos, unsigned level) +{ + return (struct gc_pos) { + .phase = btree_id_to_gc_phase(id), + .pos = pos, + .level = level, + }; +} + +/* + * GC position of the pointers within a btree node: note, _not_ for &b->key + * itself, that lives in the parent node: + */ +static inline struct gc_pos gc_pos_btree_node(struct btree *b) +{ + return gc_pos_btree(b->c.btree_id, b->key.k.p, b->c.level); +} + +/* + * GC position of the pointer to a btree root: we don't use + * gc_pos_pointer_to_btree_node() here to avoid a potential race with + * btree_split() increasing the tree depth - the new root will have level > the + * old root and thus have a greater gc position than the old root, but that + * would be incorrect since once gc has marked the root it's not coming back. + */ +static inline struct gc_pos gc_pos_btree_root(enum btree_id id) +{ + return gc_pos_btree(id, SPOS_MAX, BTREE_MAX_DEPTH); +} + +static inline bool gc_visited(struct bch_fs *c, struct gc_pos pos) +{ + unsigned seq; + bool ret; + + do { + seq = read_seqcount_begin(&c->gc_pos_lock); + ret = gc_pos_cmp(pos, c->gc_pos) <= 0; + } while (read_seqcount_retry(&c->gc_pos_lock, seq)); + + return ret; +} + +static inline void bch2_do_gc_gens(struct bch_fs *c) +{ + atomic_inc(&c->kick_gc); + if (c->gc_thread) + wake_up_process(c->gc_thread); +} + +#endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c new file mode 100644 index 000000000..fa1229eb1 --- /dev/null +++ b/fs/bcachefs/btree_io.c @@ -0,0 +1,2266 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "bkey_sort.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "checksum.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "super-io.h" +#include "trace.h" + +#include + +void bch2_btree_node_io_unlock(struct btree *b) +{ + EBUG_ON(!btree_node_write_in_flight(b)); + + clear_btree_node_write_in_flight_inner(b); + clear_btree_node_write_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} + +void bch2_btree_node_io_lock(struct btree *b) +{ + bch2_assert_btree_nodes_not_locked(); + + wait_on_bit_lock_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void __bch2_btree_node_wait_on_read(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void __bch2_btree_node_wait_on_write(struct btree *b) +{ + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void bch2_btree_node_wait_on_read(struct btree *b) +{ + bch2_assert_btree_nodes_not_locked(); + + wait_on_bit_io(&b->flags, 
BTREE_NODE_read_in_flight, + TASK_UNINTERRUPTIBLE); +} + +void bch2_btree_node_wait_on_write(struct btree *b) +{ + bch2_assert_btree_nodes_not_locked(); + + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight, + TASK_UNINTERRUPTIBLE); +} + +static void verify_no_dups(struct btree *b, + struct bkey_packed *start, + struct bkey_packed *end) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bkey_packed *k, *p; + + if (start == end) + return; + + for (p = start, k = bkey_p_next(start); + k != end; + p = k, k = bkey_p_next(k)) { + struct bkey l = bkey_unpack_key(b, p); + struct bkey r = bkey_unpack_key(b, k); + + BUG_ON(bpos_ge(l.p, bkey_start_pos(&r))); + } +#endif +} + +static void set_needs_whiteout(struct bset *i, int v) +{ + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) + k->needs_whiteout = v; +} + +static void btree_bounce_free(struct bch_fs *c, size_t size, + bool used_mempool, void *p) +{ + if (used_mempool) + mempool_free(p, &c->btree_bounce_pool); + else + vpfree(p, size); +} + +static void *btree_bounce_alloc_noprof(struct bch_fs *c, size_t size, + bool *used_mempool) +{ + unsigned flags = memalloc_nofs_save(); + void *p; + + BUG_ON(size > btree_bytes(c)); + + *used_mempool = false; + p = vpmalloc_noprof(size, __GFP_NOWARN|GFP_NOWAIT); + if (!p) { + *used_mempool = true; + p = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); + } + memalloc_nofs_restore(flags); + return p; +} +#define btree_bounce_alloc(_c, _size, _used_mempool) \ + alloc_hooks(btree_bounce_alloc_noprof(_c, _size, _used_mempool)) + +static void sort_bkey_ptrs(const struct btree *bt, + struct bkey_packed **ptrs, unsigned nr) +{ + unsigned n = nr, a = nr / 2, b, c, d; + + if (!a) + return; + + /* Heap sort: see lib/sort.c: */ + while (1) { + if (a) + a--; + else if (--n) + swap(ptrs[0], ptrs[n]); + else + break; + + for (b = a; c = 2 * b + 1, (d = c + 1) < n;) + b = bch2_bkey_cmp_packed(bt, + ptrs[c], + ptrs[d]) >= 0 ? 
c : d; + if (d == n) + b = c; + + while (b != a && + bch2_bkey_cmp_packed(bt, + ptrs[a], + ptrs[b]) >= 0) + b = (b - 1) / 2; + c = b; + while (b != a) { + b = (b - 1) / 2; + swap(ptrs[b], ptrs[c]); + } + } +} + +static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b) +{ + struct bkey_packed *new_whiteouts, **ptrs, **ptrs_end, *k; + bool used_mempool = false; + size_t bytes = b->whiteout_u64s * sizeof(u64); + + if (!b->whiteout_u64s) + return; + + new_whiteouts = btree_bounce_alloc(c, bytes, &used_mempool); + + ptrs = ptrs_end = ((void *) new_whiteouts + bytes); + + for (k = unwritten_whiteouts_start(c, b); + k != unwritten_whiteouts_end(c, b); + k = bkey_p_next(k)) + *--ptrs = k; + + sort_bkey_ptrs(b, ptrs, ptrs_end - ptrs); + + k = new_whiteouts; + + while (ptrs != ptrs_end) { + bkey_copy(k, *ptrs); + k = bkey_p_next(k); + ptrs++; + } + + verify_no_dups(b, new_whiteouts, + (void *) ((u64 *) new_whiteouts + b->whiteout_u64s)); + + memcpy_u64s(unwritten_whiteouts_start(c, b), + new_whiteouts, b->whiteout_u64s); + + btree_bounce_free(c, bytes, used_mempool, new_whiteouts); +} + +static bool should_compact_bset(struct btree *b, struct bset_tree *t, + bool compacting, enum compact_mode mode) +{ + if (!bset_dead_u64s(b, t)) + return false; + + switch (mode) { + case COMPACT_LAZY: + return should_compact_bset_lazy(b, t) || + (compacting && !bset_written(b, bset(b, t))); + case COMPACT_ALL: + return true; + default: + BUG(); + } +} + +static bool bch2_drop_whiteouts(struct btree *b, enum compact_mode mode) +{ + struct bset_tree *t; + bool ret = false; + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k, *n, *out, *start, *end; + struct btree_node_entry *src = NULL, *dst = NULL; + + if (t != b->set && !bset_written(b, i)) { + src = container_of(i, struct btree_node_entry, keys); + dst = max(write_block(b), + (void *) btree_bkey_last(b, t - 1)); + } + + if (src != dst) + ret = true; + + if (!should_compact_bset(b, t, ret, mode)) { + if (src != dst) { + memmove(dst, src, sizeof(*src) + + le16_to_cpu(src->keys.u64s) * + sizeof(u64)); + i = &dst->keys; + set_btree_bset(b, t, i); + } + continue; + } + + start = btree_bkey_first(b, t); + end = btree_bkey_last(b, t); + + if (src != dst) { + memmove(dst, src, sizeof(*src)); + i = &dst->keys; + set_btree_bset(b, t, i); + } + + out = i->start; + + for (k = start; k != end; k = n) { + n = bkey_p_next(k); + + if (!bkey_deleted(k)) { + bkey_copy(out, k); + out = bkey_p_next(out); + } else { + BUG_ON(k->needs_whiteout); + } + } + + i->u64s = cpu_to_le16((u64 *) out - i->_data); + set_btree_bset_end(b, t); + bch2_bset_set_no_aux_tree(b, t); + ret = true; + } + + bch2_verify_btree_nr_keys(b); + + bch2_btree_build_aux_trees(b); + + return ret; +} + +bool bch2_compact_whiteouts(struct bch_fs *c, struct btree *b, + enum compact_mode mode) +{ + return bch2_drop_whiteouts(b, mode); +} + +static void btree_node_sort(struct bch_fs *c, struct btree *b, + unsigned start_idx, + unsigned end_idx, + bool filter_whiteouts) +{ + struct btree_node *out; + struct sort_iter sort_iter; + struct bset_tree *t; + struct bset *start_bset = bset(b, &b->set[start_idx]); + bool used_mempool = false; + u64 start_time, seq = 0; + unsigned i, u64s = 0, bytes, shift = end_idx - start_idx - 1; + bool sorting_entire_node = start_idx == 0 && + end_idx == b->nsets; + + sort_iter_init(&sort_iter, b); + + for (t = b->set + start_idx; + t < b->set + end_idx; + t++) { + u64s += le16_to_cpu(bset(b, t)->u64s); + sort_iter_add(&sort_iter, + btree_bkey_first(b, 
t), + btree_bkey_last(b, t)); + } + + bytes = sorting_entire_node + ? btree_bytes(c) + : __vstruct_bytes(struct btree_node, u64s); + + out = btree_bounce_alloc(c, bytes, &used_mempool); + + start_time = local_clock(); + + u64s = bch2_sort_keys(out->keys.start, &sort_iter, filter_whiteouts); + + out->keys.u64s = cpu_to_le16(u64s); + + BUG_ON(vstruct_end(&out->keys) > (void *) out + bytes); + + if (sorting_entire_node) + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); + + /* Make sure we preserve bset journal_seq: */ + for (t = b->set + start_idx; t < b->set + end_idx; t++) + seq = max(seq, le64_to_cpu(bset(b, t)->journal_seq)); + start_bset->journal_seq = cpu_to_le64(seq); + + if (sorting_entire_node) { + unsigned u64s = le16_to_cpu(out->keys.u64s); + + BUG_ON(bytes != btree_bytes(c)); + + /* + * Our temporary buffer is the same size as the btree node's + * buffer, we can just swap buffers instead of doing a big + * memcpy() + */ + *out = *b->data; + out->keys.u64s = cpu_to_le16(u64s); + swap(out, b->data); + set_btree_bset(b, b->set, &b->data->keys); + } else { + start_bset->u64s = out->keys.u64s; + memcpy_u64s(start_bset->start, + out->keys.start, + le16_to_cpu(out->keys.u64s)); + } + + for (i = start_idx + 1; i < end_idx; i++) + b->nr.bset_u64s[start_idx] += + b->nr.bset_u64s[i]; + + b->nsets -= shift; + + for (i = start_idx + 1; i < b->nsets; i++) { + b->nr.bset_u64s[i] = b->nr.bset_u64s[i + shift]; + b->set[i] = b->set[i + shift]; + } + + for (i = b->nsets; i < MAX_BSETS; i++) + b->nr.bset_u64s[i] = 0; + + set_btree_bset_end(b, &b->set[start_idx]); + bch2_bset_set_no_aux_tree(b, &b->set[start_idx]); + + btree_bounce_free(c, bytes, used_mempool, out); + + bch2_verify_btree_nr_keys(b); +} + +void bch2_btree_sort_into(struct bch_fs *c, + struct btree *dst, + struct btree *src) +{ + struct btree_nr_keys nr; + struct btree_node_iter src_iter; + u64 start_time = local_clock(); + + BUG_ON(dst->nsets != 1); + + bch2_bset_set_no_aux_tree(dst, dst->set); + + bch2_btree_node_iter_init_from_start(&src_iter, src); + + nr = bch2_sort_repack(btree_bset_first(dst), + src, &src_iter, + &dst->format, + true); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_sort], + start_time); + + set_btree_bset_end(dst, dst->set); + + dst->nr.live_u64s += nr.live_u64s; + dst->nr.bset_u64s[0] += nr.bset_u64s[0]; + dst->nr.packed_keys += nr.packed_keys; + dst->nr.unpacked_keys += nr.unpacked_keys; + + bch2_verify_btree_nr_keys(dst); +} + +#define SORT_CRIT (4096 / sizeof(u64)) + +/* + * We're about to add another bset to the btree node, so if there's currently + * too many bsets - sort some of them together: + */ +static bool btree_node_compact(struct bch_fs *c, struct btree *b) +{ + unsigned unwritten_idx; + bool ret = false; + + for (unwritten_idx = 0; + unwritten_idx < b->nsets; + unwritten_idx++) + if (!bset_written(b, bset(b, &b->set[unwritten_idx]))) + break; + + if (b->nsets - unwritten_idx > 1) { + btree_node_sort(c, b, unwritten_idx, + b->nsets, false); + ret = true; + } + + if (unwritten_idx > 1) { + btree_node_sort(c, b, 0, unwritten_idx, false); + ret = true; + } + + return ret; +} + +void bch2_btree_build_aux_trees(struct btree *b) +{ + struct bset_tree *t; + + for_each_bset(b, t) + bch2_bset_build_aux_tree(b, t, + !bset_written(b, bset(b, t)) && + t == bset_tree_last(b)); +} + +/* + * If we have MAX_BSETS (3) bsets, should we sort them all down to just one? 
+ * + * The first bset is going to be of similar order to the size of the node, the + * last bset is bounded by btree_write_set_buffer(), which is set to keep the + * memmove on insert from being too expensive: the middle bset should, ideally, + * be the geometric mean of the first and the last. + * + * Returns true if the middle bset is greater than that geometric mean: + */ +static inline bool should_compact_all(struct bch_fs *c, struct btree *b) +{ + unsigned mid_u64s_bits = + (ilog2(btree_max_u64s(c)) + BTREE_WRITE_SET_U64s_BITS) / 2; + + return bset_u64s(&b->set[1]) > 1U << mid_u64s_bits; +} + +/* + * @bch2_btree_init_next - initialize a new (unwritten) bset that can then be + * inserted into + * + * Safe to call if there already is an unwritten bset - will only add a new bset + * if @b doesn't already have one. + * + * If we sorted (i.e. invalidated iterators), the iterators are re-initialized + * via bch2_trans_node_reinit_iter(). + */ +void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + struct btree_node_entry *bne; + bool reinit_iter = false; + + EBUG_ON(!six_lock_counts(&b->c.lock).n[SIX_LOCK_write]); + BUG_ON(bset_written(b, bset(b, &b->set[1]))); + BUG_ON(btree_node_just_written(b)); + + if (b->nsets == MAX_BSETS && + !btree_node_write_in_flight(b) && + should_compact_all(c, b)) { + bch2_btree_node_write(c, b, SIX_LOCK_write, + BTREE_WRITE_init_next_bset); + reinit_iter = true; + } + + if (b->nsets == MAX_BSETS && + btree_node_compact(c, b)) + reinit_iter = true; + + BUG_ON(b->nsets >= MAX_BSETS); + + bne = want_new_bset(c, b); + if (bne) + bch2_bset_init_next(c, b, bne); + + bch2_btree_build_aux_trees(b); + + if (reinit_iter) + bch2_trans_node_reinit_iter(trans, b); +} + +static void btree_pos_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) +{ + prt_printf(out, "%s level %u/%u\n ", + bch2_btree_ids[b->c.btree_id], + b->c.level, + bch2_btree_id_root(c, b->c.btree_id)->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); +} + +static void btree_err_msg(struct printbuf *out, struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned offset, int write) +{ + prt_printf(out, bch2_log_msg(c, "%s"), + write == READ + ? "error validating btree node " + : "corrupt btree node before write "); + if (ca) + prt_printf(out, "on %s ", ca->name); + prt_printf(out, "at btree "); + btree_pos_to_text(out, c, b); + + prt_printf(out, "\n node offset %u", b->written); + if (i) + prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); + prt_str(out, ": "); +} + +enum btree_err_type { + /* + * We can repair this locally, and we're after the checksum check so + * there's no need to try another replica: + */ + BTREE_ERR_FIXABLE, + /* + * We can repair this if we have to, but we should try reading another + * replica if we can: + */ + BTREE_ERR_WANT_RETRY, + /* + * Read another replica if we have one, otherwise consider the whole + * node bad: + */ + BTREE_ERR_MUST_RETRY, + BTREE_ERR_BAD_NODE, + BTREE_ERR_INCOMPATIBLE, +}; + +enum btree_validate_ret { + BTREE_RETRY_READ = 64, +}; + +static int __btree_err(enum btree_err_type type, + struct bch_fs *c, + struct bch_dev *ca, + struct btree *b, + struct bset *i, + int write, + bool have_retry, + const char *fmt, ...)
+{ + struct printbuf out = PRINTBUF; + va_list args; + int ret = -BCH_ERR_fsck_fix; + + btree_err_msg(&out, c, ca, b, i, b->written, write); + + va_start(args, fmt); + prt_vprintf(&out, fmt, args); + va_end(args); + + if (write == WRITE) { + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = c->opts.errors == BCH_ON_ERROR_continue + ? 0 + : -BCH_ERR_fsck_errors_not_fixed; + goto out; + } + + if (!have_retry && type == BTREE_ERR_WANT_RETRY) + type = BTREE_ERR_FIXABLE; + if (!have_retry && type == BTREE_ERR_MUST_RETRY) + type = BTREE_ERR_BAD_NODE; + + switch (type) { + case BTREE_ERR_FIXABLE: + mustfix_fsck_err(c, "%s", out.buf); + ret = -BCH_ERR_fsck_fix; + break; + case BTREE_ERR_WANT_RETRY: + case BTREE_ERR_MUST_RETRY: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = BTREE_RETRY_READ; + break; + case BTREE_ERR_BAD_NODE: + bch2_print_string_as_lines(KERN_ERR, out.buf); + bch2_topology_error(c); + ret = -BCH_ERR_need_topology_repair; + break; + case BTREE_ERR_INCOMPATIBLE: + bch2_print_string_as_lines(KERN_ERR, out.buf); + ret = -BCH_ERR_fsck_errors_not_fixed; + break; + default: + BUG(); + } +out: +fsck_err: + printbuf_exit(&out); + return ret; +} + +#define btree_err(type, c, ca, b, i, msg, ...) \ +({ \ + int _ret = __btree_err(type, c, ca, b, i, write, have_retry, msg, ##__VA_ARGS__);\ + \ + if (_ret != -BCH_ERR_fsck_fix) \ + goto fsck_err; \ + *saw_error = true; \ +}) + +#define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) + +/* + * When btree topology repair changes the start or end of a node, that might + * mean we have to drop keys that are no longer inside the node: + */ +__cold +void bch2_btree_node_drop_keys_outside_node(struct btree *b) +{ + struct bset_tree *t; + struct bkey_s_c k; + struct bkey unpacked; + struct btree_node_iter iter; + + for_each_bset(b, t) { + struct bset *i = bset(b, t); + struct bkey_packed *k; + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) + if (bkey_cmp_left_packed(b, k, &b->data->min_key) >= 0) + break; + + if (k != i->start) { + unsigned shift = (u64 *) k - (u64 *) i->start; + + memmove_u64s_down(i->start, k, + (u64 *) vstruct_end(i) - (u64 *) k); + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - shift); + set_btree_bset_end(b, t); + } + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) + if (bkey_cmp_left_packed(b, k, &b->data->max_key) > 0) + break; + + if (k != vstruct_last(i)) { + i->u64s = cpu_to_le16((u64 *) k - (u64 *) i->start); + set_btree_bset_end(b, t); + } + } + + /* + * Always rebuild search trees: eytzinger search tree nodes directly + * depend on the values of min/max key: + */ + bch2_bset_set_no_aux_tree(b, b->set); + bch2_btree_build_aux_trees(b); + + for_each_btree_node_key_unpack(b, k, &iter, &unpacked) { + BUG_ON(bpos_lt(k.k->p, b->data->min_key)); + BUG_ON(bpos_gt(k.k->p, b->data->max_key)); + } +} + +static int validate_bset(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, struct bset *i, + unsigned offset, unsigned sectors, + int write, bool have_retry, bool *saw_error) +{ + unsigned version = le16_to_cpu(i->version); + const char *err; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret = 0; + + btree_err_on(!bch2_version_compatible(version), + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, + "unsupported bset version %u.%u", + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version)); + + if (btree_err_on(version < c->sb.version_min, + BTREE_ERR_FIXABLE, c, NULL, b, i, + "bset version %u older than superblock version_min %u", + version, 
c->sb.version_min)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version_min = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + if (btree_err_on(BCH_VERSION_MAJOR(version) > + BCH_VERSION_MAJOR(c->sb.version), + BTREE_ERR_FIXABLE, c, NULL, b, i, + "bset version %u newer than superblock version %u", + version, c->sb.version)) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->version = cpu_to_le16(version); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + btree_err_on(BSET_SEPARATE_WHITEOUTS(i), + BTREE_ERR_INCOMPATIBLE, c, ca, b, i, + "BSET_SEPARATE_WHITEOUTS no longer supported"); + + if (btree_err_on(offset + sectors > btree_sectors(c), + BTREE_ERR_FIXABLE, c, ca, b, i, + "bset past end of btree node")) { + i->u64s = 0; + ret = 0; + goto out; + } + + btree_err_on(offset && !i->u64s, + BTREE_ERR_FIXABLE, c, ca, b, i, + "empty bset"); + + btree_err_on(BSET_OFFSET(i) && + BSET_OFFSET(i) != offset, + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "bset at wrong sector offset"); + + if (!offset) { + struct btree_node *bn = + container_of(i, struct btree_node, keys); + /* These indicate that we read the wrong btree node: */ + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + /* XXX endianness */ + btree_err_on(bp->seq != bn->keys.seq, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect sequence number (wrong btree node)"); + } + + btree_err_on(BTREE_NODE_ID(bn) != b->c.btree_id, + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect btree id"); + + btree_err_on(BTREE_NODE_LEVEL(bn) != b->c.level, + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect level"); + + if (!write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + if (BTREE_PTR_RANGE_UPDATED(bp)) { + b->data->min_key = bp->min_key; + b->data->max_key = b->key.k.p; + } + + btree_err_on(!bpos_eq(b->data->min_key, bp->min_key), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "incorrect min_key: got %s should be %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->min_key), buf1.buf), + (printbuf_reset(&buf2), + bch2_bpos_to_text(&buf2, bp->min_key), buf2.buf)); + } + + btree_err_on(!bpos_eq(bn->max_key, b->key.k.p), + BTREE_ERR_MUST_RETRY, c, ca, b, i, + "incorrect max key %s", + (printbuf_reset(&buf1), + bch2_bpos_to_text(&buf1, bn->max_key), buf1.buf)); + + if (write) + compat_btree_node(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, bn); + + err = bch2_bkey_format_validate(&bn->format); + btree_err_on(err, + BTREE_ERR_BAD_NODE, c, ca, b, i, + "invalid bkey format: %s", err); + + compat_bformat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &bn->format); + } +out: +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +static int bset_key_invalid(struct bch_fs *c, struct btree *b, + struct bkey_s_c k, + bool updated_range, int rw, + struct printbuf *err) +{ + return __bch2_bkey_invalid(c, k, btree_node_type(b), READ, err) ?: + (!updated_range ? bch2_bkey_in_btree_node(b, k, err) : 0) ?: + (rw == WRITE ? 
bch2_bkey_val_invalid(c, k, READ, err) : 0); +} + +static int validate_bset_keys(struct bch_fs *c, struct btree *b, + struct bset *i, int write, + bool have_retry, bool *saw_error) +{ + unsigned version = le16_to_cpu(i->version); + struct bkey_packed *k, *prev = NULL; + struct printbuf buf = PRINTBUF; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + int ret = 0; + + for (k = i->start; + k != vstruct_last(i);) { + struct bkey_s u; + struct bkey tmp; + + if (btree_err_on(bkey_p_next(k) > vstruct_last(i), + BTREE_ERR_FIXABLE, c, NULL, b, i, + "key extends past end of bset")) { + i->u64s = cpu_to_le16((u64 *) k - i->_data); + break; + } + + if (btree_err_on(k->format > KEY_FORMAT_CURRENT, + BTREE_ERR_FIXABLE, c, NULL, b, i, + "invalid bkey format %u", k->format)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } + + /* XXX: validate k->u64s */ + if (!write) + bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); + + u = __bkey_disassemble(b, k, &tmp); + + printbuf_reset(&buf); + if (bset_key_invalid(c, b, u.s_c, updated_range, write, &buf)) { + printbuf_reset(&buf); + prt_printf(&buf, "invalid bkey: "); + bset_key_invalid(c, b, u.s_c, updated_range, write, &buf); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } + + if (write) + bch2_bkey_compat(b->c.level, b->c.btree_id, version, + BSET_BIG_ENDIAN(i), write, + &b->format, k); + + if (prev && bkey_iter_cmp(b, prev, k) > 0) { + struct bkey up = bkey_unpack_key(b, prev); + + printbuf_reset(&buf); + prt_printf(&buf, "keys out of order: "); + bch2_bkey_to_text(&buf, &up); + prt_printf(&buf, " > "); + bch2_bkey_to_text(&buf, u.k); + + bch2_dump_bset(c, b, i, 0); + + if (btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf)) { + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + continue; + } + } + + prev = k; + k = bkey_p_next(k); + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, + struct btree *b, bool have_retry, bool *saw_error) +{ + struct btree_node_entry *bne; + struct sort_iter *iter; + struct btree_node *sorted; + struct bkey_packed *k; + struct bch_extent_ptr *ptr; + struct bset *i; + bool used_mempool, blacklisted; + bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && + BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); + unsigned u64s; + unsigned blacklisted_written, nonblacklisted_written = 0; + unsigned ptr_written = btree_ptr_sectors_written(&b->key); + struct printbuf buf = PRINTBUF; + int ret = 0, retry_read = 0, write = READ; + + b->version_ondisk = U16_MAX; + /* We might get called multiple times on read retry: */ + b->written = 0; + + iter = mempool_alloc(&c->fill_iter, GFP_NOFS); + sort_iter_init(iter, b); + iter->size = (btree_blocks(c) + 1) * 2; + + if (bch2_meta_read_fault("btree")) + btree_err(BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "dynamic fault"); + + btree_err_on(le64_to_cpu(b->data->magic) != bset_magic(c), + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "bad magic: want %llx, got 
%llx", + bset_magic(c), le64_to_cpu(b->data->magic)); + + btree_err_on(!b->data->keys.seq, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "bad btree header: seq 0"); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *bp = + &bkey_i_to_btree_ptr_v2(&b->key)->v; + + btree_err_on(b->data->keys.seq != bp->seq, + BTREE_ERR_MUST_RETRY, c, ca, b, NULL, + "got wrong btree node (seq %llx want %llx)", + b->data->keys.seq, bp->seq); + } + + while (b->written < (ptr_written ?: btree_sectors(c))) { + unsigned sectors; + struct nonce nonce; + struct bch_csum csum; + bool first = !b->written; + + if (!b->written) { + i = &b->data->keys; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); + + nonce = btree_nonce(i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data); + + btree_err_on(bch2_crc_cmp(csum, b->data->csum), + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i", ret)) + goto fsck_err; + + btree_err_on(btree_node_type_is_extents(btree_node_type(b)) && + !BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data), + BTREE_ERR_INCOMPATIBLE, c, NULL, b, NULL, + "btree node does not have NEW_EXTENT_OVERWRITE set"); + + sectors = vstruct_sectors(b->data, c->block_bits); + } else { + bne = write_block(b); + i = &bne->keys; + + if (i->seq != b->data->keys.seq) + break; + + btree_err_on(!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)), + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "unknown checksum type %llu", + BSET_CSUM_TYPE(i)); + + nonce = btree_nonce(i, b->written << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + btree_err_on(bch2_crc_cmp(csum, bne->csum), + BTREE_ERR_WANT_RETRY, c, ca, b, i, + "invalid checksum"); + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting btree node: %i\n", ret)) + goto fsck_err; + + sectors = vstruct_sectors(bne, c->block_bits); + } + + b->version_ondisk = min(b->version_ondisk, + le16_to_cpu(i->version)); + + ret = validate_bset(c, ca, b, i, b->written, sectors, + READ, have_retry, saw_error); + if (ret) + goto fsck_err; + + if (!b->written) + btree_node_set_format(b, b->data->format); + + ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); + if (ret) + goto fsck_err; + + SET_BSET_BIG_ENDIAN(i, CPU_BIG_ENDIAN); + + blacklisted = bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(i->journal_seq), + true); + + btree_err_on(blacklisted && first, + BTREE_ERR_FIXABLE, c, ca, b, i, + "first btree node bset has blacklisted journal seq (%llu)", + le64_to_cpu(i->journal_seq)); + + btree_err_on(blacklisted && ptr_written, + BTREE_ERR_FIXABLE, c, ca, b, i, + "found blacklisted bset (journal seq %llu) in btree node at offset %u-%u/%u", + le64_to_cpu(i->journal_seq), + b->written, b->written + sectors, ptr_written); + + b->written += sectors; + + if (blacklisted && !first) + continue; + + sort_iter_add(iter, + vstruct_idx(i, 0), + vstruct_last(i)); + + nonblacklisted_written = b->written; + } + + if (ptr_written) { + btree_err_on(b->written < ptr_written, + BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + "btree node data missing: expected %u sectors, found %u", + ptr_written, b->written); + } else { + for (bne = write_block(b); + bset_byte_offset(b, bne) < btree_bytes(c); + bne = (void *) bne + block_bytes(c)) + btree_err_on(bne->keys.seq == b->data->keys.seq && + 
!bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), + true), + BTREE_ERR_WANT_RETRY, c, ca, b, NULL, + "found bset signature after last bset"); + + /* + * Blacklisted bsets are those that were written after the most recent + * (flush) journal write. Since there wasn't a flush, they may not have + * made it to all devices - which means we shouldn't write new bsets + * after them, as that could leave a gap and then reads from that device + * wouldn't find all the bsets in that btree node - which means it's + * important that we start writing new bsets after the most recent _non_ + * blacklisted bset: + */ + blacklisted_written = b->written; + b->written = nonblacklisted_written; + } + + sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool); + sorted->keys.u64s = 0; + + set_btree_bset(b, b->set, &b->data->keys); + + b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); + + u64s = le16_to_cpu(sorted->keys.u64s); + *sorted = *b->data; + sorted->keys.u64s = cpu_to_le16(u64s); + swap(sorted, b->data); + set_btree_bset(b, b->set, &b->data->keys); + b->nsets = 1; + + BUG_ON(b->nr.live_u64s != u64s); + + btree_bounce_free(c, btree_bytes(c), used_mempool, sorted); + + if (updated_range) + bch2_btree_node_drop_keys_outside_node(b); + + i = &b->data->keys; + for (k = i->start; k != vstruct_last(i);) { + struct bkey tmp; + struct bkey_s u = __bkey_disassemble(b, k, &tmp); + + printbuf_reset(&buf); + + if (bch2_bkey_val_invalid(c, u.s_c, READ, &buf) || + (bch2_inject_invalid_keys && + !bversion_cmp(u.k->version, MAX_VERSION))) { + printbuf_reset(&buf); + + prt_printf(&buf, "invalid bkey: "); + bch2_bkey_val_invalid(c, u.s_c, READ, &buf); + prt_printf(&buf, "\n "); + bch2_bkey_val_to_text(&buf, c, u.s_c); + + btree_err(BTREE_ERR_FIXABLE, c, NULL, b, i, "%s", buf.buf); + + btree_keys_account_key_drop(&b->nr, 0, k); + + i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s); + memmove_u64s_down(k, bkey_p_next(k), + (u64 *) vstruct_end(i) - (u64 *) k); + set_btree_bset_end(b, b->set); + continue; + } + + if (u.k->type == KEY_TYPE_btree_ptr_v2) { + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(u); + + bp.v->mem_ptr = 0; + } + + k = bkey_p_next(k); + } + + bch2_bset_build_aux_tree(b, b->set, false); + + set_needs_whiteout(btree_bset_first(b), true); + + btree_node_reset_sib_u64s(b); + + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + if (ca->mi.state != BCH_MEMBER_STATE_rw) + set_btree_node_need_rewrite(b); + } + + if (!ptr_written) + set_btree_node_need_rewrite(b); +out: + mempool_free(iter, &c->fill_iter); + printbuf_exit(&buf); + return retry_read; +fsck_err: + if (ret == BTREE_RETRY_READ) + retry_read = 1; + else + set_btree_node_read_error(b); + goto out; +} + +static void btree_node_read_work(struct work_struct *work) +{ + struct btree_read_bio *rb = + container_of(work, struct btree_read_bio, work); + struct bch_fs *c = rb->c; + struct btree *b = rb->b; + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + struct bio *bio = &rb->bio; + struct bch_io_failures failed = { .nr = 0 }; + struct printbuf buf = PRINTBUF; + bool saw_error = false; + bool retry = false; + bool can_retry; + + goto start; + while (1) { + retry = true; + bch_info(c, "retrying read"); + ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = rb->pick.ptr.offset; + 
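+		/*
+		 * Control-flow note: the first pass jumps straight to the
+		 * "start" label below, because the initial read has already
+		 * completed (or failed) by the time this runs - see
+		 * bch2_btree_node_read().  The bio is only rebuilt and
+		 * resubmitted here on retry, rereading the whole node from the
+		 * replica bch2_bkey_pick_read_device() picked at the end of the
+		 * previous iteration.
+		 */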
bio->bi_iter.bi_size = btree_bytes(c); + + if (rb->have_ioref) { + bio_set_dev(bio, ca->disk_sb.bdev); + submit_bio_wait(bio); + } else { + bio->bi_status = BLK_STS_REMOVED; + } +start: + printbuf_reset(&buf); + btree_pos_to_text(&buf, c, b); + bch2_dev_io_err_on(bio->bi_status, ca, "btree read error %s for %s", + bch2_blk_status_to_str(bio->bi_status), buf.buf); + if (rb->have_ioref) + percpu_ref_put(&ca->io_ref); + rb->have_ioref = false; + + bch2_mark_io_failure(&failed, &rb->pick); + + can_retry = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + &failed, &rb->pick) > 0; + + if (!bio->bi_status && + !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { + if (retry) + bch_info(c, "retry success"); + break; + } + + saw_error = true; + + if (!can_retry) { + set_btree_node_read_error(b); + break; + } + } + + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); + printbuf_exit(&buf); + + if (saw_error && !btree_node_read_error(b)) { + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, b->key.k.p); + bch_info(c, "%s: rewriting btree node at btree=%s level=%u %s due to error", + __func__, bch2_btree_ids[b->c.btree_id], b->c.level, buf.buf); + printbuf_exit(&buf); + + bch2_btree_node_rewrite_async(c, b); + } + + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); +} + +static void btree_node_read_endio(struct bio *bio) +{ + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + + bch2_latency_acct(ca, rb->start_time, READ); + } + + queue_work(c->io_complete_wq, &rb->work); +} + +struct btree_node_read_all { + struct closure cl; + struct bch_fs *c; + struct btree *b; + unsigned nr; + void *buf[BCH_REPLICAS_MAX]; + struct bio *bio[BCH_REPLICAS_MAX]; + blk_status_t err[BCH_REPLICAS_MAX]; +}; + +static unsigned btree_node_sectors_written(struct bch_fs *c, void *data) +{ + struct btree_node *bn = data; + struct btree_node_entry *bne; + unsigned offset = 0; + + if (le64_to_cpu(bn->magic) != bset_magic(c)) + return 0; + + while (offset < btree_sectors(c)) { + if (!offset) { + offset += vstruct_sectors(bn, c->block_bits); + } else { + bne = data + (offset << 9); + if (bne->keys.seq != bn->keys.seq) + break; + offset += vstruct_sectors(bne, c->block_bits); + } + } + + return offset; +} + +static bool btree_node_has_extra_bsets(struct bch_fs *c, unsigned offset, void *data) +{ + struct btree_node *bn = data; + struct btree_node_entry *bne; + + if (!offset) + return false; + + while (offset < btree_sectors(c)) { + bne = data + (offset << 9); + if (bne->keys.seq == bn->keys.seq) + return true; + offset++; + } + + return false; + return offset; +} + +static void btree_node_read_all_replicas_done(struct closure *cl) +{ + struct btree_node_read_all *ra = + container_of(cl, struct btree_node_read_all, cl); + struct bch_fs *c = ra->c; + struct btree *b = ra->b; + struct printbuf buf = PRINTBUF; + bool dump_bset_maps = false; + bool have_retry = false; + int ret = 0, best = -1, write = READ; + unsigned i, written = 0, written2 = 0; + __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 + ? 
bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; + bool _saw_error = false, *saw_error = &_saw_error; + + for (i = 0; i < ra->nr; i++) { + struct btree_node *bn = ra->buf[i]; + + if (ra->err[i]) + continue; + + if (le64_to_cpu(bn->magic) != bset_magic(c) || + (seq && seq != bn->keys.seq)) + continue; + + if (best < 0) { + best = i; + written = btree_node_sectors_written(c, bn); + continue; + } + + written2 = btree_node_sectors_written(c, ra->buf[i]); + if (btree_err_on(written2 != written, BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "btree node sectors written mismatch: %u != %u", + written, written2) || + btree_err_on(btree_node_has_extra_bsets(c, written2, ra->buf[i]), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "found bset signature after last bset") || + btree_err_on(memcmp(ra->buf[best], ra->buf[i], written << 9), + BTREE_ERR_FIXABLE, c, NULL, b, NULL, + "btree node replicas content mismatch")) + dump_bset_maps = true; + + if (written2 > written) { + written = written2; + best = i; + } + } +fsck_err: + if (dump_bset_maps) { + for (i = 0; i < ra->nr; i++) { + struct btree_node *bn = ra->buf[i]; + struct btree_node_entry *bne = NULL; + unsigned offset = 0, sectors; + bool gap = false; + + if (ra->err[i]) + continue; + + printbuf_reset(&buf); + + while (offset < btree_sectors(c)) { + if (!offset) { + sectors = vstruct_sectors(bn, c->block_bits); + } else { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq != bn->keys.seq) + break; + sectors = vstruct_sectors(bne, c->block_bits); + } + + prt_printf(&buf, " %u-%u", offset, offset + sectors); + if (bne && bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + prt_printf(&buf, "*"); + offset += sectors; + } + + while (offset < btree_sectors(c)) { + bne = ra->buf[i] + (offset << 9); + if (bne->keys.seq == bn->keys.seq) { + if (!gap) + prt_printf(&buf, " GAP"); + gap = true; + + sectors = vstruct_sectors(bne, c->block_bits); + prt_printf(&buf, " %u-%u", offset, offset + sectors); + if (bch2_journal_seq_is_blacklisted(c, + le64_to_cpu(bne->keys.journal_seq), false)) + prt_printf(&buf, "*"); + } + offset++; + } + + bch_err(c, "replica %u:%s", i, buf.buf); + } + } + + if (best >= 0) { + memcpy(b->data, ra->buf[best], btree_bytes(c)); + ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); + } else { + ret = -1; + } + + if (ret) + set_btree_node_read_error(b); + else if (*saw_error) + bch2_btree_node_rewrite_async(c, b); + + for (i = 0; i < ra->nr; i++) { + mempool_free(ra->buf[i], &c->btree_bounce_pool); + bio_put(ra->bio[i]); + } + + closure_debug_destroy(&ra->cl); + kfree(ra); + printbuf_exit(&buf); + + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); +} + +static void btree_node_read_all_replicas_endio(struct bio *bio) +{ + struct btree_read_bio *rb = + container_of(bio, struct btree_read_bio, bio); + struct bch_fs *c = rb->c; + struct btree_node_read_all *ra = rb->ra; + + if (rb->have_ioref) { + struct bch_dev *ca = bch_dev_bkey_exists(c, rb->pick.ptr.dev); + + bch2_latency_acct(ca, rb->start_time, READ); + } + + ra->err[rb->idx] = bio->bi_status; + closure_put(&ra->cl); +} + +/* + * XXX This allocates multiple times from the same mempools, and can deadlock + * under sufficient memory pressure (but is only a debug path) + */ +static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool sync) +{ + struct bkey_s_c k = bkey_i_to_s_c(&b->key); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded 
pick; + struct btree_node_read_all *ra; + unsigned i; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; + + closure_init(&ra->cl, NULL); + ra->c = c; + ra->b = b; + ra->nr = bch2_bkey_nr_ptrs(k); + + for (i = 0; i < ra->nr; i++) { + ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS); + ra->bio[i] = bio_alloc_bioset(NULL, + buf_pages(ra->buf[i], btree_bytes(c)), + REQ_OP_READ|REQ_SYNC|REQ_META, + GFP_NOFS, + &c->btree_bio); + } + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { + struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct btree_read_bio *rb = + container_of(ra->bio[i], struct btree_read_bio, bio); + rb->c = c; + rb->b = b; + rb->ra = ra; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->idx = i; + rb->pick = pick; + rb->bio.bi_iter.bi_sector = pick.ptr.offset; + rb->bio.bi_end_io = btree_node_read_all_replicas_endio; + bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c)); + + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(&rb->bio)); + bio_set_dev(&rb->bio, ca->disk_sb.bdev); + + closure_get(&ra->cl); + submit_bio(&rb->bio); + } else { + ra->err[i] = BLK_STS_REMOVED; + } + + i++; + } + + if (sync) { + closure_sync(&ra->cl); + btree_node_read_all_replicas_done(&ra->cl); + } else { + continue_at(&ra->cl, btree_node_read_all_replicas_done, + c->io_complete_wq); + } + + return 0; +} + +void bch2_btree_node_read(struct bch_fs *c, struct btree *b, + bool sync) +{ + struct extent_ptr_decoded pick; + struct btree_read_bio *rb; + struct bch_dev *ca; + struct bio *bio; + int ret; + + trace_and_count(c, btree_node_read, c, b); + + if (bch2_verify_all_btree_replicas && + !btree_node_read_all_replicas(c, b, sync)) + return; + + ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), + NULL, &pick); + + if (ret <= 0) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "btree node read error: no device to read from\n at "); + btree_pos_to_text(&buf, c, b); + bch_err(c, "%s", buf.buf); + + if (test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) + bch2_fatal_error(c); + + set_btree_node_read_error(b); + clear_btree_node_read_in_flight(b); + wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); + printbuf_exit(&buf); + return; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + bio = bio_alloc_bioset(NULL, + buf_pages(b->data, btree_bytes(c)), + REQ_OP_READ|REQ_SYNC|REQ_META, + GFP_NOFS, + &c->btree_bio); + rb = container_of(bio, struct btree_read_bio, bio); + rb->c = c; + rb->b = b; + rb->ra = NULL; + rb->start_time = local_clock(); + rb->have_ioref = bch2_dev_get_ioref(ca, READ); + rb->pick = pick; + INIT_WORK(&rb->work, btree_node_read_work); + bio->bi_iter.bi_sector = pick.ptr.offset; + bio->bi_end_io = btree_node_read_endio; + bch2_bio_map(bio, b->data, btree_bytes(c)); + + if (rb->have_ioref) { + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], + bio_sectors(bio)); + bio_set_dev(bio, ca->disk_sb.bdev); + + if (sync) { + submit_bio_wait(bio); + + btree_node_read_work(&rb->work); + } else { + submit_bio(bio); + } + } else { + bio->bi_status = BLK_STS_REMOVED; + + if (sync) + btree_node_read_work(&rb->work); + else + queue_work(c->io_complete_wq, &rb->work); + } +} + +static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, + const struct bkey_i *k, unsigned level) +{ + struct bch_fs *c = trans->c; + struct closure cl; + struct btree *b; + int ret; + + closure_init_stack(&cl); + + do { 
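+		/*
+		 * Roughly, the usual cannibalize-lock pattern: if another task
+		 * holds the btree cache's cannibalize lock,
+		 * bch2_btree_cache_cannibalize_lock() puts our closure on a
+		 * waitlist and returns nonzero; closure_sync() then waits for
+		 * the wakeup that comes when the lock is released, and we
+		 * retry.
+		 */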
+ ret = bch2_btree_cache_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + + b = bch2_btree_node_mem_alloc(trans, level != 0); + bch2_btree_cache_cannibalize_unlock(c); + + BUG_ON(IS_ERR(b)); + + bkey_copy(&b->key, k); + BUG_ON(bch2_btree_node_hash_insert(&c->btree_cache, b, level, id)); + + set_btree_node_read_in_flight(b); + + bch2_btree_node_read(c, b, true); + + if (btree_node_read_error(b)) { + bch2_btree_node_hash_remove(&c->btree_cache, b); + + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + ret = -EIO; + goto err; + } + + bch2_btree_set_root_for_read(c, b); +err: + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + + return ret; +} + +int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, + const struct bkey_i *k, unsigned level) +{ + return bch2_trans_run(c, __bch2_btree_root_read(&trans, id, k, level)); + +} + +void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, + struct btree_write *w) +{ + unsigned long old, new, v = READ_ONCE(b->will_make_reachable); + + do { + old = new = v; + if (!(old & 1)) + break; + + new &= ~1UL; + } while ((v = cmpxchg(&b->will_make_reachable, old, new)) != old); + + if (old & 1) + closure_put(&((struct btree_update *) new)->cl); + + bch2_journal_pin_drop(&c->journal, &w->journal); +} + +static void __btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + struct btree_write *w = btree_prev_write(b); + unsigned long old, new, v; + unsigned type = 0; + + bch2_btree_complete_write(c, b, w); + + v = READ_ONCE(b->flags); + do { + old = new = v; + + if ((old & (1U << BTREE_NODE_dirty)) && + (old & (1U << BTREE_NODE_need_write)) && + !(old & (1U << BTREE_NODE_never_write)) && + !(old & (1U << BTREE_NODE_write_blocked)) && + !(old & (1U << BTREE_NODE_will_make_reachable))) { + new &= ~(1U << BTREE_NODE_dirty); + new &= ~(1U << BTREE_NODE_need_write); + new |= (1U << BTREE_NODE_write_in_flight); + new |= (1U << BTREE_NODE_write_in_flight_inner); + new |= (1U << BTREE_NODE_just_written); + new ^= (1U << BTREE_NODE_write_idx); + + type = new & BTREE_WRITE_TYPE_MASK; + new &= ~BTREE_WRITE_TYPE_MASK; + } else { + new &= ~(1U << BTREE_NODE_write_in_flight); + new &= ~(1U << BTREE_NODE_write_in_flight_inner); + } + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + if (new & (1U << BTREE_NODE_write_in_flight)) + __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); + else + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); +} + +static void btree_node_write_done(struct bch_fs *c, struct btree *b) +{ + struct btree_trans trans; + + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + __btree_node_write_done(c, b); + six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); +} + +static void btree_node_write_work(struct work_struct *work) +{ + struct btree_write_bio *wbio = + container_of(work, struct btree_write_bio, work); + struct bch_fs *c = wbio->wbio.c; + struct btree *b = wbio->wbio.bio.bi_private; + struct bch_extent_ptr *ptr; + int ret = 0; + + btree_bounce_free(c, + wbio->data_bytes, + wbio->wbio.used_mempool, + wbio->data); + + bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr, + bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); + + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) + goto err; + + if (wbio->wbio.first_btree_write) { + if (wbio->wbio.failed.nr) { + + } + } else { + ret = bch2_trans_do(c, NULL, NULL, 0, + 
bch2_btree_node_update_key_get_iter(&trans, b, &wbio->key, + BCH_WATERMARK_reclaim| + BTREE_INSERT_JOURNAL_RECLAIM| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW, + !wbio->wbio.failed.nr)); + if (ret) + goto err; + } +out: + bio_put(&wbio->wbio.bio); + btree_node_write_done(c, b); + return; +err: + set_btree_node_noevict(b); + if (!bch2_err_matches(ret, EROFS)) + bch2_fs_fatal_error(c, "fatal error writing btree node: %s", bch2_err_str(ret)); + goto out; +} + +static void btree_node_write_endio(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? wbio->parent : NULL; + struct bch_write_bio *orig = parent ?: wbio; + struct btree_write_bio *wb = container_of(orig, struct btree_write_bio, wbio); + struct bch_fs *c = wbio->c; + struct btree *b = wbio->bio.bi_private; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + unsigned long flags; + + if (wbio->have_ioref) + bch2_latency_acct(ca, wbio->submit_time, WRITE); + + if (bch2_dev_io_err_on(bio->bi_status, ca, "btree write error: %s", + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("btree")) { + spin_lock_irqsave(&c->btree_write_error_lock, flags); + bch2_dev_list_add_dev(&orig->failed, wbio->dev); + spin_unlock_irqrestore(&c->btree_write_error_lock, flags); + } + + if (wbio->have_ioref) + percpu_ref_put(&ca->io_ref); + + if (parent) { + bio_put(bio); + bio_endio(&parent->bio); + return; + } + + clear_btree_node_write_in_flight_inner(b); + wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); + INIT_WORK(&wb->work, btree_node_write_work); + queue_work(c->btree_io_complete_wq, &wb->work); +} + +static int validate_bset_for_write(struct bch_fs *c, struct btree *b, + struct bset *i, unsigned sectors) +{ + struct printbuf buf = PRINTBUF; + bool saw_error; + int ret; + + ret = bch2_bkey_invalid(c, bkey_i_to_s_c(&b->key), + BKEY_TYPE_btree, WRITE, &buf); + + if (ret) + bch2_fs_inconsistent(c, "invalid btree node key before write: %s", buf.buf); + printbuf_exit(&buf); + if (ret) + return ret; + + ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); + if (ret) { + bch2_inconsistent_error(c); + dump_stack(); + } + + return ret; +} + +static void btree_write_submit(struct work_struct *work) +{ + struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work); + struct bch_extent_ptr *ptr; + BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + + bkey_copy(&tmp.k, &wbio->key); + + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&tmp.k)), ptr) + ptr->offset += wbio->sector_offset; + + bch2_submit_wbio_replicas(&wbio->wbio, wbio->wbio.c, BCH_DATA_btree, + &tmp.k, false); +} + +void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) +{ + struct btree_write_bio *wbio; + struct bset_tree *t; + struct bset *i; + struct btree_node *bn = NULL; + struct btree_node_entry *bne = NULL; + struct sort_iter sort_iter; + struct nonce nonce; + unsigned bytes_to_write, sectors_to_write, bytes, u64s; + u64 seq = 0; + bool used_mempool; + unsigned long old, new; + bool validate_before_checksum = false; + enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; + void *data; + int ret; + + if (flags & BTREE_WRITE_ALREADY_STARTED) + goto do_write; + + /* + * We may only have a read lock on the btree node - the dirty bit is our + * "lock" against racing with other threads that may be trying to start + * a write, we do a write iff we clear the dirty 
bit. Since setting the + * dirty bit requires a write lock, we can't race with other threads + * redirtying it: + */ + do { + old = new = READ_ONCE(b->flags); + + if (!(old & (1 << BTREE_NODE_dirty))) + return; + + if ((flags & BTREE_WRITE_ONLY_IF_NEED) && + !(old & (1 << BTREE_NODE_need_write))) + return; + + if (old & + ((1 << BTREE_NODE_never_write)| + (1 << BTREE_NODE_write_blocked))) + return; + + if (b->written && + (old & (1 << BTREE_NODE_will_make_reachable))) + return; + + if (old & (1 << BTREE_NODE_write_in_flight)) + return; + + if (flags & BTREE_WRITE_ONLY_IF_NEED) + type = new & BTREE_WRITE_TYPE_MASK; + new &= ~BTREE_WRITE_TYPE_MASK; + + new &= ~(1 << BTREE_NODE_dirty); + new &= ~(1 << BTREE_NODE_need_write); + new |= (1 << BTREE_NODE_write_in_flight); + new |= (1 << BTREE_NODE_write_in_flight_inner); + new |= (1 << BTREE_NODE_just_written); + new ^= (1 << BTREE_NODE_write_idx); + } while (cmpxchg_acquire(&b->flags, old, new) != old); + + if (new & (1U << BTREE_NODE_need_write)) + return; +do_write: + BUG_ON((type == BTREE_WRITE_initial) != (b->written == 0)); + + atomic_dec(&c->btree_cache.dirty); + + BUG_ON(btree_node_fake(b)); + BUG_ON((b->will_make_reachable != 0) != !b->written); + + BUG_ON(b->written >= btree_sectors(c)); + BUG_ON(b->written & (block_sectors(c) - 1)); + BUG_ON(bset_written(b, btree_bset_last(b))); + BUG_ON(le64_to_cpu(b->data->magic) != bset_magic(c)); + BUG_ON(memcmp(&b->data->format, &b->format, sizeof(b->format))); + + bch2_sort_whiteouts(c, b); + + sort_iter_init(&sort_iter, b); + + bytes = !b->written + ? sizeof(struct btree_node) + : sizeof(struct btree_node_entry); + + bytes += b->whiteout_u64s * sizeof(u64); + + for_each_bset(b, t) { + i = bset(b, t); + + if (bset_written(b, i)) + continue; + + bytes += le16_to_cpu(i->u64s) * sizeof(u64); + sort_iter_add(&sort_iter, + btree_bkey_first(b, t), + btree_bkey_last(b, t)); + seq = max(seq, le64_to_cpu(i->journal_seq)); + } + + BUG_ON(b->written && !seq); + + /* bch2_varint_decode may read up to 7 bytes past the end of the buffer: */ + bytes += 8; + + /* buffer must be a multiple of the block size */ + bytes = round_up(bytes, block_bytes(c)); + + data = btree_bounce_alloc(c, bytes, &used_mempool); + + if (!b->written) { + bn = data; + *bn = *b->data; + i = &bn->keys; + } else { + bne = data; + bne->keys = b->data->keys; + i = &bne->keys; + } + + i->journal_seq = cpu_to_le64(seq); + i->u64s = 0; + + sort_iter_add(&sort_iter, + unwritten_whiteouts_start(c, b), + unwritten_whiteouts_end(c, b)); + SET_BSET_SEPARATE_WHITEOUTS(i, false); + + b->whiteout_u64s = 0; + + u64s = bch2_sort_keys(i->start, &sort_iter, false); + le16_add_cpu(&i->u64s, u64s); + + BUG_ON(!b->written && i->u64s != b->data->keys.u64s); + + set_needs_whiteout(i, false); + + /* do we have data to write? 
*/ + if (b->written && !i->u64s) + goto nowrite; + + bytes_to_write = vstruct_end(i) - data; + sectors_to_write = round_up(bytes_to_write, block_bytes(c)) >> 9; + + if (!b->written && + b->key.k.type == KEY_TYPE_btree_ptr_v2) + BUG_ON(btree_ptr_sectors_written(&b->key) != sectors_to_write); + + memset(data + bytes_to_write, 0, + (sectors_to_write << 9) - bytes_to_write); + + BUG_ON(b->written + sectors_to_write > btree_sectors(c)); + BUG_ON(BSET_BIG_ENDIAN(i) != CPU_BIG_ENDIAN); + BUG_ON(i->seq != b->data->keys.seq); + + i->version = cpu_to_le16(c->sb.version); + SET_BSET_OFFSET(i, b->written); + SET_BSET_CSUM_TYPE(i, bch2_meta_checksum_type(c)); + + if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(i))) + validate_before_checksum = true; + + /* validate_bset will be modifying: */ + if (le16_to_cpu(i->version) < bcachefs_metadata_version_current) + validate_before_checksum = true; + + /* if we're going to be encrypting, check metadata validity first: */ + if (validate_before_checksum && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + + ret = bset_encrypt(c, i, b->written << 9); + if (bch2_fs_fatal_err_on(ret, c, + "error encrypting btree node: %i\n", ret)) + goto err; + + nonce = btree_nonce(i, b->written << 9); + + if (bn) + bn->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bn); + else + bne->csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + /* if we're not encrypting, check metadata after checksumming: */ + if (!validate_before_checksum && + validate_bset_for_write(c, b, i, sectors_to_write)) + goto err; + + /* + * We handle btree write errors by immediately halting the journal - + * after we've done that, we can't issue any subsequent btree writes + * because they might have pointers to new nodes that failed to write. 
+ * + * Furthermore, there's no point in doing any more btree writes because + * with the journal stopped, we're never going to update the journal to + * reflect that those writes were done and the data flushed from the + * journal: + * + * Also on journal error, the pending write may have updates that were + * never journalled (interior nodes, see btree_update_nodes_written()) - + * it's critical that we don't do the write in that case otherwise we + * will have updates visible that weren't in the journal: + * + * Make sure to update b->written so bch2_btree_init_next() doesn't + * break: + */ + if (bch2_journal_error(&c->journal) || + c->opts.nochanges) + goto err; + + trace_and_count(c, btree_node_write, b, bytes_to_write, sectors_to_write); + + wbio = container_of(bio_alloc_bioset(NULL, + buf_pages(data, sectors_to_write << 9), + REQ_OP_WRITE|REQ_META, + GFP_NOFS, + &c->btree_bio), + struct btree_write_bio, wbio.bio); + wbio_init(&wbio->wbio.bio); + wbio->data = data; + wbio->data_bytes = bytes; + wbio->sector_offset = b->written; + wbio->wbio.c = c; + wbio->wbio.used_mempool = used_mempool; + wbio->wbio.first_btree_write = !b->written; + wbio->wbio.bio.bi_end_io = btree_node_write_endio; + wbio->wbio.bio.bi_private = b; + + bch2_bio_map(&wbio->wbio.bio, data, sectors_to_write << 9); + + bkey_copy(&wbio->key, &b->key); + + b->written += sectors_to_write; + + if (wbio->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&wbio->key)->v.sectors_written = + cpu_to_le16(b->written); + + atomic64_inc(&c->btree_write_stats[type].nr); + atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); + + INIT_WORK(&wbio->work, btree_write_submit); + queue_work(c->io_complete_wq, &wbio->work); + return; +err: + set_btree_node_noevict(b); + b->written += sectors_to_write; +nowrite: + btree_bounce_free(c, bytes, used_mempool, data); + __btree_node_write_done(c, b); +} + +/* + * Work that must be done with write lock held: + */ +bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b) +{ + bool invalidated_iter = false; + struct btree_node_entry *bne; + struct bset_tree *t; + + if (!btree_node_just_written(b)) + return false; + + BUG_ON(b->whiteout_u64s); + + clear_btree_node_just_written(b); + + /* + * Note: immediately after write, bset_written() doesn't work - the + * amount of data we had to write after compaction might have been + * smaller than the offset of the last bset. 
+ * + * However, we know that all bsets have been written here, as long as + * we're still holding the write lock: + */ + + /* + * XXX: decide if we really want to unconditionally sort down to a + * single bset: + */ + if (b->nsets > 1) { + btree_node_sort(c, b, 0, b->nsets, true); + invalidated_iter = true; + } else { + invalidated_iter = bch2_drop_whiteouts(b, COMPACT_ALL); + } + + for_each_bset(b, t) + set_needs_whiteout(bset(b, t), true); + + bch2_btree_verify(c, b); + + /* + * If later we don't unconditionally sort down to a single bset, we have + * to ensure this is still true: + */ + BUG_ON((void *) btree_bkey_last(b, bset_tree_last(b)) > write_block(b)); + + bne = want_new_bset(c, b); + if (bne) + bch2_bset_init_next(c, b, bne); + + bch2_btree_build_aux_trees(b); + + return invalidated_iter; +} + +/* + * Use this one if the node is intent locked: + */ +void bch2_btree_node_write(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_type_held, + unsigned flags) +{ + if (lock_type_held == SIX_LOCK_intent || + (lock_type_held == SIX_LOCK_read && + six_lock_tryupgrade(&b->c.lock))) { + __bch2_btree_node_write(c, b, flags); + + /* don't cycle lock unnecessarily: */ + if (btree_node_just_written(b) && + six_trylock_write(&b->c.lock)) { + bch2_btree_post_write_cleanup(c, b); + six_unlock_write(&b->c.lock); + } + + if (lock_type_held == SIX_LOCK_read) + six_lock_downgrade(&b->c.lock); + } else { + __bch2_btree_node_write(c, b, flags); + if (lock_type_held == SIX_LOCK_write && + btree_node_just_written(b)) + bch2_btree_post_write_cleanup(c, b); + } +} + +static bool __bch2_btree_flush_all(struct bch_fs *c, unsigned flag) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + unsigned i; + bool ret = false; +restart: + rcu_read_lock(); + for_each_cached_btree(b, c, tbl, i, pos) + if (test_bit(flag, &b->flags)) { + rcu_read_unlock(); + wait_on_bit_io(&b->flags, flag, TASK_UNINTERRUPTIBLE); + ret = true; + goto restart; + } + rcu_read_unlock(); + + return ret; +} + +bool bch2_btree_flush_all_reads(struct bch_fs *c) +{ + return __bch2_btree_flush_all(c, BTREE_NODE_read_in_flight); +} + +bool bch2_btree_flush_all_writes(struct bch_fs *c) +{ + return __bch2_btree_flush_all(c, BTREE_NODE_write_in_flight); +} + +static const char * const bch2_btree_write_types[] = { +#define x(t, n) [n] = #t, + BCH_BTREE_WRITE_TYPES() + NULL +}; + +void bch2_btree_write_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + printbuf_tabstop_push(out, 20); + printbuf_tabstop_push(out, 10); + + prt_tab(out); + prt_str(out, "nr"); + prt_tab(out); + prt_str(out, "size"); + prt_newline(out); + + for (unsigned i = 0; i < BTREE_WRITE_TYPE_NR; i++) { + u64 nr = atomic64_read(&c->btree_write_stats[i].nr); + u64 bytes = atomic64_read(&c->btree_write_stats[i].bytes); + + prt_printf(out, "%s:", bch2_btree_write_types[i]); + prt_tab(out); + prt_u64(out, nr); + prt_tab(out); + prt_human_readable_u64(out, nr ? 
div64_u64(bytes, nr) : 0); + prt_newline(out); + } +} diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h new file mode 100644 index 000000000..0cadf651e --- /dev/null +++ b/fs/bcachefs/btree_io.h @@ -0,0 +1,228 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_IO_H +#define _BCACHEFS_BTREE_IO_H + +#include "bkey_methods.h" +#include "bset.h" +#include "btree_locking.h" +#include "checksum.h" +#include "extents.h" +#include "io_types.h" + +struct bch_fs; +struct btree_write; +struct btree; +struct btree_iter; +struct btree_node_read_all; + +static inline void set_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) +{ + if (!test_and_set_bit(BTREE_NODE_dirty, &b->flags)) + atomic_inc(&c->btree_cache.dirty); +} + +static inline void clear_btree_node_dirty_acct(struct bch_fs *c, struct btree *b) +{ + if (test_and_clear_bit(BTREE_NODE_dirty, &b->flags)) + atomic_dec(&c->btree_cache.dirty); +} + +static inline unsigned btree_ptr_sectors_written(struct bkey_i *k) +{ + return k->k.type == KEY_TYPE_btree_ptr_v2 + ? le16_to_cpu(bkey_i_to_btree_ptr_v2(k)->v.sectors_written) + : 0; +} + +struct btree_read_bio { + struct bch_fs *c; + struct btree *b; + struct btree_node_read_all *ra; + u64 start_time; + unsigned have_ioref:1; + unsigned idx:7; + struct extent_ptr_decoded pick; + struct work_struct work; + struct bio bio; +}; + +struct btree_write_bio { + struct work_struct work; + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + void *data; + unsigned data_bytes; + unsigned sector_offset; + struct bch_write_bio wbio; +}; + +void bch2_btree_node_io_unlock(struct btree *); +void bch2_btree_node_io_lock(struct btree *); +void __bch2_btree_node_wait_on_read(struct btree *); +void __bch2_btree_node_wait_on_write(struct btree *); +void bch2_btree_node_wait_on_read(struct btree *); +void bch2_btree_node_wait_on_write(struct btree *); + +enum compact_mode { + COMPACT_LAZY, + COMPACT_ALL, +}; + +bool bch2_compact_whiteouts(struct bch_fs *, struct btree *, + enum compact_mode); + +static inline bool should_compact_bset_lazy(struct btree *b, + struct bset_tree *t) +{ + unsigned total_u64s = bset_u64s(t); + unsigned dead_u64s = bset_dead_u64s(b, t); + + return dead_u64s > 64 && dead_u64s * 3 > total_u64s; +} + +static inline bool bch2_maybe_compact_whiteouts(struct bch_fs *c, struct btree *b) +{ + struct bset_tree *t; + + for_each_bset(b, t) + if (should_compact_bset_lazy(b, t)) + return bch2_compact_whiteouts(c, b, COMPACT_LAZY); + + return false; +} + +static inline struct nonce btree_nonce(struct bset *i, unsigned offset) +{ + return (struct nonce) {{ + [0] = cpu_to_le32(offset), + [1] = ((__le32 *) &i->seq)[0], + [2] = ((__le32 *) &i->seq)[1], + [3] = ((__le32 *) &i->journal_seq)[0]^BCH_NONCE_BTREE, + }}; +} + +static inline int bset_encrypt(struct bch_fs *c, struct bset *i, unsigned offset) +{ + struct nonce nonce = btree_nonce(i, offset); + int ret; + + if (!offset) { + struct btree_node *bn = container_of(i, struct btree_node, keys); + unsigned bytes = (void *) &bn->keys - (void *) &bn->flags; + + ret = bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, + &bn->flags, bytes); + if (ret) + return ret; + + nonce = nonce_add(nonce, round_up(bytes, CHACHA_BLOCK_SIZE)); + } + + return bch2_encrypt(c, BSET_CSUM_TYPE(i), nonce, i->_data, + vstruct_end(i) - (void *) i->_data); +} + +void bch2_btree_sort_into(struct bch_fs *, struct btree *, struct btree *); + +void bch2_btree_node_drop_keys_outside_node(struct btree *); + +void bch2_btree_build_aux_trees(struct btree *); +void 
bch2_btree_init_next(struct btree_trans *, struct btree *); + +int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, + struct btree *, bool, bool *); +void bch2_btree_node_read(struct bch_fs *, struct btree *, bool); +int bch2_btree_root_read(struct bch_fs *, enum btree_id, + const struct bkey_i *, unsigned); + +void bch2_btree_complete_write(struct bch_fs *, struct btree *, + struct btree_write *); + +bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); + +enum btree_write_flags { + __BTREE_WRITE_ONLY_IF_NEED = BTREE_WRITE_TYPE_BITS, + __BTREE_WRITE_ALREADY_STARTED, +}; +#define BTREE_WRITE_ONLY_IF_NEED (1U << __BTREE_WRITE_ONLY_IF_NEED ) +#define BTREE_WRITE_ALREADY_STARTED (1U << __BTREE_WRITE_ALREADY_STARTED) + +void __bch2_btree_node_write(struct bch_fs *, struct btree *, unsigned); +void bch2_btree_node_write(struct bch_fs *, struct btree *, + enum six_lock_type, unsigned); + +static inline void btree_node_write_if_need(struct bch_fs *c, struct btree *b, + enum six_lock_type lock_held) +{ + bch2_btree_node_write(c, b, lock_held, BTREE_WRITE_ONLY_IF_NEED); +} + +bool bch2_btree_flush_all_reads(struct bch_fs *); +bool bch2_btree_flush_all_writes(struct bch_fs *); + +static inline void compat_bformat(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, struct bkey_format *f) +{ + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_inodes) { + swap(f->bits_per_field[BKEY_FIELD_INODE], + f->bits_per_field[BKEY_FIELD_OFFSET]); + swap(f->field_offset[BKEY_FIELD_INODE], + f->field_offset[BKEY_FIELD_OFFSET]); + } + + if (version < bcachefs_metadata_version_snapshot && + (level || btree_type_has_snapshots(btree_id))) { + u64 max_packed = + ~(~0ULL << f->bits_per_field[BKEY_FIELD_SNAPSHOT]); + + f->field_offset[BKEY_FIELD_SNAPSHOT] = write + ? 
0 + : cpu_to_le64(U32_MAX - max_packed); + } +} + +static inline void compat_bpos(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, struct bpos *p) +{ + if (big_endian != CPU_BIG_ENDIAN) + bch2_bpos_swab(p); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id == BTREE_ID_inodes) + swap(p->inode, p->offset); +} + +static inline void compat_btree_node(unsigned level, enum btree_id btree_id, + unsigned version, unsigned big_endian, + int write, + struct btree_node *bn) +{ + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id_is_extents(btree_id) && + !bpos_eq(bn->min_key, POS_MIN) && + write) + bn->min_key = bpos_nosnap_predecessor(bn->min_key); + + if (version < bcachefs_metadata_version_snapshot && + write) + bn->max_key.snapshot = 0; + + compat_bpos(level, btree_id, version, big_endian, write, &bn->min_key); + compat_bpos(level, btree_id, version, big_endian, write, &bn->max_key); + + if (version < bcachefs_metadata_version_snapshot && + !write) + bn->max_key.snapshot = U32_MAX; + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id_is_extents(btree_id) && + !bpos_eq(bn->min_key, POS_MIN) && + !write) + bn->min_key = bpos_nosnap_successor(bn->min_key); +} + +void bch2_btree_write_stats_to_text(struct printbuf *, struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_IO_H */ diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c new file mode 100644 index 000000000..e292c5a2a --- /dev/null +++ b/fs/bcachefs/btree_iter.c @@ -0,0 +1,3214 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "bkey_buf.h" +#include "btree_cache.h" +#include "btree_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "journal.h" +#include "recovery.h" +#include "replicas.h" +#include "subvolume.h" +#include "trace.h" + +#include +#include + +static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *); +static inline void btree_path_list_add(struct btree_trans *, struct btree_path *, + struct btree_path *); + +static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter) +{ +#ifdef TRACK_PATH_ALLOCATED + return iter->ip_allocated; +#else + return 0; +#endif +} + +static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *); + +/* + * Unlocks before scheduling + * Note: does not revalidate iterator + */ +static inline int bch2_trans_cond_resched(struct btree_trans *trans) +{ + if (need_resched() || race_fault()) + return drop_locks_do(trans, (schedule(), 0)); + else + return 0; +} + +static inline int __btree_path_cmp(const struct btree_path *l, + enum btree_id r_btree_id, + bool r_cached, + struct bpos r_pos, + unsigned r_level) +{ + /* + * Must match lock ordering as defined by __bch2_btree_node_lock: + */ + return cmp_int(l->btree_id, r_btree_id) ?: + cmp_int((int) l->cached, (int) r_cached) ?: + bpos_cmp(l->pos, r_pos) ?: + -cmp_int(l->level, r_level); +} + +static inline int btree_path_cmp(const struct btree_path *l, + const struct btree_path *r) +{ + return __btree_path_cmp(l, r->btree_id, r->cached, r->pos, r->level); +} + +static inline struct bpos bkey_successor(struct btree_iter *iter, struct bpos p) +{ + /* Are we iterating over keys in all snapshots? 
*/ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_successor(p); + } else { + p = bpos_nosnap_successor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + +static inline struct bpos bkey_predecessor(struct btree_iter *iter, struct bpos p) +{ + /* Are we iterating over keys in all snapshots? */ + if (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) { + p = bpos_predecessor(p); + } else { + p = bpos_nosnap_predecessor(p); + p.snapshot = iter->snapshot; + } + + return p; +} + +static inline struct bpos btree_iter_search_key(struct btree_iter *iter) +{ + struct bpos pos = iter->pos; + + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + !bkey_eq(pos, POS_MAX)) + pos = bkey_successor(iter, pos); + return pos; +} + +static inline bool btree_path_pos_before_node(struct btree_path *path, + struct btree *b) +{ + return bpos_lt(path->pos, b->data->min_key); +} + +static inline bool btree_path_pos_after_node(struct btree_path *path, + struct btree *b) +{ + return bpos_gt(path->pos, b->key.k.p); +} + +static inline bool btree_path_pos_in_node(struct btree_path *path, + struct btree *b) +{ + return path->btree_id == b->c.btree_id && + !btree_path_pos_before_node(path, b) && + !btree_path_pos_after_node(path, b); +} + +/* Btree iterator: */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +static void bch2_btree_path_verify_cached(struct btree_trans *trans, + struct btree_path *path) +{ + struct bkey_cached *ck; + bool locked = btree_node_locked(path, 0); + + if (!bch2_btree_node_relock(trans, path, 0)) + return; + + ck = (void *) path->l[0].b; + BUG_ON(ck->key.btree_id != path->btree_id || + !bkey_eq(ck->key.pos, path->pos)); + + if (!locked) + btree_node_unlock(trans, path, 0); +} + +static void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree_path_level *l; + struct btree_node_iter tmp; + bool locked; + struct bkey_packed *p, *k; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + struct printbuf buf3 = PRINTBUF; + const char *msg; + + if (!bch2_debug_check_iterators) + return; + + l = &path->l[level]; + tmp = l->iter; + locked = btree_node_locked(path, level); + + if (path->cached) { + if (!level) + bch2_btree_path_verify_cached(trans, path); + return; + } + + if (!btree_path_node(path, level)) + return; + + if (!bch2_btree_node_relock_notrace(trans, path, level)) + return; + + BUG_ON(!btree_path_pos_in_node(path, l->b)); + + bch2_btree_node_iter_verify(&l->iter, l->b); + + /* + * For interior nodes, the iterator will have skipped past deleted keys: + */ + p = level + ? 
bch2_btree_node_iter_prev(&tmp, l->b) + : bch2_btree_node_iter_prev_all(&tmp, l->b); + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + + if (p && bkey_iter_pos_cmp(l->b, p, &path->pos) >= 0) { + msg = "before"; + goto err; + } + + if (k && bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { + msg = "after"; + goto err; + } + + if (!locked) + btree_node_unlock(trans, path, level); + return; +err: + bch2_bpos_to_text(&buf1, path->pos); + + if (p) { + struct bkey uk = bkey_unpack_key(l->b, p); + + bch2_bkey_to_text(&buf2, &uk); + } else { + prt_printf(&buf2, "(none)"); + } + + if (k) { + struct bkey uk = bkey_unpack_key(l->b, k); + + bch2_bkey_to_text(&buf3, &uk); + } else { + prt_printf(&buf3, "(none)"); + } + + panic("path should be %s key at level %u:\n" + "path pos %s\n" + "prev key %s\n" + "cur key %s\n", + msg, level, buf1.buf, buf2.buf, buf3.buf); +} + +static void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) +{ + struct bch_fs *c = trans->c; + unsigned i; + + EBUG_ON(path->btree_id >= BTREE_ID_NR); + + for (i = 0; i < (!path->cached ? BTREE_MAX_DEPTH : 1); i++) { + if (!path->l[i].b) { + BUG_ON(!path->cached && + bch2_btree_id_root(c, path->btree_id)->b->c.level > i); + break; + } + + bch2_btree_path_verify_level(trans, path, i); + } + + bch2_btree_path_verify_locks(path); +} + +void bch2_trans_verify_paths(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_verify(trans, path); +} + +static void bch2_btree_iter_verify(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + + BUG_ON(iter->btree_id >= BTREE_ID_NR); + + BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached); + + BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + + BUG_ON(!(iter->flags & __BTREE_ITER_ALL_SNAPSHOTS) && + (iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(iter->btree_id)); + + if (iter->update_path) + bch2_btree_path_verify(trans, iter->update_path); + bch2_btree_path_verify(trans, iter->path); +} + +static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +{ + BUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !iter->pos.snapshot); + + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS) && + iter->pos.snapshot != iter->snapshot); + + BUG_ON(bkey_lt(iter->pos, bkey_start_pos(&iter->k)) || + bkey_gt(iter->pos, iter->k.p)); +} + +static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) +{ + struct btree_trans *trans = iter->trans; + struct btree_iter copy; + struct bkey_s_c prev; + int ret = 0; + + if (!bch2_debug_check_iterators) + return 0; + + if (!(iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) + return 0; + + if (bkey_err(k) || !k.k) + return 0; + + BUG_ON(!bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)); + + bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, + BTREE_ITER_NOPRESERVE| + BTREE_ITER_ALL_SNAPSHOTS); + prev = bch2_btree_iter_prev(©); + if (!prev.k) + goto out; + + ret = bkey_err(prev); + if (ret) + goto out; + + if (bkey_eq(prev.k->p, k.k->p) && + bch2_snapshot_is_ancestor(trans->c, iter->snapshot, + prev.k->p.snapshot) > 0) { + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + bch2_bkey_to_text(&buf1, k.k); + bch2_bkey_to_text(&buf2, prev.k); + + panic("iter snap %u\n" + "k %s\n" + "prev %s\n", + iter->snapshot, + buf1.buf, buf2.buf); + } +out: + bch2_trans_iter_exit(trans, ©); + return ret; +} + +void bch2_assert_pos_locked(struct 
btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) +{ + struct btree_path *path; + unsigned idx; + struct printbuf buf = PRINTBUF; + + btree_trans_sort_paths(trans); + + trans_for_each_path_inorder(trans, path, idx) { + int cmp = cmp_int(path->btree_id, id) ?: + cmp_int(path->cached, key_cache); + + if (cmp > 0) + break; + if (cmp < 0) + continue; + + if (!btree_node_locked(path, 0) || + !path->should_be_locked) + continue; + + if (!key_cache) { + if (bkey_ge(pos, path->l[0].b->data->min_key) && + bkey_le(pos, path->l[0].b->key.k.p)) + return; + } else { + if (bkey_eq(pos, path->pos)) + return; + } + } + + bch2_dump_trans_paths_updates(trans); + bch2_bpos_to_text(&buf, pos); + + panic("not locked: %s %s%s\n", + bch2_btree_ids[id], buf.buf, + key_cache ? " cached" : ""); +} + +#else + +static inline void bch2_btree_path_verify_level(struct btree_trans *trans, + struct btree_path *path, unsigned l) {} +static inline void bch2_btree_path_verify(struct btree_trans *trans, + struct btree_path *path) {} +static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} +static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} +static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } + +#endif + +/* Btree path: fixups after btree updates */ + +static void btree_node_iter_set_set_pos(struct btree_node_iter *iter, + struct btree *b, + struct bset_tree *t, + struct bkey_packed *k) +{ + struct btree_node_iter_set *set; + + btree_node_iter_for_each(iter, set) + if (set->end == t->end_offset) { + set->k = __btree_node_key_to_offset(b, k); + bch2_btree_node_iter_sort(iter, b); + return; + } + + bch2_btree_node_iter_push(iter, b, k, btree_bkey_last(b, t)); +} + +static void __bch2_btree_path_fix_key_modified(struct btree_path *path, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_path_level *l = &path->l[b->c.level]; + + if (where != bch2_btree_node_iter_peek_all(&l->iter, l->b)) + return; + + if (bkey_iter_pos_cmp(l->b, where, &path->pos) < 0) + bch2_btree_node_iter_advance(&l->iter, l->b); +} + +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, + struct btree *b, + struct bkey_packed *where) +{ + struct btree_path *path; + + trans_for_each_path_with_node(trans, b, path) { + __bch2_btree_path_fix_key_modified(path, b, where); + bch2_btree_path_verify_level(trans, path, b->c.level); + } +} + +static void __bch2_btree_node_iter_fix(struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bset_tree *t, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + const struct bkey_packed *end = btree_bkey_last(b, t); + struct btree_node_iter_set *set; + unsigned offset = __btree_node_key_to_offset(b, where); + int shift = new_u64s - clobber_u64s; + unsigned old_end = t->end_offset - shift; + unsigned orig_iter_pos = node_iter->data[0].k; + bool iter_current_key_modified = + orig_iter_pos >= offset && + orig_iter_pos <= offset + clobber_u64s; + + btree_node_iter_for_each(node_iter, set) + if (set->end == old_end) + goto found; + + /* didn't find the bset in the iterator - might have to readd it: */ + if (new_u64s && + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { + bch2_btree_node_iter_push(node_iter, b, where, end); + goto fixup_done; + } else { + /* Iterator is after key that changed */ + return; + } +found: + set->end = t->end_offset; + + /* Iterator hasn't gotten to the key that changed yet: */ + if (set->k < offset) + 
return; + + if (new_u64s && + bkey_iter_pos_cmp(b, where, &path->pos) >= 0) { + set->k = offset; + } else if (set->k < offset + clobber_u64s) { + set->k = offset + new_u64s; + if (set->k == set->end) + bch2_btree_node_iter_set_drop(node_iter, set); + } else { + /* Iterator is after key that changed */ + set->k = (int) set->k + shift; + return; + } + + bch2_btree_node_iter_sort(node_iter, b); +fixup_done: + if (node_iter->data[0].k != orig_iter_pos) + iter_current_key_modified = true; + + /* + * When a new key is added, and the node iterator now points to that + * key, the iterator might have skipped past deleted keys that should + * come after the key the iterator now points to. We have to rewind to + * before those deleted keys - otherwise + * bch2_btree_node_iter_prev_all() breaks: + */ + if (!bch2_btree_node_iter_end(node_iter) && + iter_current_key_modified && + b->c.level) { + struct bset_tree *t; + struct bkey_packed *k, *k2, *p; + + k = bch2_btree_node_iter_peek_all(node_iter, b); + + for_each_bset(b, t) { + bool set_pos = false; + + if (node_iter->data[0].end == t->end_offset) + continue; + + k2 = bch2_btree_node_iter_bset_pos(node_iter, b, t); + + while ((p = bch2_bkey_prev_all(b, t, k2)) && + bkey_iter_cmp(b, k, p) < 0) { + k2 = p; + set_pos = true; + } + + if (set_pos) + btree_node_iter_set_set_pos(node_iter, + b, t, k2); + } + } +} + +void bch2_btree_node_iter_fix(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_packed *where, + unsigned clobber_u64s, + unsigned new_u64s) +{ + struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where); + struct btree_path *linked; + + if (node_iter != &path->l[b->c.level].iter) { + __bch2_btree_node_iter_fix(path, b, node_iter, t, + where, clobber_u64s, new_u64s); + + if (bch2_debug_check_iterators) + bch2_btree_node_iter_verify(node_iter, b); + } + + trans_for_each_path_with_node(trans, b, linked) { + __bch2_btree_node_iter_fix(linked, b, + &linked->l[b->c.level].iter, t, + where, clobber_u64s, new_u64s); + bch2_btree_path_verify_level(trans, linked, b->c.level); + } +} + +/* Btree path level: pointer to a particular btree node and node iter */ + +static inline struct bkey_s_c __btree_iter_unpack(struct bch_fs *c, + struct btree_path_level *l, + struct bkey *u, + struct bkey_packed *k) +{ + if (unlikely(!k)) { + /* + * signal to bch2_btree_iter_peek_slot() that we're currently at + * a hole + */ + u->type = KEY_TYPE_deleted; + return bkey_s_c_null; + } + + return bkey_disassemble(l->b, k, u); +} + +static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, + struct btree_path_level *l, + struct bkey *u) +{ + return __btree_iter_unpack(c, l, u, + bch2_btree_node_iter_peek_all(&l->iter, l->b)); +} + +static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) +{ + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + path->pos = k.k ? k.k->p : l->b->key.k.p; + trans->paths_sorted = false; + bch2_btree_path_verify_level(trans, path, l - path->l); + return k; +} + +static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, + struct btree_path *path, + struct btree_path_level *l, + struct bkey *u) +{ + struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, + bch2_btree_node_iter_prev(&l->iter, l->b)); + + path->pos = k.k ? 
k.k->p : l->b->data->min_key; + trans->paths_sorted = false; + bch2_btree_path_verify_level(trans, path, l - path->l); + return k; +} + +static inline bool btree_path_advance_to_pos(struct btree_path *path, + struct btree_path_level *l, + int max_advance) +{ + struct bkey_packed *k; + int nr_advanced = 0; + + while ((k = bch2_btree_node_iter_peek_all(&l->iter, l->b)) && + bkey_iter_pos_cmp(l->b, k, &path->pos) < 0) { + if (max_advance > 0 && nr_advanced >= max_advance) + return false; + + bch2_btree_node_iter_advance(&l->iter, l->b); + nr_advanced++; + } + + return true; +} + +static inline void __btree_path_level_init(struct btree_path *path, + unsigned level) +{ + struct btree_path_level *l = &path->l[level]; + + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); + + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (level) + bch2_btree_node_iter_peek(&l->iter, l->b); +} + +void bch2_btree_path_level_init(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + BUG_ON(path->cached); + + EBUG_ON(!btree_path_pos_in_node(path, b)); + + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); + path->l[b->c.level].b = b; + __btree_path_level_init(path, b->c.level); +} + +/* Btree path: fixups after btree node updates: */ + +static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + if (!i->cached && + i->level == b->c.level && + i->btree_id == b->c.btree_id && + bpos_cmp(i->k->k.p, b->data->min_key) >= 0 && + bpos_cmp(i->k->k.p, b->data->max_key) <= 0) { + i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, + i->k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } +} + +/* + * A btree node is being replaced - update the iterator to point to the new + * node: + */ +void bch2_trans_node_add(struct btree_trans *trans, struct btree *b) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->uptodate == BTREE_ITER_UPTODATE && + !path->cached && + btree_path_pos_in_node(path, b)) { + enum btree_node_locked_type t = + btree_lock_want(path, b->c.level); + + if (t != BTREE_NODE_UNLOCKED) { + btree_node_unlock(trans, path, b->c.level); + six_lock_increment(&b->c.lock, (enum six_lock_type) t); + mark_btree_node_locked(trans, path, b->c.level, (enum six_lock_type) t); + } + + bch2_btree_path_level_init(trans, path, b); + } + + bch2_trans_revalidate_updates_in_node(trans, b); +} + +/* + * A btree node has been modified in such a way as to invalidate iterators - fix + * them: + */ +void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b) +{ + struct btree_path *path; + + trans_for_each_path_with_node(trans, b, path) + __btree_path_level_init(path, b->c.level); + + bch2_trans_revalidate_updates_in_node(trans, b); +} + +/* Btree path: traverse, set_pos: */ + +static inline int btree_path_lock_root(struct btree_trans *trans, + struct btree_path *path, + unsigned depth_want, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree *b, **rootp = &bch2_btree_id_root(c, path->btree_id)->b; + enum six_lock_type lock_type; + unsigned i; + int ret; + + EBUG_ON(path->nodes_locked); + + while (1) { + b = READ_ONCE(*rootp); + path->level = READ_ONCE(b->c.level); + + if 
(unlikely(path->level < depth_want)) { + /* + * the root is at a lower depth than the depth we want: + * got to the end of the btree, or we're walking nodes + * greater than some depth and there are no nodes >= + * that depth + */ + path->level = depth_want; + for (i = path->level; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + return 1; + } + + lock_type = __btree_lock_want(path, path->level); + ret = btree_node_lock(trans, path, &b->c, + path->level, lock_type, trace_ip); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_lock_fail_root_changed)) + continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + BUG(); + } + + if (likely(b == READ_ONCE(*rootp) && + b->c.level == path->level && + !race_fault())) { + for (i = 0; i < path->level; i++) + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_lock_root); + path->l[path->level].b = b; + for (i = path->level + 1; i < BTREE_MAX_DEPTH; i++) + path->l[i].b = NULL; + + mark_btree_node_locked(trans, path, path->level, lock_type); + bch2_btree_path_level_init(trans, path, b); + return 0; + } + + six_unlock_type(&b->c.lock, lock_type); + } +} + +noinline +static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *path) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree_node_iter node_iter = l->iter; + struct bkey_packed *k; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 1 : 16); + bool was_locked = btree_node_locked(path, path->level); + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + while (nr-- && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + + bch2_btree_node_iter_advance(&node_iter, l->b); + k = bch2_btree_node_iter_peek(&node_iter, l->b); + if (!k) + break; + + bch2_bkey_buf_unpack(&tmp, c, l->b, k); + ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, + path->level - 1); + } + + if (!was_locked) + btree_node_unlock(trans, path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + +static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *path, + struct btree_and_journal_iter *jiter) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + struct bkey_buf tmp; + unsigned nr = test_bit(BCH_FS_STARTED, &c->flags) + ? (path->level > 1 ? 0 : 2) + : (path->level > 1 ? 
1 : 16); + bool was_locked = btree_node_locked(path, path->level); + int ret = 0; + + bch2_bkey_buf_init(&tmp); + + while (nr-- && !ret) { + if (!bch2_btree_node_relock(trans, path, path->level)) + break; + + bch2_btree_and_journal_iter_advance(jiter); + k = bch2_btree_and_journal_iter_peek(jiter); + if (!k.k) + break; + + bch2_bkey_buf_reassemble(&tmp, c, k); + ret = bch2_btree_node_prefetch(trans, path, tmp.k, path->btree_id, + path->level - 1); + } + + if (!was_locked) + btree_node_unlock(trans, path, path->level); + + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + +static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, + struct btree_path *path, + unsigned plevel, struct btree *b) +{ + struct btree_path_level *l = &path->l[plevel]; + bool locked = btree_node_locked(path, plevel); + struct bkey_packed *k; + struct bch_btree_ptr_v2 *bp; + + if (!bch2_btree_node_relock(trans, path, plevel)) + return; + + k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + BUG_ON(k->type != KEY_TYPE_btree_ptr_v2); + + bp = (void *) bkeyp_val(&l->b->format, k); + bp->mem_ptr = (unsigned long)b; + + if (!locked) + btree_node_unlock(trans, path, plevel); +} + +static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + struct bkey_buf *out) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree_and_journal_iter jiter; + struct bkey_s_c k; + int ret = 0; + + __bch2_btree_and_journal_iter_init_node_iter(&jiter, c, l->b, l->iter, path->pos); + + k = bch2_btree_and_journal_iter_peek(&jiter); + + bch2_bkey_buf_reassemble(out, c, k); + + if (flags & BTREE_ITER_PREFETCH) + ret = btree_path_prefetch_j(trans, path, &jiter); + + bch2_btree_and_journal_iter_exit(&jiter); + return ret; +} + +static __always_inline int btree_path_down(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_path_level *l = path_l(path); + struct btree *b; + unsigned level = path->level - 1; + enum six_lock_type lock_type = __btree_lock_want(path, level); + struct bkey_buf tmp; + int ret; + + EBUG_ON(!btree_node_locked(path, path->level)); + + bch2_bkey_buf_init(&tmp); + + if (unlikely(trans->journal_replay_not_finished)) { + ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + if (ret) + goto err; + } else { + bch2_bkey_buf_unpack(&tmp, c, l->b, + bch2_btree_node_iter_peek(&l->iter, l->b)); + + if (flags & BTREE_ITER_PREFETCH) { + ret = btree_path_prefetch(trans, path); + if (ret) + goto err; + } + } + + b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + ret = PTR_ERR_OR_ZERO(b); + if (unlikely(ret)) + goto err; + + if (likely(!trans->journal_replay_not_finished && + tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && + unlikely(b != btree_node_mem_ptr(tmp.k))) + btree_node_mem_ptr_set(trans, path, level + 1, b); + + if (btree_node_read_locked(path, level + 1)) + btree_node_unlock(trans, path, level + 1); + + mark_btree_node_locked(trans, path, level, lock_type); + path->level = level; + bch2_btree_path_level_init(trans, path, b); + + bch2_btree_path_verify_locks(path); +err: + bch2_bkey_buf_exit(&tmp, c); + return ret; +} + + +static int bch2_btree_path_traverse_all(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; + unsigned long trace_ip = _RET_IP_; + int i, ret = 0; + + if (trans->in_traverse_all) + return -BCH_ERR_transaction_restart_in_traverse_all; + + 
trans->in_traverse_all = true; +retry_all: + trans->restarted = 0; + trans->last_restarted_ip = 0; + + trans_for_each_path(trans, path) + path->should_be_locked = false; + + btree_trans_sort_paths(trans); + + bch2_trans_unlock(trans); + cond_resched(); + + if (unlikely(trans->memory_allocation_failure)) { + struct closure cl; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + } + + /* Now, redo traversals in correct order: */ + i = 0; + while (i < trans->nr_sorted) { + path = trans->paths + trans->sorted[i]; + + /* + * Traversing a path can cause another path to be added at about + * the same position: + */ + if (path->uptodate) { + __btree_path_get(path, false); + ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_); + __btree_path_put(path, false); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, ENOMEM)) + goto retry_all; + if (ret) + goto err; + } else { + i++; + } + } + + /* + * We used to assert that all paths had been traversed here + * (path->uptodate < BTREE_ITER_NEED_TRAVERSE); however, since + * path->should_be_locked is not set yet, we might have unlocked and + * then failed to relock a path - that's fine. + */ +err: + bch2_btree_cache_cannibalize_unlock(c); + + trans->in_traverse_all = false; + + trace_and_count(c, trans_traverse_all, trans, trace_ip); + return ret; +} + +static inline bool btree_path_check_pos_in_node(struct btree_path *path, + unsigned l, int check_pos) +{ + if (check_pos < 0 && btree_path_pos_before_node(path, path->l[l].b)) + return false; + if (check_pos > 0 && btree_path_pos_after_node(path, path->l[l].b)) + return false; + return true; +} + +static inline bool btree_path_good_node(struct btree_trans *trans, + struct btree_path *path, + unsigned l, int check_pos) +{ + return is_btree_node(path, l) && + bch2_btree_node_relock(trans, path, l) && + btree_path_check_pos_in_node(path, l, check_pos); +} + +static void btree_path_set_level_down(struct btree_trans *trans, + struct btree_path *path, + unsigned new_level) +{ + unsigned l; + + path->level = new_level; + + for (l = path->level + 1; l < BTREE_MAX_DEPTH; l++) + if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) + btree_node_unlock(trans, path, l); + + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + bch2_btree_path_verify(trans, path); +} + +static noinline unsigned __btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, + int check_pos) +{ + unsigned i, l = path->level; +again: + while (btree_path_node(path, l) && + !btree_path_good_node(trans, path, l, check_pos)) + __btree_path_set_level_up(trans, path, l++); + + /* If we need intent locks, take them too: */ + for (i = l + 1; + i < path->locks_want && btree_path_node(path, i); + i++) + if (!bch2_btree_node_relock(trans, path, i)) { + while (l <= i) + __btree_path_set_level_up(trans, path, l++); + goto again; + } + + return l; +} + +static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans, + struct btree_path *path, + int check_pos) +{ + return likely(btree_node_locked(path, path->level) && + btree_path_check_pos_in_node(path, path->level, check_pos)) + ? path->level + : __btree_path_up_until_good_node(trans, path, check_pos); +} + +/* + * This is the main state machine for walking down the btree - walks down to a + * specified depth + * + * Returns 0 on success, -EIO on error (error reading in a btree node).
+ * + * On error, caller (peek_node()/peek_key()) must return NULL; the error is + * stashed in the iterator and returned from bch2_trans_exit(). + */ +int bch2_btree_path_traverse_one(struct btree_trans *trans, + struct btree_path *path, + unsigned flags, + unsigned long trace_ip) +{ + unsigned depth_want = path->level; + int ret = -((int) trans->restarted); + + if (unlikely(ret)) + goto out; + + /* + * Ensure we obey path->should_be_locked: if it's set, we can't unlock + * and re-traverse the path without a transaction restart: + */ + if (path->should_be_locked) { + ret = bch2_btree_path_relock(trans, path, trace_ip); + goto out; + } + + if (path->cached) { + ret = bch2_btree_path_traverse_cached(trans, path, flags); + goto out; + } + + if (unlikely(path->level >= BTREE_MAX_DEPTH)) + goto out; + + path->level = btree_path_up_until_good_node(trans, path, 0); + + EBUG_ON(btree_path_node(path, path->level) && + !btree_node_locked(path, path->level)); + + /* + * Note: path->nodes[path->level] may be temporarily NULL here - that + * would indicate to other code that we got to the end of the btree, + * here it indicates that relocking the root failed - it's critical that + * btree_path_lock_root() comes next and that it can't fail + */ + while (path->level > depth_want) { + ret = btree_path_node(path, path->level) + ? btree_path_down(trans, path, flags, trace_ip) + : btree_path_lock_root(trans, path, depth_want, trace_ip); + if (unlikely(ret)) { + if (ret == 1) { + /* + * No nodes at this level - got to the end of + * the btree: + */ + ret = 0; + goto out; + } + + __bch2_btree_path_unlock(trans, path); + path->level = depth_want; + path->l[path->level].b = ERR_PTR(ret); + goto out; + } + } + + path->uptodate = BTREE_ITER_UPTODATE; +out: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted) + panic("ret %s (%i) trans->restarted %s (%i)\n", + bch2_err_str(ret), ret, + bch2_err_str(trans->restarted), trans->restarted); + bch2_btree_path_verify(trans, path); + return ret; +} + +static inline void btree_path_copy(struct btree_trans *trans, struct btree_path *dst, + struct btree_path *src) +{ + unsigned i, offset = offsetof(struct btree_path, pos); + + memcpy((void *) dst + offset, + (void *) src + offset, + sizeof(struct btree_path) - offset); + + for (i = 0; i < BTREE_MAX_DEPTH; i++) { + unsigned t = btree_node_locked_type(dst, i); + + if (t != BTREE_NODE_UNLOCKED) + six_lock_increment(&dst->l[i].b->c.lock, t); + } +} + +static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src, + bool intent) +{ + struct btree_path *new = btree_path_alloc(trans, src); + + btree_path_copy(trans, new, src); + __btree_path_get(new, intent); + return new; +} + +__flatten +struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent, + unsigned long ip) +{ + __btree_path_put(path, intent); + path = btree_path_clone(trans, path, intent); + path->preserve = false; + return path; +} + +struct btree_path * __must_check +__bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent, unsigned long ip, int cmp) +{ + unsigned level = path->level; + + bch2_trans_verify_not_in_restart(trans); + EBUG_ON(!path->ref); + + path = bch2_btree_path_make_mut(trans, path, intent, ip); + + path->pos = new_pos; + trans->paths_sorted = false; + + if (unlikely(path->cached)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); + 
btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + goto out; + } + + level = btree_path_up_until_good_node(trans, path, cmp); + + if (btree_path_node(path, level)) { + struct btree_path_level *l = &path->l[level]; + + BUG_ON(!btree_node_locked(path, level)); + /* + * We might have to skip over many keys, or just a few: try + * advancing the node iterator, and if we have to skip over too + * many keys just reinit it (or if we're rewinding, since that + * is expensive). + */ + if (cmp < 0 || + !btree_path_advance_to_pos(path, l, 8)) + bch2_btree_node_iter_init(&l->iter, l->b, &path->pos); + + /* + * Iterators to interior nodes should always be pointed at the first non + * whiteout: + */ + if (unlikely(level)) + bch2_btree_node_iter_peek(&l->iter, l->b); + } + + if (unlikely(level != path->level)) { + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + __bch2_btree_path_unlock(trans, path); + } +out: + bch2_btree_path_verify(trans, path); + return path; +} + +/* Btree path: main interface: */ + +static struct btree_path *have_path_at_pos(struct btree_trans *trans, struct btree_path *path) +{ + struct btree_path *sib; + + sib = prev_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; + + sib = next_btree_path(trans, path); + if (sib && !btree_path_cmp(sib, path)) + return sib; + + return NULL; +} + +static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btree_path *path) +{ + struct btree_path *sib; + + sib = prev_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; + + sib = next_btree_path(trans, path); + if (sib && sib->level == path->level && path_l(sib)->b == path_l(path)->b) + return sib; + + return NULL; +} + +static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path) +{ + __bch2_btree_path_unlock(trans, path); + btree_path_list_remove(trans, path); + trans->paths_allocated &= ~(1ULL << path->idx); +} + +void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent) +{ + struct btree_path *dup; + + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) + return; + + dup = path->preserve + ? 
have_path_at_pos(trans, path) + : have_node_at_pos(trans, path); + + if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) + return; + + if (path->should_be_locked && + !trans->restarted && + (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_))) + return; + + if (dup) { + dup->preserve |= path->preserve; + dup->should_be_locked |= path->should_be_locked; + } + + __bch2_path_free(trans, path); +} + +static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path, + bool intent) +{ + EBUG_ON(trans->paths + path->idx != path); + EBUG_ON(!path->ref); + + if (!__btree_path_put(path, intent)) + return; + + __bch2_path_free(trans, path); +} + +void bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) +{ + panic("trans->restart_count %u, should be %u, last restarted by %pS\n", + trans->restart_count, restart_count, + (void *) trans->last_begin_ip); +} + +void bch2_trans_in_restart_error(struct btree_trans *trans) +{ + panic("in transaction restart: %s, last restarted by %pS\n", + bch2_err_str(trans->restarted), + (void *) trans->last_restarted_ip); +} + +noinline __cold +void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) +{ + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + + prt_printf(buf, "transaction updates for %s journal seq %llu", + trans->fn, trans->journal_res.seq); + prt_newline(buf); + printbuf_indent_add(buf, 2); + + trans_for_each_update(trans, i) { + struct bkey_s_c old = { &i->old_k, i->old_v }; + + prt_printf(buf, "update: btree=%s cached=%u %pS", + bch2_btree_ids[i->btree_id], + i->cached, + (void *) i->ip_allocated); + prt_newline(buf); + + prt_printf(buf, " old "); + bch2_bkey_val_to_text(buf, trans->c, old); + prt_newline(buf); + + prt_printf(buf, " new "); + bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(i->k)); + prt_newline(buf); + } + + trans_for_each_wb_update(trans, wb) { + prt_printf(buf, "update: btree=%s wb=1 %pS", + bch2_btree_ids[wb->btree], + (void *) i->ip_allocated); + prt_newline(buf); + + prt_printf(buf, " new "); + bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k)); + prt_newline(buf); + } + + printbuf_indent_sub(buf, 2); +} + +noinline __cold +void bch2_dump_trans_updates(struct btree_trans *trans) +{ + struct printbuf buf = PRINTBUF; + + bch2_trans_updates_to_text(&buf, trans); + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +noinline __cold +void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path) +{ + prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ", + path->idx, path->ref, path->intent_ref, + path->preserve ? 'P' : ' ', + path->should_be_locked ? 
'S' : ' ', + bch2_btree_ids[path->btree_id], + path->level); + bch2_bpos_to_text(out, path->pos); + + prt_printf(out, " locks %u", path->nodes_locked); +#ifdef TRACK_PATH_ALLOCATED + prt_printf(out, " %pS", (void *) path->ip_allocated); +#endif + prt_newline(out); +} + +static noinline __cold +void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans, + bool nosort) +{ + struct btree_path *path; + unsigned idx; + + if (!nosort) + btree_trans_sort_paths(trans); + + trans_for_each_path_inorder(trans, path, idx) + bch2_btree_path_to_text(out, path); +} + +noinline __cold +void bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans) +{ + __bch2_trans_paths_to_text(out, trans, false); +} + +static noinline __cold +void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) +{ + struct printbuf buf = PRINTBUF; + + __bch2_trans_paths_to_text(&buf, trans, nosort); + bch2_trans_updates_to_text(&buf, trans); + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +noinline __cold +void bch2_dump_trans_paths_updates(struct btree_trans *trans) +{ + __bch2_dump_trans_paths_updates(trans, false); +} + +noinline __cold +static void bch2_trans_update_max_paths(struct btree_trans *trans) +{ + struct btree_transaction_stats *s = btree_trans_stats(trans); + struct printbuf buf = PRINTBUF; + + if (!s) + return; + + bch2_trans_paths_to_text(&buf, trans); + + if (!buf.allocation_failure) { + mutex_lock(&s->lock); + if (s->nr_max_paths < hweight64(trans->paths_allocated)) { + s->nr_max_paths = trans->nr_max_paths = + hweight64(trans->paths_allocated); + swap(s->max_paths_text, buf.buf); + } + mutex_unlock(&s->lock); + } + + printbuf_exit(&buf); + + trans->nr_max_paths = hweight64(trans->paths_allocated); +} + +static noinline void btree_path_overflow(struct btree_trans *trans) +{ + bch2_dump_trans_paths_updates(trans); + panic("trans path overflow\n"); +} + +static inline struct btree_path *btree_path_alloc(struct btree_trans *trans, + struct btree_path *pos) +{ + struct btree_path *path; + unsigned idx; + + if (unlikely(trans->paths_allocated == + ~((~0ULL << 1) << (BTREE_ITER_MAX - 1)))) + btree_path_overflow(trans); + + idx = __ffs64(~trans->paths_allocated); + + /* + * Do this before marking the new path as allocated, since it won't be + * initialized yet: + */ + if (unlikely(idx > trans->nr_max_paths)) + bch2_trans_update_max_paths(trans); + + trans->paths_allocated |= 1ULL << idx; + + path = &trans->paths[idx]; + path->idx = idx; + path->ref = 0; + path->intent_ref = 0; + path->nodes_locked = 0; + + btree_path_list_add(trans, pos, path); + trans->paths_sorted = false; + return path; +} + +struct btree_path *bch2_path_get(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos, + unsigned locks_want, unsigned level, + unsigned flags, unsigned long ip) +{ + struct btree_path *path, *path_pos = NULL; + bool cached = flags & BTREE_ITER_CACHED; + bool intent = flags & BTREE_ITER_INTENT; + int i; + + bch2_trans_verify_not_in_restart(trans); + bch2_trans_verify_locks(trans); + + btree_trans_sort_paths(trans); + + trans_for_each_path_inorder(trans, path, i) { + if (__btree_path_cmp(path, + btree_id, + cached, + pos, + level) > 0) + break; + + path_pos = path; + } + + if (path_pos && + path_pos->cached == cached && + path_pos->btree_id == btree_id && + path_pos->level == level) { + __btree_path_get(path_pos, intent); + path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip); + } else { + path =
btree_path_alloc(trans, path_pos); + path_pos = NULL; + + __btree_path_get(path, intent); + path->pos = pos; + path->btree_id = btree_id; + path->cached = cached; + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + path->should_be_locked = false; + path->level = level; + path->locks_want = locks_want; + path->nodes_locked = 0; + for (i = 0; i < ARRAY_SIZE(path->l); i++) + path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init); +#ifdef TRACK_PATH_ALLOCATED + path->ip_allocated = ip; +#endif + trans->paths_sorted = false; + } + + if (!(flags & BTREE_ITER_NOPRESERVE)) + path->preserve = true; + + if (path->intent_ref) + locks_want = max(locks_want, level + 1); + + /* + * If the path has locks_want greater than requested, we don't downgrade + * it here - on transaction restart because btree node split needs to + * upgrade locks, we might be putting/getting the iterator again. + * Downgrading iterators only happens via bch2_trans_downgrade(), after + * a successful transaction commit. + */ + + locks_want = min(locks_want, BTREE_MAX_DEPTH); + if (locks_want > path->locks_want) + bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want); + + return path; +} + +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u) +{ + + struct btree_path_level *l = path_l(path); + struct bkey_packed *_k; + struct bkey_s_c k; + + if (unlikely(!l->b)) + return bkey_s_c_null; + + EBUG_ON(path->uptodate != BTREE_ITER_UPTODATE); + EBUG_ON(!btree_node_locked(path, path->level)); + + if (!path->cached) { + _k = bch2_btree_node_iter_peek_all(&l->iter, l->b); + k = _k ? bkey_disassemble(l->b, _k, u) : bkey_s_c_null; + + EBUG_ON(k.k && bkey_deleted(k.k) && bpos_eq(k.k->p, path->pos)); + + if (!k.k || !bpos_eq(path->pos, k.k->p)) + goto hole; + } else { + struct bkey_cached *ck = (void *) path->l[0].b; + + EBUG_ON(ck && + (path->btree_id != ck->key.btree_id || + !bkey_eq(path->pos, ck->key.pos))); + if (!ck || !ck->valid) + return bkey_s_c_null; + + *u = ck->k->k; + k = bkey_i_to_s_c(ck->k); + } + + return k; +hole: + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + +/* Btree iterators: */ + +int __must_check +__bch2_btree_iter_traverse(struct btree_iter *iter) +{ + return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); +} + +int __must_check +bch2_btree_iter_traverse(struct btree_iter *iter) +{ + int ret; + + iter->path = bch2_btree_path_set_pos(iter->trans, iter->path, + btree_iter_search_key(iter), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + if (ret) + return ret; + + btree_path_set_should_be_locked(iter->path); + return 0; +} + +/* Iterate across nodes (leaf and interior nodes) */ + +struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct btree *b = NULL; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (ret) + goto err; + + b = btree_path_node(iter->path, iter->path->level); + if (!b) + goto out; + + BUG_ON(bpos_lt(b->key.k.p, iter->pos)); + + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + btree_path_set_should_be_locked(iter->path); +out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + + return b; +err: + 
b = ERR_PTR(ret); + goto out; +} + +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) +{ + struct btree *b; + + while (b = bch2_btree_iter_peek_node(iter), + bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) + bch2_trans_begin(iter->trans); + + return b; +} + +struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct btree_path *path = iter->path; + struct btree *b = NULL; + int ret; + + bch2_trans_verify_not_in_restart(trans); + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + + /* already at end? */ + if (!btree_path_node(path, path->level)) + return NULL; + + /* got to end? */ + if (!btree_path_node(path, path->level + 1)) { + btree_path_set_level_up(trans, path); + return NULL; + } + + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + __bch2_btree_path_unlock(trans, path); + path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); + path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + goto err; + } + + b = btree_path_node(path, path->level + 1); + + if (bpos_eq(iter->pos, b->key.k.p)) { + __btree_path_set_level_up(trans, path, path->level++); + } else { + /* + * Haven't gotten to the end of the parent node: go back down to + * the next child node + */ + path = iter->path = + bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos), + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + btree_path_set_level_down(trans, path, iter->min_depth); + + ret = bch2_btree_path_traverse(trans, path, iter->flags); + if (ret) + goto err; + + b = path->l[path->level].b; + } + + bkey_init(&iter->k); + iter->k.p = iter->pos = b->key.k.p; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + btree_path_set_should_be_locked(iter->path); + BUG_ON(iter->path->uptodate); +out: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + + return b; +err: + b = ERR_PTR(ret); + goto out; +} + +/* Iterate across keys (in leaf nodes only) */ + +inline bool bch2_btree_iter_advance(struct btree_iter *iter) +{ + if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) { + struct bpos pos = iter->k.p; + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? bpos_eq(pos, SPOS_MAX) + : bkey_eq(pos, SPOS_MAX)); + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_successor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; + } else { + if (!btree_path_node(iter->path, iter->path->level)) + return true; + + iter->advanced = true; + return false; + } +} + +inline bool bch2_btree_iter_rewind(struct btree_iter *iter) +{ + struct bpos pos = bkey_start_pos(&iter->k); + bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS + ? 
bpos_eq(pos, POS_MIN) + : bkey_eq(pos, POS_MIN)); + + if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS)) + pos = bkey_predecessor(iter, pos); + bch2_btree_iter_set_pos(iter, pos); + return ret; +} + +static noinline +struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter) +{ + struct btree_insert_entry *i; + struct bkey_i *ret = NULL; + + trans_for_each_update(iter->trans, i) { + if (i->btree_id < iter->btree_id) + continue; + if (i->btree_id > iter->btree_id) + break; + if (bpos_lt(i->k->k.p, iter->path->pos)) + continue; + if (i->key_cache_already_flushed) + continue; + if (!ret || bpos_lt(i->k->k.p, ret->k.p)) + ret = i->k; + } + + return ret; +} + +static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter) +{ + return iter->flags & BTREE_ITER_WITH_UPDATES + ? __bch2_btree_trans_peek_updates(iter) + : NULL; +} + +static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end_pos) +{ + struct bkey_i *k; + + if (bpos_lt(iter->path->pos, iter->journal_pos)) + iter->journal_idx = 0; + + k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id, + iter->path->level, + iter->path->pos, + end_pos, + &iter->journal_idx); + + iter->journal_pos = k ? k->k.p : end_pos; + return k; +} + +static noinline +struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans, + struct btree_iter *iter) +{ + struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos); + + if (k) { + iter->k = k->k; + return bkey_i_to_s_c(k); + } else { + return bkey_s_c_null; + } +} + +static noinline +struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_i *next_journal = + bch2_btree_journal_peek(trans, iter, + k.k ? 
k.k->p : path_l(iter->path)->b->key.k.p); + + if (next_journal) { + iter->k = next_journal->k; + k = bkey_i_to_s_c(next_journal); + } + + return k; +} + +/* + * Checks btree key cache for key at iter->pos and returns it if present, or + * bkey_s_c_null: + */ +static noinline +struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +{ + struct btree_trans *trans = iter->trans; + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k; + int ret; + + if ((iter->flags & BTREE_ITER_KEY_CACHE_FILL) && + bpos_eq(iter->pos, pos)) + return bkey_s_c_null; + + if (!bch2_btree_key_cache_find(c, iter->btree_id, pos)) + return bkey_s_c_null; + + if (!iter->key_cache_path) + iter->key_cache_path = bch2_path_get(trans, iter->btree_id, pos, + iter->flags & BTREE_ITER_INTENT, 0, + iter->flags|BTREE_ITER_CACHED| + BTREE_ITER_CACHED_NOFILL, + _THIS_IP_); + + iter->key_cache_path = bch2_btree_path_set_pos(trans, iter->key_cache_path, pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + iter->flags|BTREE_ITER_CACHED) ?: + bch2_btree_path_relock(trans, iter->path, _THIS_IP_); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + btree_path_set_should_be_locked(iter->key_cache_path); + + k = bch2_btree_path_peek_slot(iter->key_cache_path, &u); + if (k.k && !bkey_err(k)) { + iter->k = u; + k.k = &iter->k; + } + return k; +} + +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) +{ + struct btree_trans *trans = iter->trans; + struct bkey_i *next_update; + struct bkey_s_c k, k2; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + + while (1) { + struct btree_path_level *l; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out; + } + + l = path_l(iter->path); + + if (unlikely(!l->b)) { + /* No btree nodes at requested level: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; + } + + btree_path_set_should_be_locked(iter->path); + + k = btree_path_level_peek_all(trans->c, l, &iter->k); + + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + k.k && + (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + k = k2; + ret = bkey_err(k); + if (ret) { + bch2_btree_iter_set_pos(iter, iter->pos); + goto out; + } + } + + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL)) + k = btree_trans_peek_journal(trans, iter, k); + + next_update = btree_trans_peek_updates(iter); + + if (next_update && + bpos_le(next_update->k.p, + k.k ? k.k->p : l->b->key.k.p)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + } + + if (k.k && bkey_deleted(k.k)) { + /* + * If we've got a whiteout, and it's after the search + * key, advance the search key to the whiteout instead + * of just after the whiteout - it might be a btree + * whiteout, with a real key at the same position, since + * in the btree deleted keys sort before non deleted. + */ + search_key = !bpos_eq(search_key, k.k->p) + ? 
k.k->p + : bpos_successor(k.k->p); + continue; + } + + if (likely(k.k)) { + break; + } else if (likely(!bpos_eq(l->b->key.k.p, SPOS_MAX))) { + /* Advance to next leaf node: */ + search_key = bpos_successor(l->b->key.k.p); + } else { + /* End of btree: */ + bch2_btree_iter_set_pos(iter, SPOS_MAX); + k = bkey_s_c_null; + goto out; + } + } +out: + bch2_btree_iter_verify(iter); + + return k; +} + +/** + * bch2_btree_iter_peek: returns first key greater than or equal to iterator's + * current position + */ +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos end) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key = btree_iter_search_key(iter); + struct bkey_s_c k; + struct bpos iter_pos; + int ret; + + EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX)); + + if (iter->update_path) { + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + + bch2_btree_iter_verify_entry_exit(iter); + + while (1) { + k = __bch2_btree_iter_peek(iter, search_key); + if (unlikely(!k.k)) + goto end; + if (unlikely(bkey_err(k))) + goto out_no_locked; + + /* + * iter->pos should be mononotically increasing, and always be + * equal to the key we just returned - except extents can + * straddle iter->pos: + */ + if (!(iter->flags & BTREE_ITER_IS_EXTENTS)) + iter_pos = k.k->p; + else + iter_pos = bkey_max(iter->pos, bkey_start_pos(k.k)); + + if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS) + ? bkey_gt(iter_pos, end) + : bkey_ge(iter_pos, end))) + goto end; + + if (iter->update_path && + !bkey_eq(iter->update_path->pos, k.k->p)) { + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + } + + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + (iter->flags & BTREE_ITER_INTENT) && + !(iter->flags & BTREE_ITER_IS_EXTENTS) && + !iter->update_path) { + struct bpos pos = k.k->p; + + if (pos.snapshot < iter->snapshot) { + search_key = bpos_successor(k.k->p); + continue; + } + + pos.snapshot = iter->snapshot; + + /* + * advance, same as on exit for iter->path, but only up + * to snapshot + */ + __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT); + iter->update_path = iter->path; + + iter->update_path = bch2_btree_path_set_pos(trans, + iter->update_path, pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + ret = bch2_btree_path_traverse(trans, iter->update_path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + } + + /* + * We can never have a key in a leaf node at POS_MAX, so + * we don't have to check these successor() calls: + */ + if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && + !bch2_snapshot_is_ancestor(trans->c, + iter->snapshot, + k.k->p.snapshot)) { + search_key = bpos_successor(k.k->p); + continue; + } + + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_successor(iter, k.k->p); + continue; + } + + break; + } + + iter->pos = iter_pos; + + iter->path = bch2_btree_path_set_pos(trans, iter->path, k.k->p, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + btree_path_set_should_be_locked(iter->path); +out_no_locked: + if (iter->update_path) { + ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_); + if (unlikely(ret)) + k = bkey_s_c_err(ret); + else + btree_path_set_should_be_locked(iter->update_path); + } + + if (!(iter->flags & 
BTREE_ITER_ALL_SNAPSHOTS)) + iter->pos.snapshot = iter->snapshot; + + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) { + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + } + + bch2_btree_iter_verify_entry_exit(iter); + + return k; +end: + bch2_btree_iter_set_pos(iter, end); + k = bkey_s_c_null; + goto out_no_locked; +} + +/** + * bch2_btree_iter_peek_all_levels: returns the first key greater than or equal + * to iterator's current position, returning keys from every level of the btree. + * For keys at different levels of the btree that compare equal, the key from + * the lower level (leaf) is returned first. + */ +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bkey_s_c k; + int ret; + + EBUG_ON(iter->path->cached); + bch2_btree_iter_verify(iter); + BUG_ON(iter->path->level < iter->min_depth); + BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)); + EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS)); + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out_no_locked; + } + + /* Already at end? */ + if (!btree_path_node(iter->path, iter->path->level)) { + k = bkey_s_c_null; + goto out_no_locked; + } + + k = btree_path_level_peek_all(trans->c, + &iter->path->l[iter->path->level], &iter->k); + + /* Check if we should go up to the parent node: */ + if (!k.k || + (iter->advanced && + bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) { + iter->pos = path_l(iter->path)->b->key.k.p; + btree_path_set_level_up(trans, iter->path); + iter->advanced = false; + continue; + } + + /* + * Check if we should go back down to a leaf: + * If we're not in a leaf node, we only return the current key + * if it exactly matches iter->pos - otherwise we first have to + * go back to the leaf: + */ + if (iter->path->level != iter->min_depth && + (iter->advanced || + !k.k || + !bpos_eq(iter->pos, k.k->p))) { + btree_path_set_level_down(trans, iter->path, iter->min_depth); + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + /* Check if we should go to the next key: */ + if (iter->path->level == iter->min_depth && + iter->advanced && + k.k && + bpos_eq(iter->pos, k.k->p)) { + iter->pos = bpos_successor(iter->pos); + iter->advanced = false; + continue; + } + + if (iter->advanced && + iter->path->level == iter->min_depth && + !bpos_eq(k.k->p, iter->pos)) + iter->advanced = false; + + BUG_ON(iter->advanced); + BUG_ON(!k.k); + break; + } + + iter->pos = k.k->p; + btree_path_set_should_be_locked(iter->path); +out_no_locked: + bch2_btree_iter_verify(iter); + + return k; +} + +/** + * bch2_btree_iter_next: returns first key greater than iterator's current + * position + */ +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) +{ + if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek(iter); +} + +/** + * bch2_btree_iter_peek_prev: returns first key less than or equal to + * iterator's current position + */ +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key = iter->pos; + struct btree_path *saved_path = NULL; + struct bkey_s_c k; 
+ struct bkey saved_k; + const struct bch_val *saved_v; + int ret; + + EBUG_ON(iter->path->cached || iter->path->level); + EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES); + + if (iter->flags & BTREE_ITER_WITH_JOURNAL) + return bkey_s_c_err(-EIO); + + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + + while (1) { + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + /* ensure that iter->k is consistent with iter->pos: */ + bch2_btree_iter_set_pos(iter, iter->pos); + k = bkey_s_c_err(ret); + goto out_no_locked; + } + + k = btree_path_level_peek(trans, iter->path, + &iter->path->l[0], &iter->k); + if (!k.k || + ((iter->flags & BTREE_ITER_IS_EXTENTS) + ? bpos_ge(bkey_start_pos(k.k), search_key) + : bpos_gt(k.k->p, search_key))) + k = btree_path_level_prev(trans, iter->path, + &iter->path->l[0], &iter->k); + + if (likely(k.k)) { + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) { + if (k.k->p.snapshot == iter->snapshot) + goto got_key; + + /* + * If we have a saved candidate, and we're no + * longer at the same _key_ (not pos), return + * that candidate + */ + if (saved_path && !bkey_eq(k.k->p, saved_k.p)) { + bch2_path_put_nokeep(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + iter->path = saved_path; + saved_path = NULL; + iter->k = saved_k; + k.v = saved_v; + goto got_key; + } + + if (bch2_snapshot_is_ancestor(iter->trans->c, + iter->snapshot, + k.k->p.snapshot)) { + if (saved_path) + bch2_path_put_nokeep(trans, saved_path, + iter->flags & BTREE_ITER_INTENT); + saved_path = btree_path_clone(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + saved_k = *k.k; + saved_v = k.v; + } + + search_key = bpos_predecessor(k.k->p); + continue; + } +got_key: + if (bkey_whiteout(k.k) && + !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) { + search_key = bkey_predecessor(iter, k.k->p); + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + search_key.snapshot = U32_MAX; + continue; + } + + break; + } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) { + /* Advance to previous leaf node: */ + search_key = bpos_predecessor(iter->path->l[0].b->data->min_key); + } else { + /* Start of btree: */ + bch2_btree_iter_set_pos(iter, POS_MIN); + k = bkey_s_c_null; + goto out_no_locked; + } + } + + EBUG_ON(bkey_gt(bkey_start_pos(k.k), iter->pos)); + + /* Extents can straddle iter->pos: */ + if (bkey_lt(k.k->p, iter->pos)) + iter->pos = k.k->p; + + if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) + iter->pos.snapshot = iter->snapshot; + + btree_path_set_should_be_locked(iter->path); +out_no_locked: + if (saved_path) + bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT); + + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + + return k; +} + +/** + * bch2_btree_iter_prev: returns first key less than iterator's current + * position + */ +struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +{ + if (!bch2_btree_iter_rewind(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_prev(iter); +} + +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +{ + struct btree_trans *trans = iter->trans; + struct bpos search_key; + struct bkey_s_c k; + int ret; + + bch2_btree_iter_verify(iter); + bch2_btree_iter_verify_entry_exit(iter); + 
EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS); + EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE)); + + /* extents can't span inode numbers: */ + if ((iter->flags & BTREE_ITER_IS_EXTENTS) && + unlikely(iter->pos.offset == KEY_OFFSET_MAX)) { + if (iter->pos.inode == KEY_INODE_MAX) + return bkey_s_c_null; + + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + } + + search_key = btree_iter_search_key(iter); + iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, + iter->flags & BTREE_ITER_INTENT, + btree_iter_ip_allocated(iter)); + + ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); + if (unlikely(ret)) { + k = bkey_s_c_err(ret); + goto out_no_locked; + } + + if ((iter->flags & BTREE_ITER_CACHED) || + !(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) { + struct bkey_i *next_update; + + if ((next_update = btree_trans_peek_updates(iter)) && + bpos_eq(next_update->k.p, iter->pos)) { + iter->k = next_update->k; + k = bkey_i_to_s_c(next_update); + goto out; + } + + if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) && + (k = btree_trans_peek_slot_journal(trans, iter)).k) + goto out; + + if (unlikely(iter->flags & BTREE_ITER_WITH_KEY_CACHE) && + (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + if (!bkey_err(k)) + iter->k = *k.k; + /* We're not returning a key from iter->path: */ + goto out_no_locked; + } + + k = bch2_btree_path_peek_slot(iter->path, &iter->k); + if (unlikely(!k.k)) + goto out_no_locked; + } else { + struct bpos next; + struct bpos end = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + end.offset = U64_MAX; + + EBUG_ON(iter->path->level); + + if (iter->flags & BTREE_ITER_INTENT) { + struct btree_iter iter2; + + bch2_trans_copy_iter(&iter2, iter); + k = bch2_btree_iter_peek_upto(&iter2, end); + + if (k.k && !bkey_err(k)) { + iter->k = iter2.k; + k.k = &iter->k; + } + bch2_trans_iter_exit(trans, &iter2); + } else { + struct bpos pos = iter->pos; + + k = bch2_btree_iter_peek_upto(iter, end); + if (unlikely(bkey_err(k))) + bch2_btree_iter_set_pos(iter, pos); + else + iter->pos = pos; + } + + if (unlikely(bkey_err(k))) + goto out_no_locked; + + next = k.k ? bkey_start_pos(k.k) : POS_MAX; + + if (bkey_lt(iter->pos, next)) { + bkey_init(&iter->k); + iter->k.p = iter->pos; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) { + bch2_key_resize(&iter->k, + min_t(u64, KEY_SIZE_MAX, + (next.inode == iter->pos.inode + ? 
next.offset + : KEY_OFFSET_MAX) - + iter->pos.offset)); + EBUG_ON(!iter->k.size); + } + + k = (struct bkey_s_c) { &iter->k, NULL }; + } + } +out: + btree_path_set_should_be_locked(iter->path); +out_no_locked: + bch2_btree_iter_verify_entry_exit(iter); + bch2_btree_iter_verify(iter); + ret = bch2_btree_iter_verify_ret(iter, k); + if (unlikely(ret)) + return bkey_s_c_err(ret); + + return k; +} + +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) +{ + if (!bch2_btree_iter_advance(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +} + +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) +{ + if (!bch2_btree_iter_rewind(iter)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +} + +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(iter->trans) || + (k = bch2_btree_iter_peek_type(iter, iter->flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(iter->trans); + + return k; +} + +/* new transactional stuff: */ + +#ifdef CONFIG_BCACHEFS_DEBUG +static void btree_trans_verify_sorted_refs(struct btree_trans *trans) +{ + struct btree_path *path; + unsigned i; + + BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated)); + + trans_for_each_path(trans, path) { + BUG_ON(path->sorted_idx >= trans->nr_sorted); + BUG_ON(trans->sorted[path->sorted_idx] != path->idx); + } + + for (i = 0; i < trans->nr_sorted; i++) { + unsigned idx = trans->sorted[i]; + + EBUG_ON(!(trans->paths_allocated & (1ULL << idx))); + BUG_ON(trans->paths[idx].sorted_idx != i); + } +} + +static void btree_trans_verify_sorted(struct btree_trans *trans) +{ + struct btree_path *path, *prev = NULL; + unsigned i; + + if (!bch2_debug_check_iterators) + return; + + trans_for_each_path_inorder(trans, path, i) { + if (prev && btree_path_cmp(prev, path) > 0) { + __bch2_dump_trans_paths_updates(trans, true); + panic("trans paths out of order!\n"); + } + prev = path; + } +} +#else +static inline void btree_trans_verify_sorted_refs(struct btree_trans *trans) {} +static inline void btree_trans_verify_sorted(struct btree_trans *trans) {} +#endif + +void __bch2_btree_trans_sort_paths(struct btree_trans *trans) +{ + int i, l = 0, r = trans->nr_sorted, inc = 1; + bool swapped; + + btree_trans_verify_sorted_refs(trans); + + if (trans->paths_sorted) + goto out; + + /* + * Cocktail shaker sort: this is efficient because iterators will be + * mostly sorted. + */ + do { + swapped = false; + + for (i = inc > 0 ? 
l : r - 2; + i + 1 < r && i >= l; + i += inc) { + if (btree_path_cmp(trans->paths + trans->sorted[i], + trans->paths + trans->sorted[i + 1]) > 0) { + swap(trans->sorted[i], trans->sorted[i + 1]); + trans->paths[trans->sorted[i]].sorted_idx = i; + trans->paths[trans->sorted[i + 1]].sorted_idx = i + 1; + swapped = true; + } + } + + if (inc > 0) + --r; + else + l++; + inc = -inc; + } while (swapped); + + trans->paths_sorted = true; +out: + btree_trans_verify_sorted(trans); +} + +static inline void btree_path_list_remove(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned i; + + EBUG_ON(path->sorted_idx >= trans->nr_sorted); +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + trans->nr_sorted--; + memmove_u64s_down_small(trans->sorted + path->sorted_idx, + trans->sorted + path->sorted_idx + 1, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); +#else + array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx); +#endif + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; + + path->sorted_idx = U8_MAX; +} + +static inline void btree_path_list_add(struct btree_trans *trans, + struct btree_path *pos, + struct btree_path *path) +{ + unsigned i; + + path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted; + +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1, + trans->sorted + path->sorted_idx, + DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8)); + trans->nr_sorted++; + trans->sorted[path->sorted_idx] = path->idx; +#else + array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx); +#endif + + for (i = path->sorted_idx; i < trans->nr_sorted; i++) + trans->paths[trans->sorted[i]].sorted_idx = i; + + btree_trans_verify_sorted_refs(trans); +} + +void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) +{ + if (iter->update_path) + bch2_path_put_nokeep(trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + if (iter->path) + bch2_path_put(trans, iter->path, + iter->flags & BTREE_ITER_INTENT); + if (iter->key_cache_path) + bch2_path_put(trans, iter->key_cache_path, + iter->flags & BTREE_ITER_INTENT); + iter->path = NULL; + iter->update_path = NULL; + iter->key_cache_path = NULL; +} + +static inline void bch2_trans_iter_init_inlined(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); +} + +void bch2_trans_iter_init_outlined(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + bch2_trans_iter_init_common(trans, iter, btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); +} + +void bch2_trans_node_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + enum btree_id btree_id, + struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags) +{ + flags |= BTREE_ITER_NOT_EXTENTS; + flags |= __BTREE_ITER_ALL_SNAPSHOTS; + flags |= BTREE_ITER_ALL_SNAPSHOTS; + + bch2_trans_iter_init_common(trans, iter, btree_id, pos, locks_want, depth, + __bch2_btree_iter_flags(trans, btree_id, flags), + _RET_IP_); + + iter->min_depth = depth; + + BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH)); + BUG_ON(iter->path->level != depth); + BUG_ON(iter->min_depth != depth); +} + +void bch2_trans_copy_iter(struct btree_iter 
*dst, struct btree_iter *src)
+{
+	*dst = *src;
+	if (src->path)
+		__btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+	if (src->update_path)
+		__btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
+	dst->key_cache_path = NULL;
+}
+
+void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
+{
+	unsigned new_top = trans->mem_top + size;
+	size_t old_bytes = trans->mem_bytes;
+	size_t new_bytes = roundup_pow_of_two(new_top);
+	int ret;
+	void *new_mem;
+	void *p;
+
+	trans->mem_max = max(trans->mem_max, new_top);
+
+	WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+
+	new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
+	if (unlikely(!new_mem)) {
+		bch2_trans_unlock(trans);
+
+		new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
+		if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
+			new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+			new_bytes = BTREE_TRANS_MEM_MAX;
+			kfree(trans->mem);
+		}
+
+		if (!new_mem)
+			return ERR_PTR(-BCH_ERR_ENOMEM_trans_kmalloc);
+
+		trans->mem = new_mem;
+		trans->mem_bytes = new_bytes;
+
+		ret = bch2_trans_relock(trans);
+		if (ret)
+			return ERR_PTR(ret);
+	}
+
+	trans->mem = new_mem;
+	trans->mem_bytes = new_bytes;
+
+	if (old_bytes) {
+		trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+		return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
+	}
+
+	p = trans->mem + trans->mem_top;
+	trans->mem_top += size;
+	memset(p, 0, size);
+	return p;
+}
+
+static noinline void bch2_trans_reset_srcu_lock(struct btree_trans *trans)
+{
+	struct bch_fs *c = trans->c;
+	struct btree_path *path;
+
+	trans_for_each_path(trans, path)
+		if (path->cached && !btree_node_locked(path, 0))
+			path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
+
+	srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
+	trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+	trans->srcu_lock_time = jiffies;
+}
+
+/**
+ * bch2_trans_begin() - reset a transaction after an interrupted attempt
+ * @trans: transaction to reset
+ *
+ * While iterating over nodes or updating nodes, an attempt to lock a btree
+ * node may return BCH_ERR_transaction_restart when the trylock fails. When
+ * this occurs, bch2_trans_begin() should be called and the transaction
+ * retried.
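+ *
+ * An illustrative retry loop (a sketch only - most callers use helpers such
+ * as lockrestart_do()/commit_do() rather than open coding this, and do_work()
+ * is a stand-in for whatever transactional operation is being retried):
+ *
+ *	do {
+ *		bch2_trans_begin(trans);
+ *		ret = do_work(trans);
+ *	} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));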
+ */
+u32 bch2_trans_begin(struct btree_trans *trans)
+{
+	struct btree_path *path;
+	u64 now;
+
+	bch2_trans_reset_updates(trans);
+
+	trans->restart_count++;
+	trans->mem_top = 0;
+
+	trans_for_each_path(trans, path) {
+		path->should_be_locked = false;
+
+		/*
+		 * If the transaction wasn't restarted, we're presuming to be
+		 * doing something new: don't keep iterators except the ones
+		 * that are in use - except for the subvolumes btree:
+		 */
+		if (!trans->restarted && path->btree_id != BTREE_ID_subvolumes)
+			path->preserve = false;
+
+		/*
+		 * XXX: we probably shouldn't be doing this if the transaction
+		 * was restarted, but currently we still overflow transaction
+		 * iterators if we do that
+		 */
+		if (!path->ref && !path->preserve)
+			__bch2_path_free(trans, path);
+		else
+			path->preserve = false;
+	}
+
+	now = local_clock();
+	if (!trans->restarted &&
+	    (need_resched() ||
+	     now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+		drop_locks_do(trans, (cond_resched(), 0));
+		now = local_clock();
+	}
+	trans->last_begin_time = now;
+
+	if (unlikely(time_after(jiffies, trans->srcu_lock_time + msecs_to_jiffies(10))))
+		bch2_trans_reset_srcu_lock(trans);
+
+	trans->last_begin_ip = _RET_IP_;
+	if (trans->restarted) {
+		bch2_btree_path_traverse_all(trans);
+		trans->notrace_relock_fail = false;
+	}
+
+	return trans->restart_count;
+}
+
+static void bch2_trans_alloc_paths(struct btree_trans *trans, struct bch_fs *c)
+{
+	size_t paths_bytes	= sizeof(struct btree_path) * BTREE_ITER_MAX;
+	size_t updates_bytes	= sizeof(struct btree_insert_entry) * BTREE_ITER_MAX;
+	void *p = NULL;
+
+	BUG_ON(trans->used_mempool);
+
+#ifdef __KERNEL__
+	p = this_cpu_xchg(c->btree_paths_bufs->path, NULL);
+#endif
+	if (!p)
+		p = mempool_alloc(&trans->c->btree_paths_pool, GFP_NOFS);
+	/*
+	 * paths need to be zeroed - bch2_check_for_deadlock looks at paths in
+	 * other threads
+	 */
+
+	trans->paths	= p; p += paths_bytes;
+	trans->updates	= p; p += updates_bytes;
+}
+
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+
+unsigned bch2_trans_get_fn_idx(const char *fn)
+{
+	unsigned i;
+
+	for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+		if (!bch2_btree_transaction_fns[i] ||
+		    bch2_btree_transaction_fns[i] == fn) {
+			bch2_btree_transaction_fns[i] = fn;
+			return i;
+		}
+
+	pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
+	return i;
+}
+
+void __bch2_trans_init(struct btree_trans *trans, struct bch_fs *c, unsigned fn_idx)
+	__acquires(&c->btree_trans_barrier)
+{
+	struct btree_transaction_stats *s;
+
+	bch2_assert_btree_nodes_not_locked();
+
+	memset(trans, 0, sizeof(*trans));
+	trans->c		= c;
+	trans->fn		= fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
+		?
bch2_btree_transaction_fns[fn_idx] : NULL; + trans->last_begin_time = local_clock(); + trans->fn_idx = fn_idx; + trans->locking_wait.task = current; + trans->journal_replay_not_finished = + !test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags); + closure_init_stack(&trans->ref); + + bch2_trans_alloc_paths(trans, c); + + s = btree_trans_stats(trans); + if (s && s->max_mem) { + unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem); + + trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL); + + if (!unlikely(trans->mem)) { + trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL); + trans->mem_bytes = BTREE_TRANS_MEM_MAX; + } else { + trans->mem_bytes = expected_mem_bytes; + } + } + + if (s) { + trans->nr_max_paths = s->nr_max_paths; + trans->wb_updates_size = s->wb_updates_size; + } + + trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + trans->srcu_lock_time = jiffies; + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + struct btree_trans *pos; + + seqmutex_lock(&c->btree_trans_lock); + list_for_each_entry(pos, &c->btree_trans_list, list) { + /* + * We'd much prefer to be stricter here and completely + * disallow multiple btree_trans in the same thread - + * but the data move path calls bch2_write when we + * already have a btree_trans initialized. + */ + BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid && + bch2_trans_locked(pos)); + + if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) { + list_add_tail(&trans->list, &pos->list); + goto list_add_done; + } + } + list_add_tail(&trans->list, &c->btree_trans_list); +list_add_done: + seqmutex_unlock(&c->btree_trans_lock); + } +} + +static void check_btree_paths_leaked(struct btree_trans *trans) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->ref) + goto leaked; + return; +leaked: + bch_err(c, "btree paths leaked from %s!", trans->fn); + trans_for_each_path(trans, path) + if (path->ref) + printk(KERN_ERR " btree %s %pS\n", + bch2_btree_ids[path->btree_id], + (void *) path->ip_allocated); + /* Be noisy about this: */ + bch2_fatal_error(c); +#endif +} + +void bch2_trans_exit(struct btree_trans *trans) + __releases(&c->btree_trans_barrier) +{ + struct btree_insert_entry *i; + struct bch_fs *c = trans->c; + struct btree_transaction_stats *s = btree_trans_stats(trans); + + bch2_trans_unlock(trans); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) { + seqmutex_lock(&c->btree_trans_lock); + list_del(&trans->list); + seqmutex_unlock(&c->btree_trans_lock); + } + + closure_sync(&trans->ref); + + if (s) + s->max_mem = max(s->max_mem, trans->mem_max); + + trans_for_each_update(trans, i) + __btree_path_put(i->path, true); + trans->nr_updates = 0; + + check_btree_paths_leaked(trans); + + srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx); + + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + kfree(trans->extra_journal_entries.data); + + if (trans->fs_usage_deltas) { + if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) == + REPLICAS_DELTA_LIST_MAX) + mempool_free(trans->fs_usage_deltas, + &c->replicas_delta_pool); + else + kfree(trans->fs_usage_deltas); + } + + if (trans->mem_bytes == BTREE_TRANS_MEM_MAX) + mempool_free(trans->mem, &c->btree_trans_mem_pool); + else + kfree(trans->mem); + +#ifdef __KERNEL__ + /* + * Userspace doesn't have a real percpu implementation: + */ + trans->paths = this_cpu_xchg(c->btree_paths_bufs->path, trans->paths); +#endif + + if 
(trans->paths) + mempool_free(trans->paths, &c->btree_paths_pool); + + trans->mem = (void *) 0x1; + trans->paths = (void *) 0x1; +} + +static void __maybe_unused +bch2_btree_bkey_cached_common_to_text(struct printbuf *out, + struct btree_bkey_cached_common *b) +{ + struct six_lock_count c = six_lock_counts(&b->lock); + struct task_struct *owner; + pid_t pid; + + rcu_read_lock(); + owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0; + rcu_read_unlock(); + + prt_tab(out); + prt_printf(out, "%px %c l=%u %s:", b, b->cached ? 'c' : 'b', + b->level, bch2_btree_ids[b->btree_id]); + bch2_bpos_to_text(out, btree_node_pos(b)); + + prt_tab(out); + prt_printf(out, " locks %u:%u:%u held by pid %u", + c.n[0], c.n[1], c.n[2], pid); +} + +void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) +{ + struct btree_path *path; + struct btree_bkey_cached_common *b; + static char lock_types[] = { 'r', 'i', 'w' }; + unsigned l, idx; + + if (!out->nr_tabstops) { + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 32); + } + + prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn); + + trans_for_each_path_safe(trans, path, idx) { + if (!path->nodes_locked) + continue; + + prt_printf(out, " path %u %c l=%u %s:", + path->idx, + path->cached ? 'c' : 'b', + path->level, + bch2_btree_ids[path->btree_id]); + bch2_bpos_to_text(out, path->pos); + prt_newline(out); + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + if (btree_node_locked(path, l) && + !IS_ERR_OR_NULL(b = (void *) READ_ONCE(path->l[l].b))) { + prt_printf(out, " %c l=%u ", + lock_types[btree_node_locked_type(path, l)], l); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); + } + } + } + + b = READ_ONCE(trans->locking); + if (b) { + prt_printf(out, " blocked for %lluus on", + div_u64(local_clock() - trans->locking_wait.start_time, + 1000)); + prt_newline(out); + prt_printf(out, " %c", lock_types[trans->locking_wait.lock_want]); + bch2_btree_bkey_cached_common_to_text(out, b); + prt_newline(out); + } +} + +void bch2_fs_btree_iter_exit(struct bch_fs *c) +{ + struct btree_transaction_stats *s; + + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) { + kfree(s->max_paths_text); + bch2_time_stats_exit(&s->lock_hold_times); + } + + if (c->btree_trans_barrier_initialized) + cleanup_srcu_struct(&c->btree_trans_barrier); + mempool_exit(&c->btree_trans_mem_pool); + mempool_exit(&c->btree_paths_pool); +} + +int bch2_fs_btree_iter_init(struct bch_fs *c) +{ + struct btree_transaction_stats *s; + unsigned nr = BTREE_ITER_MAX; + int ret; + + for (s = c->btree_transaction_stats; + s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); + s++) { + bch2_time_stats_init(&s->lock_hold_times); + mutex_init(&s->lock); + } + + INIT_LIST_HEAD(&c->btree_trans_list); + seqmutex_init(&c->btree_trans_lock); + + ret = mempool_init_kmalloc_pool(&c->btree_paths_pool, 1, + sizeof(struct btree_path) * nr + + sizeof(struct btree_insert_entry) * nr) ?: + mempool_init_kmalloc_pool(&c->btree_trans_mem_pool, 1, + BTREE_TRANS_MEM_MAX) ?: + init_srcu_struct(&c->btree_trans_barrier); + if (!ret) + c->btree_trans_barrier_initialized = true; + return ret; +} diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h new file mode 100644 index 000000000..c472aa8c5 --- /dev/null +++ b/fs/bcachefs/btree_iter.h @@ -0,0 +1,924 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_ITER_H +#define _BCACHEFS_BTREE_ITER_H + +#include 
"bset.h" +#include "btree_types.h" +#include "trace.h" + +static inline int __bkey_err(const struct bkey *k) +{ + return PTR_ERR_OR_ZERO(k); +} + +#define bkey_err(_k) __bkey_err((_k).k) + +static inline void __btree_path_get(struct btree_path *path, bool intent) +{ + path->ref++; + path->intent_ref += intent; +} + +static inline bool __btree_path_put(struct btree_path *path, bool intent) +{ + EBUG_ON(!path->ref); + EBUG_ON(!path->intent_ref && intent); + path->intent_ref -= intent; + return --path->ref == 0; +} + +static inline void btree_path_set_dirty(struct btree_path *path, + enum btree_path_uptodate u) +{ + path->uptodate = max_t(unsigned, path->uptodate, u); +} + +static inline struct btree *btree_path_node(struct btree_path *path, + unsigned level) +{ + return level < BTREE_MAX_DEPTH ? path->l[level].b : NULL; +} + +static inline bool btree_node_lock_seq_matches(const struct btree_path *path, + const struct btree *b, unsigned level) +{ + return path->l[level].lock_seq == six_lock_seq(&b->c.lock); +} + +static inline struct btree *btree_node_parent(struct btree_path *path, + struct btree *b) +{ + return btree_path_node(path, b->c.level + 1); +} + +/* Iterate over paths within a transaction: */ + +void __bch2_btree_trans_sort_paths(struct btree_trans *); + +static inline void btree_trans_sort_paths(struct btree_trans *trans) +{ + if (!IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + trans->paths_sorted) + return; + __bch2_btree_trans_sort_paths(trans); +} + +static inline struct btree_path * +__trans_next_path(struct btree_trans *trans, unsigned idx) +{ + u64 l; + + if (idx == BTREE_ITER_MAX) + return NULL; + + l = trans->paths_allocated >> idx; + if (!l) + return NULL; + + idx += __ffs64(l); + EBUG_ON(idx >= BTREE_ITER_MAX); + EBUG_ON(trans->paths[idx].idx != idx); + return &trans->paths[idx]; +} + +#define trans_for_each_path_from(_trans, _path, _start) \ + for (_path = __trans_next_path((_trans), _start); \ + (_path); \ + _path = __trans_next_path((_trans), (_path)->idx + 1)) + +#define trans_for_each_path(_trans, _path) \ + trans_for_each_path_from(_trans, _path, 0) + +static inline struct btree_path * +__trans_next_path_safe(struct btree_trans *trans, unsigned *idx) +{ + u64 l; + + if (*idx == BTREE_ITER_MAX) + return NULL; + + l = trans->paths_allocated >> *idx; + if (!l) + return NULL; + + *idx += __ffs64(l); + EBUG_ON(*idx >= BTREE_ITER_MAX); + return &trans->paths[*idx]; +} + +/* + * This version is intended to be safe for use on a btree_trans that is owned by + * another thread, for bch2_btree_trans_to_text(); + */ +#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \ + for (_idx = _start; \ + (_path = __trans_next_path_safe((_trans), &_idx)); \ + _idx++) + +#define trans_for_each_path_safe(_trans, _path, _idx) \ + trans_for_each_path_safe_from(_trans, _path, _idx, 0) + +static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path) +{ + unsigned idx = path ? path->sorted_idx + 1 : 0; + + EBUG_ON(idx > trans->nr_sorted); + + return idx < trans->nr_sorted + ? trans->paths + trans->sorted[idx] + : NULL; +} + +static inline struct btree_path *prev_btree_path(struct btree_trans *trans, struct btree_path *path) +{ + unsigned idx = path ? path->sorted_idx : trans->nr_sorted; + + return idx + ? 
trans->paths + trans->sorted[idx - 1] + : NULL; +} + +#define trans_for_each_path_inorder(_trans, _path, _i) \ + for (_i = 0; \ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\ + _i++) + +#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \ + for (_i = trans->nr_sorted - 1; \ + ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) >= 0;\ + --_i) + +static inline bool __path_has_node(const struct btree_path *path, + const struct btree *b) +{ + return path->l[b->c.level].b == b && + btree_node_lock_seq_matches(path, b, b->c.level); +} + +static inline struct btree_path * +__trans_next_path_with_node(struct btree_trans *trans, struct btree *b, + unsigned idx) +{ + struct btree_path *path = __trans_next_path(trans, idx); + + while (path && !__path_has_node(path, b)) + path = __trans_next_path(trans, path->idx + 1); + + return path; +} + +#define trans_for_each_path_with_node(_trans, _b, _path) \ + for (_path = __trans_next_path_with_node((_trans), (_b), 0); \ + (_path); \ + _path = __trans_next_path_with_node((_trans), (_b), \ + (_path)->idx + 1)) + +struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *, + bool, unsigned long); + +static inline struct btree_path * __must_check +bch2_btree_path_make_mut(struct btree_trans *trans, + struct btree_path *path, bool intent, + unsigned long ip) +{ + if (path->ref > 1 || path->preserve) + path = __bch2_btree_path_make_mut(trans, path, intent, ip); + path->should_be_locked = false; + return path; +} + +struct btree_path * __must_check +__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *, + struct bpos, bool, unsigned long, int); + +static inline struct btree_path * __must_check +bch2_btree_path_set_pos(struct btree_trans *trans, + struct btree_path *path, struct bpos new_pos, + bool intent, unsigned long ip) +{ + int cmp = bpos_cmp(new_pos, path->pos); + + return cmp + ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp) + : path; +} + +int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *, + unsigned, unsigned long); + +static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans, + struct btree_path *path, unsigned flags) +{ + if (path->uptodate < BTREE_ITER_NEED_RELOCK) + return 0; + + return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_); +} + +int __must_check bch2_btree_path_traverse(struct btree_trans *, + struct btree_path *, unsigned); +struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos, + unsigned, unsigned, unsigned, unsigned long); +struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *); + +struct bkey_i *bch2_btree_journal_peek_slot(struct btree_trans *, + struct btree_iter *, struct bpos); + +void bch2_btree_path_level_init(struct btree_trans *, struct btree_path *, struct btree *); + +int __bch2_trans_mutex_lock(struct btree_trans *, struct mutex *); + +static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex *lock) +{ + return mutex_trylock(lock) + ? 
0 + : __bch2_trans_mutex_lock(trans, lock); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_trans_verify_paths(struct btree_trans *); +void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, + struct bpos, bool); +#else +static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, + struct bpos pos, bool key_cache) {} +#endif + +void bch2_btree_path_fix_key_modified(struct btree_trans *trans, + struct btree *, struct bkey_packed *); +void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_packed *, unsigned, unsigned); + +int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *); + +void bch2_path_put(struct btree_trans *, struct btree_path *, bool); + +int bch2_trans_relock(struct btree_trans *); +int bch2_trans_relock_notrace(struct btree_trans *); +void bch2_trans_unlock(struct btree_trans *); +bool bch2_trans_locked(struct btree_trans *); + +static inline bool trans_was_restarted(struct btree_trans *trans, u32 restart_count) +{ + return restart_count != trans->restart_count; +} + +void bch2_trans_restart_error(struct btree_trans *, u32); + +static inline void bch2_trans_verify_not_restarted(struct btree_trans *trans, + u32 restart_count) +{ + if (trans_was_restarted(trans, restart_count)) + bch2_trans_restart_error(trans, restart_count); +} + +void bch2_trans_in_restart_error(struct btree_trans *); + +static inline void bch2_trans_verify_not_in_restart(struct btree_trans *trans) +{ + if (trans->restarted) + bch2_trans_in_restart_error(trans); +} + +__always_inline +static int btree_trans_restart_nounlock(struct btree_trans *trans, int err) +{ + BUG_ON(err <= 0); + BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); + + trans->restarted = err; + trans->last_restarted_ip = _THIS_IP_; + return -err; +} + +__always_inline +static int btree_trans_restart(struct btree_trans *trans, int err) +{ + btree_trans_restart_nounlock(trans, err); + return -err; +} + +bool bch2_btree_node_upgrade(struct btree_trans *, + struct btree_path *, unsigned); + +void __bch2_btree_path_downgrade(struct btree_trans *, struct btree_path *, unsigned); + +static inline void bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned new_locks_want = path->level + !!path->intent_ref; + + if (path->locks_want > new_locks_want) + __bch2_btree_path_downgrade(trans, path, new_locks_want); +} + +void bch2_trans_downgrade(struct btree_trans *); + +void bch2_trans_node_add(struct btree_trans *trans, struct btree *); +void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); + +int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); +int __must_check bch2_btree_iter_traverse(struct btree_iter *); + +struct btree *bch2_btree_iter_peek_node(struct btree_iter *); +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *); + +static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +{ + return bch2_btree_iter_peek_upto(iter, SPOS_MAX); +} + +struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct 
btree_iter *); + +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); + +bool bch2_btree_iter_advance(struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_iter *); + +static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + iter->k.type = KEY_TYPE_deleted; + iter->k.p.inode = iter->pos.inode = new_pos.inode; + iter->k.p.offset = iter->pos.offset = new_pos.offset; + iter->k.p.snapshot = iter->pos.snapshot = new_pos.snapshot; + iter->k.size = 0; +} + +static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +{ + if (unlikely(iter->update_path)) + bch2_path_put(iter->trans, iter->update_path, + iter->flags & BTREE_ITER_INTENT); + iter->update_path = NULL; + + if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS)) + new_pos.snapshot = iter->snapshot; + + __bch2_btree_iter_set_pos(iter, new_pos); +} + +static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *iter) +{ + BUG_ON(!(iter->flags & BTREE_ITER_IS_EXTENTS)); + iter->pos = bkey_start_pos(&iter->k); +} + +static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) +{ + struct bpos pos = iter->pos; + + iter->snapshot = snapshot; + pos.snapshot = snapshot; + bch2_btree_iter_set_pos(iter, pos); +} + +void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); + +static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned flags) +{ + if (flags & BTREE_ITER_ALL_LEVELS) + flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) && + btree_node_type_is_extents(btree_id)) + flags |= BTREE_ITER_IS_EXTENTS; + + if (!(flags & __BTREE_ITER_ALL_SNAPSHOTS) && + !btree_type_has_snapshots(btree_id)) + flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + + if (!(flags & BTREE_ITER_ALL_SNAPSHOTS) && + btree_type_has_snapshots(btree_id)) + flags |= BTREE_ITER_FILTER_SNAPSHOTS; + + if (trans->journal_replay_not_finished) + flags |= BTREE_ITER_WITH_JOURNAL; + + return flags; +} + +static inline unsigned bch2_btree_iter_flags(struct btree_trans *trans, + unsigned btree_id, + unsigned flags) +{ + if (!btree_id_cached(trans->c, btree_id)) { + flags &= ~BTREE_ITER_CACHED; + flags &= ~BTREE_ITER_WITH_KEY_CACHE; + } else if (!(flags & BTREE_ITER_CACHED)) + flags |= BTREE_ITER_WITH_KEY_CACHE; + + return __bch2_btree_iter_flags(trans, btree_id, flags); +} + +static inline void bch2_trans_iter_init_common(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned locks_want, + unsigned depth, + unsigned flags, + unsigned long ip) +{ + memset(iter, 0, sizeof(*iter)); + iter->trans = trans; + iter->btree_id = btree_id; + iter->flags = flags; + iter->snapshot = pos.snapshot; + iter->pos = pos; + iter->k.p = pos; + +#ifdef CONFIG_BCACHEFS_DEBUG + iter->ip_allocated = ip; +#endif + iter->path = bch2_path_get(trans, btree_id, iter->pos, + locks_want, depth, flags, ip); +} + +void bch2_trans_iter_init_outlined(struct btree_trans *, struct btree_iter *, + unsigned, struct bpos, unsigned); + +static inline void bch2_trans_iter_init(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + if (__builtin_constant_p(btree_id) && + __builtin_constant_p(flags)) + bch2_trans_iter_init_common(trans, iter, 
btree_id, pos, 0, 0, + bch2_btree_iter_flags(trans, btree_id, flags), + _THIS_IP_); + else + bch2_trans_iter_init_outlined(trans, iter, btree_id, pos, flags); +} + +void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos, + unsigned, unsigned, unsigned); +void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); + +static inline void set_btree_iter_dontneed(struct btree_iter *iter) +{ + if (!iter->trans->restarted) + iter->path->preserve = false; +} + +void *__bch2_trans_kmalloc(struct btree_trans *, size_t); + +static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + size = roundup(size, 8); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + + trans->mem_top += size; + memset(p, 0, size); + return p; + } else { + return __bch2_trans_kmalloc(trans, size); + } +} + +static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + size = roundup(size, 8); + + if (likely(trans->mem_top + size <= trans->mem_bytes)) { + void *p = trans->mem + trans->mem_top; + + trans->mem_top += size; + return p; + } else { + return __bch2_trans_kmalloc(trans, size); + } +} + +static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type) +{ + struct bkey_s_c k; + + bch2_trans_iter_init(trans, iter, btree_id, pos, flags); + k = bch2_btree_iter_peek_slot(iter); + + if (!bkey_err(k) && type && k.k->type != type) + k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); + if (unlikely(bkey_err(k))) + bch2_trans_iter_exit(trans, iter); + return k; +} + +static inline struct bkey_s_c bch2_bkey_get_iter(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_iter(trans, iter, btree_id, pos, flags, 0); +} + +#define bch2_bkey_get_iter_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ + bkey_s_c_to_##_type(__bch2_bkey_get_iter(_trans, _iter, \ + _btree_id, _pos, _flags, KEY_TYPE_##_type)) + +static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, + unsigned val_size, void *val) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = __bch2_bkey_get_iter(trans, &iter, btree_id, pos, flags, type); + ret = bkey_err(k); + if (!ret) { + unsigned b = min_t(unsigned, bkey_val_bytes(k.k), val_size); + + memcpy(val, k.v, b); + if (unlikely(b < sizeof(*val))) + memset((void *) val + b, 0, sizeof(*val) - b); + bch2_trans_iter_exit(trans, &iter); + } + + return ret; +} + +#define bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, _type, _val)\ + __bch2_bkey_get_val_typed(_trans, _btree_id, _pos, _flags, \ + KEY_TYPE_##_type, sizeof(*_val), _val) + +u32 bch2_trans_begin(struct btree_trans *); + +/* + * XXX + * this does not handle transaction restarts from bch2_btree_iter_next_node() + * correctly + */ +#define __for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _locks_want, _depth, _flags, _b, _ret) \ + for (bch2_trans_node_iter_init((_trans), &(_iter), (_btree_id), \ + _start, _locks_want, _depth, _flags); \ + (_b) = bch2_btree_iter_peek_node_and_restart(&(_iter)), \ + !((_ret) = PTR_ERR_OR_ZERO(_b)) && (_b); \ + (_b) = bch2_btree_iter_next_node(&(_iter))) + +#define for_each_btree_node(_trans, _iter, _btree_id, _start, \ + _flags, _b, _ret) \ + 
__for_each_btree_node(_trans, _iter, _btree_id, _start, \ + 0, 0, _flags, _b, _ret) + +static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, + unsigned flags) +{ + BUG_ON(flags & BTREE_ITER_ALL_LEVELS); + + return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek_prev(iter); +} + +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, + unsigned flags) +{ + return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) : + flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) : + bch2_btree_iter_peek(iter); +} + +static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *iter, + struct bpos end, + unsigned flags) +{ + if (!(flags & BTREE_ITER_SLOTS)) + return bch2_btree_iter_peek_upto(iter, end); + + if (bkey_gt(iter->pos, end)) + return bkey_s_c_null; + + return bch2_btree_iter_peek_slot(iter); +} + +static inline int btree_trans_too_many_iters(struct btree_trans *trans) +{ + if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) { + trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters); + } + + return 0; +} + +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); + +static inline struct bkey_s_c +__bch2_btree_iter_peek_and_restart(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(iter, flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + +static inline struct bkey_s_c +__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + unsigned flags) +{ + struct bkey_s_c k; + + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_upto_type(iter, end, flags), + bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) + bch2_trans_begin(trans); + + return k; +} + +#define lockrestart_do(_trans, _do) \ +({ \ + u32 _restart_count; \ + int _ret; \ + \ + do { \ + _restart_count = bch2_trans_begin(_trans); \ + _ret = (_do); \ + } while (bch2_err_matches(_ret, BCH_ERR_transaction_restart)); \ + \ + if (!_ret) \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + \ + _ret; \ +}) + +/* + * nested_lockrestart_do(), nested_commit_do(): + * + * These are like lockrestart_do() and commit_do(), with two differences: + * + * - We don't call bch2_trans_begin() unless we had a transaction restart + * - We return -BCH_ERR_transaction_restart_nested if we succeeded after a + * transaction restart + */ +#define nested_lockrestart_do(_trans, _do) \ +({ \ + u32 _restart_count, _orig_restart_count; \ + int _ret; \ + \ + _restart_count = _orig_restart_count = (_trans)->restart_count; \ + \ + while (bch2_err_matches(_ret = (_do), BCH_ERR_transaction_restart))\ + _restart_count = bch2_trans_begin(_trans); \ + \ + if (!_ret) \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + \ + if (!_ret && trans_was_restarted(_trans, _orig_restart_count)) \ + _ret = -BCH_ERR_transaction_restart_nested; \ + \ + _ret; \ +}) + +#define for_each_btree_key2(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +({ \ + int _ret = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + \ + 
_ret = 0; \ + (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \ + if (!(_k).k) \ + break; \ + \ + _ret = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_advance(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret; \ +}) + +#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _do) \ +({ \ + int _ret = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + \ + _ret = 0; \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\ + if (!(_k).k) \ + break; \ + \ + _ret = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_advance(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret; \ +}) + +#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \ + _start, _flags, _k, _do) \ +({ \ + int _ret = 0; \ + \ + bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + \ + while (1) { \ + u32 _restart_count = bch2_trans_begin(_trans); \ + (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\ + if (!(_k).k) { \ + _ret = 0; \ + break; \ + } \ + \ + _ret = bkey_err(_k) ?: (_do); \ + if (bch2_err_matches(_ret, BCH_ERR_transaction_restart))\ + continue; \ + if (_ret) \ + break; \ + bch2_trans_verify_not_restarted(_trans, _restart_count);\ + if (!bch2_btree_iter_rewind(&(_iter))) \ + break; \ + } \ + \ + bch2_trans_iter_exit((_trans), &(_iter)); \ + _ret; \ +}) + +#define for_each_btree_key_commit(_trans, _iter, _btree_id, \ + _start, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + +#define for_each_btree_key_reverse_commit(_trans, _iter, _btree_id, \ + _start, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key_reverse(_trans, _iter, _btree_id, _start, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + +#define for_each_btree_key_upto_commit(_trans, _iter, _btree_id, \ + _start, _end, _iter_flags, _k, \ + _disk_res, _journal_seq, _commit_flags,\ + _do) \ + for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\ + (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_commit_flags))) + +#define for_each_btree_key(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_upto(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \ + &(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ + 
_start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, \ + _start, _end, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \ + for (; \ + (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ + for (; \ + (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\ + for (; \ + (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_advance(&(_iter))) + +#define drop_locks_do(_trans, _do) \ +({ \ + bch2_trans_unlock(_trans); \ + _do ?: bch2_trans_relock(_trans); \ +}) + +#define allocate_dropping_locks_errcode(_trans, _do) \ +({ \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + int _ret = _do; \ + \ + if (bch2_err_matches(_ret, ENOMEM)) { \ + _gfp = GFP_KERNEL; \ + _ret = drop_locks_do(trans, _do); \ + } \ + _ret; \ +}) + +#define allocate_dropping_locks(_trans, _ret, _do) \ +({ \ + gfp_t _gfp = GFP_NOWAIT|__GFP_NOWARN; \ + typeof(_do) _p = _do; \ + \ + _ret = 0; \ + if (unlikely(!_p)) { \ + _gfp = GFP_KERNEL; \ + _ret = drop_locks_do(trans, ((_p = _do), 0)); \ + } \ + _p; \ +}) + +/* new multiple iterator interface: */ + +void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); +void bch2_btree_path_to_text(struct printbuf *, struct btree_path *); +void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); +void bch2_dump_trans_updates(struct btree_trans *); +void bch2_dump_trans_paths_updates(struct btree_trans *); +void __bch2_trans_init(struct btree_trans *, struct bch_fs *, unsigned); +void bch2_trans_exit(struct btree_trans *); + +extern const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR]; +unsigned bch2_trans_get_fn_idx(const char *); + +#define bch2_trans_init(_trans, _c, _nr_iters, _mem) \ +do { \ + static unsigned trans_fn_idx; \ + \ + if (unlikely(!trans_fn_idx)) \ + trans_fn_idx = bch2_trans_get_fn_idx(__func__); \ + \ + __bch2_trans_init(_trans, _c, trans_fn_idx); \ +} while (0) + +void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); + +void bch2_fs_btree_iter_exit(struct bch_fs *); +int bch2_fs_btree_iter_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_ITER_H */ diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c new file mode 100644 index 000000000..f7c001d42 --- /dev/null +++ b/fs/bcachefs/btree_key_cache.c @@ -0,0 +1,1088 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_cache.h" +#include "btree_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "trace.h" + +#include +#include + +static inline bool 
btree_uses_pcpu_readers(enum btree_id id) +{ + return id == BTREE_ID_subvolumes; +} + +static struct kmem_cache *bch2_key_cache; + +static int bch2_btree_key_cache_cmp_fn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct bkey_cached *ck = obj; + const struct bkey_cached_key *key = arg->key; + + return ck->key.btree_id != key->btree_id || + !bpos_eq(ck->key.pos, key->pos); +} + +static const struct rhashtable_params bch2_btree_key_cache_params = { + .head_offset = offsetof(struct bkey_cached, hash), + .key_offset = offsetof(struct bkey_cached, key), + .key_len = sizeof(struct bkey_cached_key), + .obj_cmpfn = bch2_btree_key_cache_cmp_fn, +}; + +__flatten +inline struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *c, enum btree_id btree_id, struct bpos pos) +{ + struct bkey_cached_key key = { + .btree_id = btree_id, + .pos = pos, + }; + + return rhashtable_lookup_fast(&c->btree_key_cache.table, &key, + bch2_btree_key_cache_params); +} + +static bool bkey_cached_lock_for_evict(struct bkey_cached *ck) +{ + if (!six_trylock_intent(&ck->c.lock)) + return false; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + six_unlock_intent(&ck->c.lock); + return false; + } + + if (!six_trylock_write(&ck->c.lock)) { + six_unlock_intent(&ck->c.lock); + return false; + } + + return true; +} + +static void bkey_cached_evict(struct btree_key_cache *c, + struct bkey_cached *ck) +{ + BUG_ON(rhashtable_remove_fast(&c->table, &ck->hash, + bch2_btree_key_cache_params)); + memset(&ck->key, ~0, sizeof(ck->key)); + + atomic_long_dec(&c->nr_keys); +} + +static void bkey_cached_free(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + if (ck->c.lock.readers) + list_move_tail(&ck->list, &bc->freed_pcpu); + else + list_move_tail(&ck->list, &bc->freed_nonpcpu); + atomic_long_inc(&bc->nr_freed); + + kfree(ck->k); + ck->k = NULL; + ck->u64s = 0; + + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); +} + +#ifdef __KERNEL__ +static void __bkey_cached_move_to_freelist_ordered(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bkey_cached *pos; + + list_for_each_entry_reverse(pos, &bc->freed_nonpcpu, list) { + if (ULONG_CMP_GE(ck->btree_trans_barrier_seq, + pos->btree_trans_barrier_seq)) { + list_move(&ck->list, &pos->list); + return; + } + } + + list_move(&ck->list, &bc->freed_nonpcpu); +} +#endif + +static void bkey_cached_move_to_freelist(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + BUG_ON(test_bit(BKEY_CACHED_DIRTY, &ck->flags)); + + if (!ck->c.lock.readers) { +#ifdef __KERNEL__ + struct btree_key_cache_freelist *f; + bool freed = false; + + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + if (f->nr < ARRAY_SIZE(f->objs)) { + f->objs[f->nr++] = ck; + freed = true; + } + preempt_enable(); + + if (!freed) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (f->nr > ARRAY_SIZE(f->objs) / 2) { + struct bkey_cached *ck2 = f->objs[--f->nr]; + + __bkey_cached_move_to_freelist_ordered(bc, ck2); + } + preempt_enable(); + + __bkey_cached_move_to_freelist_ordered(bc, ck); + mutex_unlock(&bc->lock); + } +#else + mutex_lock(&bc->lock); + list_move_tail(&ck->list, &bc->freed_nonpcpu); + mutex_unlock(&bc->lock); +#endif + } else { + mutex_lock(&bc->lock); + list_move_tail(&ck->list, 
&bc->freed_pcpu); + mutex_unlock(&bc->lock); + } +} + +static void bkey_cached_free_fast(struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + + ck->btree_trans_barrier_seq = + start_poll_synchronize_srcu(&c->btree_trans_barrier); + + list_del_init(&ck->list); + atomic_long_inc(&bc->nr_freed); + + kfree(ck->k); + ck->k = NULL; + ck->u64s = 0; + + bkey_cached_move_to_freelist(bc, ck); + + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); +} + +static struct bkey_cached * +bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, + bool *was_new) +{ + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck = NULL; + bool pcpu_readers = btree_uses_pcpu_readers(path->btree_id); + int ret; + + if (!pcpu_readers) { +#ifdef __KERNEL__ + struct btree_key_cache_freelist *f; + + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + if (f->nr) + ck = f->objs[--f->nr]; + preempt_enable(); + + if (!ck) { + mutex_lock(&bc->lock); + preempt_disable(); + f = this_cpu_ptr(bc->pcpu_freed); + + while (!list_empty(&bc->freed_nonpcpu) && + f->nr < ARRAY_SIZE(f->objs) / 2) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + f->objs[f->nr++] = ck; + } + + ck = f->nr ? f->objs[--f->nr] : NULL; + preempt_enable(); + mutex_unlock(&bc->lock); + } +#else + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_nonpcpu)) { + ck = list_last_entry(&bc->freed_nonpcpu, struct bkey_cached, list); + list_del_init(&ck->list); + } + mutex_unlock(&bc->lock); +#endif + } else { + mutex_lock(&bc->lock); + if (!list_empty(&bc->freed_pcpu)) { + ck = list_last_entry(&bc->freed_pcpu, struct bkey_cached, list); + list_del_init(&ck->list); + } + mutex_unlock(&bc->lock); + } + + if (ck) { + int ret; + + ret = btree_node_lock_nopath(trans, &ck->c, SIX_LOCK_intent, _THIS_IP_); + if (unlikely(ret)) { + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + + path->l[0].b = (void *) ck; + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + + ret = bch2_btree_node_lock_write(trans, path, &ck->c); + if (unlikely(ret)) { + btree_node_unlock(trans, path, 0); + bkey_cached_move_to_freelist(bc, ck); + return ERR_PTR(ret); + } + + return ck; + } + + ck = allocate_dropping_locks(trans, ret, + kmem_cache_zalloc(bch2_key_cache, _gfp)); + if (ret) { + kmem_cache_free(bch2_key_cache, ck); + return ERR_PTR(ret); + } + + if (!ck) + return NULL; + + INIT_LIST_HEAD(&ck->list); + bch2_btree_lock_init(&ck->c, pcpu_readers ? 
SIX_LOCK_INIT_PCPU : 0); + + ck->c.cached = true; + BUG_ON(!six_trylock_intent(&ck->c.lock)); + BUG_ON(!six_trylock_write(&ck->c.lock)); + *was_new = true; + return ck; +} + +static struct bkey_cached * +bkey_cached_reuse(struct btree_key_cache *c) +{ + struct bucket_table *tbl; + struct rhash_head *pos; + struct bkey_cached *ck; + unsigned i; + + mutex_lock(&c->lock); + rcu_read_lock(); + tbl = rht_dereference_rcu(c->table.tbl, &c->table); + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(c, ck); + goto out; + } + } + ck = NULL; +out: + rcu_read_unlock(); + mutex_unlock(&c->lock); + return ck; +} + +static struct bkey_cached * +btree_key_cache_create(struct btree_trans *trans, struct btree_path *path) +{ + struct bch_fs *c = trans->c; + struct btree_key_cache *bc = &c->btree_key_cache; + struct bkey_cached *ck; + bool was_new = false; + + ck = bkey_cached_alloc(trans, path, &was_new); + if (IS_ERR(ck)) + return ck; + + if (unlikely(!ck)) { + ck = bkey_cached_reuse(bc); + if (unlikely(!ck)) { + bch_err(c, "error allocating memory for key cache item, btree %s", + bch2_btree_ids[path->btree_id]); + return ERR_PTR(-BCH_ERR_ENOMEM_btree_key_cache_create); + } + + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + } + + ck->c.level = 0; + ck->c.btree_id = path->btree_id; + ck->key.btree_id = path->btree_id; + ck->key.pos = path->pos; + ck->valid = false; + ck->flags = 1U << BKEY_CACHED_ACCESSED; + + if (unlikely(rhashtable_lookup_insert_fast(&bc->table, + &ck->hash, + bch2_btree_key_cache_params))) { + /* We raced with another fill: */ + + if (likely(was_new)) { + six_unlock_write(&ck->c.lock); + six_unlock_intent(&ck->c.lock); + kfree(ck); + } else { + bkey_cached_free_fast(bc, ck); + } + + mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); + return NULL; + } + + atomic_long_inc(&bc->nr_keys); + + six_unlock_write(&ck->c.lock); + + return ck; +} + +static int btree_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + struct bkey_cached *ck) +{ + struct btree_iter iter; + struct bkey_s_c k; + unsigned new_u64s = 0; + struct bkey_i *new_k = NULL; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, ck->key.btree_id, ck->key.pos, + BTREE_ITER_KEY_CACHE_FILL| + BTREE_ITER_CACHED_NOFILL); + ret = bkey_err(k); + if (ret) + goto err; + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); + goto err; + } + + /* + * bch2_varint_decode can read past the end of the buffer by at + * most 7 bytes (it won't be used): + */ + new_u64s = k.k->u64s + 1; + + /* + * Allocate some extra space so that the transaction commit path is less + * likely to have to reallocate, since that requires a transaction + * restart: + */ + new_u64s = min(256U, (new_u64s * 3) / 2); + + if (new_u64s > ck->u64s) { + new_u64s = roundup_pow_of_two(new_u64s); + new_k = kmalloc(new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN); + if (!new_k) { + bch2_trans_unlock(trans); + + new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); + if (!new_k) { + bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[ck->key.btree_id], new_u64s); + ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + goto err; + } + + if (!bch2_btree_node_relock(trans, ck_path, 0)) { + 
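/* We dropped locks for the GFP_KERNEL allocation and lost the node lock: free the allocation and restart the fill */ +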
kfree(new_k); + trace_and_count(trans->c, trans_restart_relock_key_cache_fill, trans, _THIS_IP_, ck_path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_fill); + goto err; + } + + ret = bch2_trans_relock(trans); + if (ret) { + kfree(new_k); + goto err; + } + } + } + + ret = bch2_btree_node_lock_write(trans, ck_path, &ck_path->l[0].b->c); + if (ret) { + kfree(new_k); + goto err; + } + + if (new_k) { + kfree(ck->k); + ck->u64s = new_u64s; + ck->k = new_k; + } + + bkey_reassemble(ck->k, k); + ck->valid = true; + bch2_btree_node_unlock_write(trans, ck_path, ck_path->l[0].b); + + /* We're not likely to need this iterator again: */ + set_btree_iter_dontneed(&iter); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline int +bch2_btree_path_traverse_cached_slowpath(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + + BUG_ON(path->level); + + path->l[1].b = NULL; + + if (bch2_btree_node_relock_notrace(trans, path, 0)) { + ck = (void *) path->l[0].b; + goto fill; + } +retry: + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); + if (!ck) { + ck = btree_key_cache_create(trans, path); + ret = PTR_ERR_OR_ZERO(ck); + if (ret) + goto err; + if (!ck) + goto retry; + + mark_btree_node_locked(trans, path, 0, SIX_LOCK_intent); + path->locks_want = 1; + } else { + enum six_lock_type lock_want = __btree_lock_want(path, 0); + + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto err; + + BUG_ON(ret); + + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + + mark_btree_node_locked(trans, path, 0, lock_want); + } + + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; +fill: + path->uptodate = BTREE_ITER_UPTODATE; + + if (!ck->valid && !(flags & BTREE_ITER_CACHED_NOFILL)) { + /* + * Using the underscore version because we haven't set + * path->uptodate yet: + */ + if (!path->locks_want && + !__bch2_btree_path_upgrade(trans, path, 1)) { + trace_and_count(trans->c, trans_restart_key_cache_upgrade, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_upgrade); + goto err; + } + + ret = btree_key_cache_fill(trans, path, ck); + if (ret) + goto err; + + ret = bch2_btree_path_relock(trans, path, _THIS_IP_); + if (ret) + goto err; + + path->uptodate = BTREE_ITER_UPTODATE; + } + + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + + BUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + BUG_ON(path->uptodate); + + return ret; +err: + path->uptodate = BTREE_ITER_NEED_TRAVERSE; + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + btree_node_unlock(trans, path, 0); + path->l[0].b = ERR_PTR(ret); + } + return ret; +} + +int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck; + int ret = 0; + + EBUG_ON(path->level); + + path->l[1].b = NULL; + + if (bch2_btree_node_relock_notrace(trans, path, 0)) { + ck = (void *) path->l[0].b; + goto fill; + } +retry: + ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); + if (!ck) { + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + } else { + enum six_lock_type lock_want = 
__btree_lock_want(path, 0); + + ret = btree_node_lock(trans, path, (void *) ck, 0, + lock_want, _THIS_IP_); + EBUG_ON(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret) + return ret; + + if (ck->key.btree_id != path->btree_id || + !bpos_eq(ck->key.pos, path->pos)) { + six_unlock_type(&ck->c.lock, lock_want); + goto retry; + } + + mark_btree_node_locked(trans, path, 0, lock_want); + } + + path->l[0].lock_seq = six_lock_seq(&ck->c.lock); + path->l[0].b = (void *) ck; +fill: + if (!ck->valid) + return bch2_btree_path_traverse_cached_slowpath(trans, path, flags); + + if (!test_bit(BKEY_CACHED_ACCESSED, &ck->flags)) + set_bit(BKEY_CACHED_ACCESSED, &ck->flags); + + path->uptodate = BTREE_ITER_UPTODATE; + EBUG_ON(!ck->valid); + EBUG_ON(btree_node_locked_type(path, 0) != btree_lock_want(path, 0)); + + return ret; +} + +static int btree_key_cache_flush_pos(struct btree_trans *trans, + struct bkey_cached_key key, + u64 journal_seq, + unsigned commit_flags, + bool evict) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree_iter c_iter, b_iter; + struct bkey_cached *ck = NULL; + int ret; + + bch2_trans_iter_init(trans, &b_iter, key.btree_id, key.pos, + BTREE_ITER_SLOTS| + BTREE_ITER_INTENT| + BTREE_ITER_ALL_SNAPSHOTS); + bch2_trans_iter_init(trans, &c_iter, key.btree_id, key.pos, + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + b_iter.flags &= ~BTREE_ITER_WITH_KEY_CACHE; + + ret = bch2_btree_iter_traverse(&c_iter); + if (ret) + goto out; + + ck = (void *) c_iter.path->l[0].b; + if (!ck) + goto out; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + if (evict) + goto evict; + goto out; + } + + BUG_ON(!ck->valid); + + if (journal_seq && ck->journal.seq != journal_seq) + goto out; + + /* + * Since journal reclaim depends on us making progress here, and the + * allocator/copygc depend on journal reclaim making progress, we need + * to be using alloc reserves: + */ + ret = bch2_btree_iter_traverse(&b_iter) ?: + bch2_trans_update(trans, &b_iter, ck->k, + BTREE_UPDATE_KEY_CACHE_RECLAIM| + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + (ck->journal.seq == journal_last_seq(j) + ? 
BCH_WATERMARK_reclaim + : 0)| + commit_flags); + + bch2_fs_fatal_err_on(ret && + !bch2_err_matches(ret, BCH_ERR_transaction_restart) && + !bch2_err_matches(ret, BCH_ERR_journal_reclaim_would_deadlock) && + !bch2_journal_error(j), c, + "error flushing key cache: %s", bch2_err_str(ret)); + if (ret) + goto out; + + bch2_journal_pin_drop(j, &ck->journal); + bch2_journal_preres_put(j, &ck->res); + + BUG_ON(!btree_node_locked(c_iter.path, 0)); + + if (!evict) { + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + } + } else { + struct btree_path *path2; +evict: + trans_for_each_path(trans, path2) + if (path2 != c_iter.path) + __bch2_btree_path_unlock(trans, path2); + + bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c); + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + } + + mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED); + bkey_cached_evict(&c->btree_key_cache, ck); + bkey_cached_free_fast(&c->btree_key_cache, ck); + } +out: + bch2_trans_iter_exit(trans, &b_iter); + bch2_trans_iter_exit(trans, &c_iter); + return ret; +} + +int bch2_btree_key_cache_journal_flush(struct journal *j, + struct journal_entry_pin *pin, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bkey_cached *ck = + container_of(pin, struct bkey_cached, journal); + struct bkey_cached_key key; + struct btree_trans trans; + int srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &ck->c, SIX_LOCK_read); + key = ck->key; + + if (ck->journal.seq != seq || + !test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + six_unlock_read(&ck->c.lock); + goto unlock; + } + + if (ck->seq != seq) { + bch2_journal_pin_update(&c->journal, ck->seq, &ck->journal, + bch2_btree_key_cache_journal_flush); + six_unlock_read(&ck->c.lock); + goto unlock; + } + six_unlock_read(&ck->c.lock); + + ret = commit_do(&trans, NULL, NULL, 0, + btree_key_cache_flush_pos(&trans, key, seq, + BTREE_INSERT_JOURNAL_RECLAIM, false)); +unlock: + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + + bch2_trans_exit(&trans); + return ret; +} + +/* + * Flush and evict a key from the key cache: + */ +int bch2_btree_key_cache_flush(struct btree_trans *trans, + enum btree_id id, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct bkey_cached_key key = { id, pos }; + + /* Fastpath - assume it won't be found: */ + if (!bch2_btree_key_cache_find(c, id, pos)) + return 0; + + return btree_key_cache_flush_pos(trans, key, 0, 0, true); +} + +bool bch2_btree_insert_key_cached(struct btree_trans *trans, + unsigned flags, + struct btree_insert_entry *insert_entry) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) insert_entry->path->l[0].b; + struct bkey_i *insert = insert_entry->k; + bool kick_reclaim = false; + + BUG_ON(insert->k.u64s > ck->u64s); + + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + int difference; + + BUG_ON(jset_u64s(insert->k.u64s) > trans->journal_preres.u64s); + + difference = jset_u64s(insert->k.u64s) - ck->res.u64s; + if (difference > 0) { + trans->journal_preres.u64s -= difference; + ck->res.u64s += difference; + } + } + + bkey_copy(ck->k, insert); + ck->valid = true; + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + 
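/* First time this cached key is dirtied: mark it and account it so journal reclaim knows it must be flushed */ +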
set_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_inc(&c->btree_key_cache.nr_dirty); + + if (bch2_nr_btree_keys_need_flush(c)) + kick_reclaim = true; + } + + /* + * To minimize lock contention, we only add the journal pin here and + * defer pin updates to the flush callback via ->seq. Be careful not to + * update ->seq on nojournal commits because we don't want to update the + * pin to a seq that doesn't include journal updates on disk. Otherwise + * we risk losing the update after a crash. + * + * The only exception is if the pin is not active in the first place. We + * have to add the pin because journal reclaim drives key cache + * flushing. The flush callback will not proceed unless ->seq matches + * the latest pin, so make sure it starts with a consistent value. + */ + if (!(insert_entry->flags & BTREE_UPDATE_NOJOURNAL) || + !journal_pin_active(&ck->journal)) { + ck->seq = trans->journal_res.seq; + } + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + &ck->journal, bch2_btree_key_cache_journal_flush); + + if (kick_reclaim) + journal_reclaim_kick(&c->journal); + return true; +} + +void bch2_btree_key_cache_drop(struct btree_trans *trans, + struct btree_path *path) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; + + BUG_ON(!ck->valid); + + /* + * We just did an update to the btree, bypassing the key cache: the key + * cache key is now stale and must be dropped, even if dirty: + */ + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + clear_bit(BKEY_CACHED_DIRTY, &ck->flags); + atomic_long_dec(&c->btree_key_cache.nr_dirty); + bch2_journal_pin_drop(&c->journal, &ck->journal); + } + + ck->valid = false; +} + +static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; + struct bucket_table *tbl; + struct bkey_cached *ck, *t; + size_t scanned = 0, freed = 0, nr = sc->nr_to_scan; + unsigned start, flags; + int srcu_idx; + + mutex_lock(&bc->lock); + srcu_idx = srcu_read_lock(&c->btree_trans_barrier); + flags = memalloc_nofs_save(); + + /* + * Newest freed entries are at the end of the list - once we hit one + * that's too new to be freed, we can bail out: + */ + list_for_each_entry_safe(ck, t, &bc->freed_nonpcpu, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + six_lock_exit(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + atomic_long_dec(&bc->nr_freed); + scanned++; + freed++; + } + + if (scanned >= nr) + goto out; + + list_for_each_entry_safe(ck, t, &bc->freed_pcpu, list) { + if (!poll_state_synchronize_srcu(&c->btree_trans_barrier, + ck->btree_trans_barrier_seq)) + break; + + list_del(&ck->list); + six_lock_exit(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + atomic_long_dec(&bc->nr_freed); + scanned++; + freed++; + } + + if (scanned >= nr) + goto out; + + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + start = bc->shrink_iter; + + do { + struct rhash_head *pos, *next; + + pos = rht_ptr_rcu(rht_bucket(tbl, bc->shrink_iter)); + + while (!rht_is_a_nulls(pos)) { + next = rht_dereference_bucket_rcu(pos->next, tbl, bc->shrink_iter); + ck = container_of(pos, struct bkey_cached, hash); + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) + goto next; + + if (test_bit(BKEY_CACHED_ACCESSED, 
&ck->flags)) + clear_bit(BKEY_CACHED_ACCESSED, &ck->flags); + else if (bkey_cached_lock_for_evict(ck)) { + bkey_cached_evict(bc, ck); + bkey_cached_free(bc, ck); + } + + scanned++; + if (scanned >= nr) + break; +next: + pos = next; + } + + bc->shrink_iter++; + if (bc->shrink_iter >= tbl->size) + bc->shrink_iter = 0; + } while (scanned < nr && bc->shrink_iter != start); + + rcu_read_unlock(); +out: + memalloc_nofs_restore(flags); + srcu_read_unlock(&c->btree_trans_barrier, srcu_idx); + mutex_unlock(&bc->lock); + + return freed; +} + +static unsigned long bch2_btree_key_cache_count(struct shrinker *shrink, + struct shrink_control *sc) +{ + struct bch_fs *c = container_of(shrink, struct bch_fs, + btree_key_cache.shrink); + struct btree_key_cache *bc = &c->btree_key_cache; + long nr = atomic_long_read(&bc->nr_keys) - + atomic_long_read(&bc->nr_dirty); + + return max(0L, nr); +} + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + struct bucket_table *tbl; + struct bkey_cached *ck, *n; + struct rhash_head *pos; + LIST_HEAD(items); + unsigned i; +#ifdef __KERNEL__ + int cpu; +#endif + + unregister_shrinker(&bc->shrink); + + mutex_lock(&bc->lock); + + /* + * The loop is needed to guard against racing with rehash: + */ + while (atomic_long_read(&bc->nr_keys)) { + rcu_read_lock(); + tbl = rht_dereference_rcu(bc->table.tbl, &bc->table); + if (tbl) + for (i = 0; i < tbl->size; i++) + rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { + bkey_cached_evict(bc, ck); + list_add(&ck->list, &items); + } + rcu_read_unlock(); + } + +#ifdef __KERNEL__ + for_each_possible_cpu(cpu) { + struct btree_key_cache_freelist *f = + per_cpu_ptr(bc->pcpu_freed, cpu); + + for (i = 0; i < f->nr; i++) { + ck = f->objs[i]; + list_add(&ck->list, &items); + } + } +#endif + + list_splice(&bc->freed_pcpu, &items); + list_splice(&bc->freed_nonpcpu, &items); + + mutex_unlock(&bc->lock); + + list_for_each_entry_safe(ck, n, &items, list) { + cond_resched(); + + bch2_journal_pin_drop(&c->journal, &ck->journal); + bch2_journal_preres_put(&c->journal, &ck->res); + + list_del(&ck->list); + kfree(ck->k); + six_lock_exit(&ck->c.lock); + kmem_cache_free(bch2_key_cache, ck); + } + + if (atomic_long_read(&bc->nr_dirty) && + !bch2_journal_error(&c->journal) && + test_bit(BCH_FS_WAS_RW, &c->flags)) + panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n", + atomic_long_read(&bc->nr_dirty)); + + if (atomic_long_read(&bc->nr_keys)) + panic("btree key cache shutdown error: nr_keys nonzero (%li)\n", + atomic_long_read(&bc->nr_keys)); + + if (bc->table_init_done) + rhashtable_destroy(&bc->table); + + free_percpu(bc->pcpu_freed); +} + +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *c) +{ + mutex_init(&c->lock); + INIT_LIST_HEAD(&c->freed_pcpu); + INIT_LIST_HEAD(&c->freed_nonpcpu); +} + +static void bch2_btree_key_cache_shrinker_to_text(struct seq_buf *s, struct shrinker *shrink) +{ + struct btree_key_cache *bc = + container_of(shrink, struct btree_key_cache, shrink); + char *cbuf; + size_t buflen = seq_buf_get_buf(s, &cbuf); + struct printbuf out = PRINTBUF_EXTERN(cbuf, buflen); + + bch2_btree_key_cache_to_text(&out, bc); + seq_buf_commit(s, out.pos); +} + +int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) +{ + struct bch_fs *c = container_of(bc, struct bch_fs, btree_key_cache); + +#ifdef __KERNEL__ + bc->pcpu_freed = alloc_percpu(struct btree_key_cache_freelist); + if (!bc->pcpu_freed) + return 
-BCH_ERR_ENOMEM_fs_btree_cache_init; +#endif + + if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + + bc->table_init_done = true; + + bc->shrink.seeks = 0; + bc->shrink.count_objects = bch2_btree_key_cache_count; + bc->shrink.scan_objects = bch2_btree_key_cache_scan; + bc->shrink.to_text = bch2_btree_key_cache_shrinker_to_text; + if (register_shrinker(&bc->shrink, "%s/btree_key_cache", c->name)) + return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return 0; +} + +void bch2_btree_key_cache_to_text(struct printbuf *out, struct btree_key_cache *c) +{ + prt_printf(out, "nr_freed:\t%zu", atomic_long_read(&c->nr_freed)); + prt_newline(out); + prt_printf(out, "nr_keys:\t%lu", atomic_long_read(&c->nr_keys)); + prt_newline(out); + prt_printf(out, "nr_dirty:\t%lu", atomic_long_read(&c->nr_dirty)); + prt_newline(out); +} + +void bch2_btree_key_cache_exit(void) +{ + kmem_cache_destroy(bch2_key_cache); +} + +int __init bch2_btree_key_cache_init(void) +{ + bch2_key_cache = KMEM_CACHE(bkey_cached, SLAB_RECLAIM_ACCOUNT); + if (!bch2_key_cache) + return -ENOMEM; + + return 0; +} diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h new file mode 100644 index 000000000..be3acde2c --- /dev/null +++ b/fs/bcachefs/btree_key_cache.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_KEY_CACHE_H +#define _BCACHEFS_BTREE_KEY_CACHE_H + +static inline size_t bch2_nr_btree_keys_need_flush(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 1024 + nr_keys / 2; + + return max_t(ssize_t, 0, nr_dirty - max_dirty); +} + +static inline bool bch2_btree_key_cache_must_wait(struct bch_fs *c) +{ + size_t nr_dirty = atomic_long_read(&c->btree_key_cache.nr_dirty); + size_t nr_keys = atomic_long_read(&c->btree_key_cache.nr_keys); + size_t max_dirty = 4096 + (nr_keys * 3) / 4; + + return nr_dirty > max_dirty; +} + +int bch2_btree_key_cache_journal_flush(struct journal *, + struct journal_entry_pin *, u64); + +struct bkey_cached * +bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); + +int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, + unsigned); + +bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, + struct btree_insert_entry *); +int bch2_btree_key_cache_flush(struct btree_trans *, + enum btree_id, struct bpos); +void bch2_btree_key_cache_drop(struct btree_trans *, + struct btree_path *); + +void bch2_fs_btree_key_cache_exit(struct btree_key_cache *); +void bch2_fs_btree_key_cache_init_early(struct btree_key_cache *); +int bch2_fs_btree_key_cache_init(struct btree_key_cache *); + +void bch2_btree_key_cache_to_text(struct printbuf *, struct btree_key_cache *); + +void bch2_btree_key_cache_exit(void); +int __init bch2_btree_key_cache_init(void); + +#endif /* _BCACHEFS_BTREE_KEY_CACHE_H */ diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c new file mode 100644 index 000000000..d7fd87149 --- /dev/null +++ b/fs/bcachefs/btree_locking.c @@ -0,0 +1,797 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_locking.h" +#include "btree_types.h" + +static struct lock_class_key bch2_btree_node_lock_key; + +void bch2_btree_lock_init(struct btree_bkey_cached_common *b, + enum six_lock_init_flags flags) +{ + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); +#ifdef 
CONFIG_DEBUG_LOCK_ALLOC + lockdep_set_no_check_recursion(&b->lock.dep_map); +#endif +} + +#ifdef CONFIG_LOCKDEP +void bch2_assert_btree_nodes_not_locked(void) +{ + BUG_ON(lock_class_is_held(&bch2_btree_node_lock_key)); +} +#endif + +/* Btree node locking: */ + +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans, + struct btree_path *skip, + struct btree_bkey_cached_common *b, + unsigned level) +{ + struct btree_path *path; + struct six_lock_count ret; + + memset(&ret, 0, sizeof(ret)); + + if (IS_ERR_OR_NULL(b)) + return ret; + + trans_for_each_path(trans, path) + if (path != skip && &path->l[level].b->c == b) { + int t = btree_node_locked_type(path, level); + + if (t != BTREE_NODE_UNLOCKED) + ret.n[t]++; + } + + return ret; +} + +/* unlock */ + +void bch2_btree_node_unlock_write(struct btree_trans *trans, + struct btree_path *path, struct btree *b) +{ + bch2_btree_node_unlock_write_inlined(trans, path, b); +} + +/* lock */ + +/* + * @trans wants to lock @b with type @type + */ +struct trans_waiting_for_lock { + struct btree_trans *trans; + struct btree_bkey_cached_common *node_want; + enum six_lock_type lock_want; + + /* for iterating over held locks :*/ + u8 path_idx; + u8 level; + u64 lock_start_time; +}; + +struct lock_graph { + struct trans_waiting_for_lock g[8]; + unsigned nr; +}; + +static noinline void print_cycle(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + prt_printf(out, "Found lock cycle (%u entries):", g->nr); + prt_newline(out); + + for (i = g->g; i < g->g + g->nr; i++) + bch2_btree_trans_to_text(out, i->trans); +} + +static noinline void print_chain(struct printbuf *out, struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g; i != g->g + g->nr; i++) { + if (i != g->g) + prt_str(out, "<- "); + prt_printf(out, "%u ", i->trans->locking_wait.task->pid); + } + prt_newline(out); +} + +static void lock_graph_up(struct lock_graph *g) +{ + closure_put(&g->g[--g->nr].trans->ref); +} + +static noinline void lock_graph_pop_all(struct lock_graph *g) +{ + while (g->nr) + lock_graph_up(g); +} + +static void __lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ + g->g[g->nr++] = (struct trans_waiting_for_lock) { + .trans = trans, + .node_want = trans->locking, + .lock_want = trans->locking_wait.lock_want, + }; +} + +static void lock_graph_down(struct lock_graph *g, struct btree_trans *trans) +{ + closure_get(&trans->ref); + __lock_graph_down(g, trans); +} + +static bool lock_graph_remove_non_waiters(struct lock_graph *g) +{ + struct trans_waiting_for_lock *i; + + for (i = g->g + 1; i < g->g + g->nr; i++) + if (i->trans->locking != i->node_want || + i->trans->locking_wait.start_time != i[-1].lock_start_time) { + while (g->g + g->nr > i) + lock_graph_up(g); + return true; + } + + return false; +} + +static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) +{ + if (i == g->g) { + trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_); + return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + } else { + i->trans->lock_must_abort = true; + wake_up_process(i->trans->locking_wait.task); + return 0; + } +} + +static int btree_trans_abort_preference(struct btree_trans *trans) +{ + if (trans->lock_may_not_fail) + return 0; + if (trans->locking_wait.lock_want == SIX_LOCK_write) + return 1; + if (!trans->in_traverse_all) + return 2; + return 3; +} + +static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle) 
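+/* Pick the transaction in the cycle that is cheapest to abort, per btree_trans_abort_preference(), and restart it; when called from debugfs (cycle != NULL) the cycle is only printed */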
+{ + struct trans_waiting_for_lock *i, *abort = NULL; + unsigned best = 0, pref; + int ret; + + if (lock_graph_remove_non_waiters(g)) + return 0; + + /* Only checking, for debugfs: */ + if (cycle) { + print_cycle(cycle, g); + ret = -1; + goto out; + } + + for (i = g->g; i < g->g + g->nr; i++) { + pref = btree_trans_abort_preference(i->trans); + if (pref > best) { + abort = i; + best = pref; + } + } + + if (unlikely(!best)) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); + + for (i = g->g; i < g->g + g->nr; i++) { + struct btree_trans *trans = i->trans; + + bch2_btree_trans_to_text(&buf, trans); + + prt_printf(&buf, "backtrace:"); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + bch2_print_string_as_lines(KERN_ERR, buf.buf); + printbuf_exit(&buf); + BUG(); + } + + ret = abort_lock(g, abort); +out: + if (ret) + while (g->nr) + lock_graph_up(g); + return ret; +} + +static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, + struct printbuf *cycle) +{ + struct btree_trans *orig_trans = g->g->trans; + struct trans_waiting_for_lock *i; + + for (i = g->g; i < g->g + g->nr; i++) + if (i->trans == trans) { + closure_put(&trans->ref); + return break_cycle(g, cycle); + } + + if (g->nr == ARRAY_SIZE(g->g)) { + closure_put(&trans->ref); + + if (orig_trans->lock_may_not_fail) + return 0; + + while (g->nr) + lock_graph_up(g); + + if (cycle) + return 0; + + trace_and_count(trans->c, trans_restart_would_deadlock_recursion_limit, trans, _RET_IP_); + return btree_trans_restart(orig_trans, BCH_ERR_transaction_restart_deadlock_recursion_limit); + } + + __lock_graph_down(g, trans); + return 0; +} + +static bool lock_type_conflicts(enum six_lock_type t1, enum six_lock_type t2) +{ + return t1 + t2 > 1; +} + +int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) +{ + struct lock_graph g; + struct trans_waiting_for_lock *top; + struct btree_bkey_cached_common *b; + struct btree_path *path; + unsigned path_idx; + int ret; + + if (trans->lock_must_abort) { + if (cycle) + return -1; + + trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock); + } + + g.nr = 0; + lock_graph_down(&g, trans); +next: + if (!g.nr) + return 0; + + top = &g.g[g.nr - 1]; + + trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) { + if (!path->nodes_locked) + continue; + + if (path_idx != top->path_idx) { + top->path_idx = path_idx; + top->level = 0; + top->lock_start_time = 0; + } + + for (; + top->level < BTREE_MAX_DEPTH; + top->level++, top->lock_start_time = 0) { + int lock_held = btree_node_locked_type(path, top->level); + + if (lock_held == BTREE_NODE_UNLOCKED) + continue; + + b = &READ_ONCE(path->l[top->level].b)->c; + + if (IS_ERR_OR_NULL(b)) { + /* + * If we get here, it means we raced with the + * other thread updating its btree_path + * structures - which means it can't be blocked + * waiting on a lock: + */ + if (!lock_graph_remove_non_waiters(&g)) { + /* + * If lock_graph_remove_non_waiters() + * didn't do anything, it must be + * because we're being called by debugfs + * checking for lock cycles, which + * invokes us on btree_transactions that + * aren't actually waiting on anything. 
+ * Just bail out: + */ + lock_graph_pop_all(&g); + } + + goto next; + } + + if (list_empty_careful(&b->lock.wait_list)) + continue; + + raw_spin_lock(&b->lock.wait_lock); + list_for_each_entry(trans, &b->lock.wait_list, locking_wait.list) { + BUG_ON(b != trans->locking); + + if (top->lock_start_time && + time_after_eq64(top->lock_start_time, trans->locking_wait.start_time)) + continue; + + top->lock_start_time = trans->locking_wait.start_time; + + /* Don't check for self deadlock: */ + if (trans == top->trans || + !lock_type_conflicts(lock_held, trans->locking_wait.lock_want)) + continue; + + closure_get(&trans->ref); + raw_spin_unlock(&b->lock.wait_lock); + + ret = lock_graph_descend(&g, trans, cycle); + if (ret) + return ret; + goto next; + + } + raw_spin_unlock(&b->lock.wait_lock); + } + } + + if (g.nr > 1 && cycle) + print_chain(cycle, &g); + lock_graph_up(&g); + goto next; +} + +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p) +{ + struct btree_trans *trans = p; + + return bch2_check_for_deadlock(trans, NULL); +} + +int __bch2_btree_node_lock_write(struct btree_trans *trans, struct btree_path *path, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) +{ + int readers = bch2_btree_node_lock_counts(trans, NULL, b, b->level).n[SIX_LOCK_read]; + int ret; + + /* + * Must drop our read locks before calling six_lock_write() - + * six_unlock() won't do wakeups until the reader count + * goes to 0, and it's safe because we have the node intent + * locked: + */ + six_lock_readers_add(&b->lock, -readers); + ret = __btree_node_lock_nopath(trans, b, SIX_LOCK_write, + lock_may_not_fail, _RET_IP_); + six_lock_readers_add(&b->lock, readers); + + if (ret) + mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_intent); + + return ret; +} + +void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + struct btree_path *linked; + unsigned i; + int ret; + + /* + * XXX BIG FAT NOTICE + * + * Drop all read locks before taking a write lock: + * + * This is a hack, because bch2_btree_node_lock_write_nofail() is a + * hack - but by dropping read locks first, this should never fail, and + * we only use this in code paths where whatever read locks we've + * already taken are no longer needed: + */ + + trans_for_each_path(trans, linked) { + if (!linked->nodes_locked) + continue; + + for (i = 0; i < BTREE_MAX_DEPTH; i++) + if (btree_node_read_locked(linked, i)) { + btree_node_unlock(trans, linked, i); + btree_path_set_dirty(linked, BTREE_ITER_NEED_RELOCK); + } + } + + ret = __btree_node_lock_write(trans, path, b, true); + BUG_ON(ret); +} + +/* relock */ + +static inline bool btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade) +{ + unsigned l = path->level; + int fail_idx = -1; + + do { + if (!btree_path_node(path, l)) + break; + + if (!(upgrade + ? bch2_btree_node_upgrade(trans, path, l) + : bch2_btree_node_relock(trans, path, l))) + fail_idx = l; + + l++; + } while (l < path->locks_want); + + /* + * When we fail to get a lock, we have to ensure that any child nodes + * can't be relocked so bch2_btree_path_traverse has to walk back up to + * the node that we failed to relock: + */ + if (fail_idx >= 0) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + + do { + path->l[fail_idx].b = upgrade + ? 
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + --fail_idx; + } while (fail_idx >= 0); + } + + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + bch2_trans_verify_locks(trans); + + return path->uptodate < BTREE_ITER_NEED_RELOCK; +} + +bool __bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level, + bool trace) +{ + struct btree *b = btree_path_node(path, level); + int want = __btree_lock_want(path, level); + + if (race_fault()) + goto fail; + + if (six_relock_type(&b->c.lock, want, path->l[level].lock_seq) || + (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, want))) { + mark_btree_node_locked(trans, path, level, want); + return true; + } +fail: + if (trace && !trans->notrace_relock_fail) + trace_and_count(trans->c, btree_path_relock_fail, trans, _RET_IP_, path, level); + return false; +} + +/* upgrade */ + +bool bch2_btree_node_upgrade(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + struct btree *b = path->l[level].b; + struct six_lock_count count = bch2_btree_node_lock_counts(trans, path, &b->c, level); + + if (!is_btree_node(path, level)) + return false; + + switch (btree_lock_want(path, level)) { + case BTREE_NODE_UNLOCKED: + BUG_ON(btree_node_locked(path, level)); + return true; + case BTREE_NODE_READ_LOCKED: + BUG_ON(btree_node_intent_locked(path, level)); + return bch2_btree_node_relock(trans, path, level); + case BTREE_NODE_INTENT_LOCKED: + break; + case BTREE_NODE_WRITE_LOCKED: + BUG(); + } + + if (btree_node_intent_locked(path, level)) + return true; + + if (race_fault()) + return false; + + if (btree_node_locked(path, level)) { + bool ret; + + six_lock_readers_add(&b->c.lock, -count.n[SIX_LOCK_read]); + ret = six_lock_tryupgrade(&b->c.lock); + six_lock_readers_add(&b->c.lock, count.n[SIX_LOCK_read]); + + if (ret) + goto success; + } else { + if (six_relock_type(&b->c.lock, SIX_LOCK_intent, path->l[level].lock_seq)) + goto success; + } + + /* + * Do we already have an intent lock via another path? 
If so, just bump + * lock count: + */ + if (btree_node_lock_seq_matches(path, b, level) && + btree_node_lock_increment(trans, &b->c, level, BTREE_NODE_INTENT_LOCKED)) { + btree_node_unlock(trans, path, level); + goto success; + } + + trace_and_count(trans->c, btree_path_upgrade_fail, trans, _RET_IP_, path, level); + return false; +success: + mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + return true; +} + +/* Btree path locking: */ + +/* + * Only for btree_cache.c - only relocks intent locks + */ +int bch2_btree_path_relock_intent(struct btree_trans *trans, + struct btree_path *path) +{ + unsigned l; + + for (l = path->level; + l < path->locks_want && btree_path_node(path, l); + l++) { + if (!bch2_btree_node_relock(trans, path, l)) { + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); + } + } + + return 0; +} + +__flatten +bool bch2_btree_path_relock_norestart(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_path_get_locks(trans, path, false); +} + +int __bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) { + trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path); + } + + return 0; +} + +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + EBUG_ON(path->locks_want >= new_locks_want); + + path->locks_want = new_locks_want; + + return btree_path_get_locks(trans, path, true); +} + +bool __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + struct btree_path *linked; + + if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want)) + return true; + + /* + * XXX: this is ugly - we'd prefer to not be mucking with other + * iterators in the btree_trans here. + * + * On failure to upgrade the iterator, setting iter->locks_want and + * calling get_locks() is sufficient to make bch2_btree_path_traverse() + * get the locks we want on transaction restart. + * + * But if this iterator was a clone, on transaction restart what we did + * to this iterator isn't going to be preserved. + * + * Possibly we could add an iterator field for the parent iterator when + * an iterator is a copy - for now, we'll just upgrade any other + * iterators with the same btree id. + * + * The code below used to be needed to ensure ancestor nodes get locked + * before interior nodes - now that's handled by + * bch2_btree_path_traverse_all(). 
+ */ + if (!path->cached && !trans->in_traverse_all) + trans_for_each_path(trans, linked) + if (linked != path && + linked->cached == path->cached && + linked->btree_id == path->btree_id && + linked->locks_want < new_locks_want) { + linked->locks_want = new_locks_want; + btree_path_get_locks(trans, linked, true); + } + + return false; +} + +void __bch2_btree_path_downgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + unsigned l; + + EBUG_ON(path->locks_want < new_locks_want); + + path->locks_want = new_locks_want; + + while (path->nodes_locked && + (l = btree_path_highest_level_locked(path)) >= path->locks_want) { + if (l > path->level) { + btree_node_unlock(trans, path, l); + } else { + if (btree_node_intent_locked(path, l)) { + six_lock_downgrade(&path->l[l].b->c.lock); + mark_btree_node_locked_noreset(path, l, SIX_LOCK_read); + } + break; + } + } + + bch2_btree_path_verify_locks(path); +} + +/* Btree transaction locking: */ + +void bch2_trans_downgrade(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_downgrade(trans, path); +} + +int bch2_trans_relock(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + } + return 0; +} + +int bch2_trans_relock_notrace(struct btree_trans *trans) +{ + struct btree_path *path; + + if (unlikely(trans->restarted)) + return -((int) trans->restarted); + + trans_for_each_path(trans, path) + if (path->should_be_locked && + !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) { + return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + } + return 0; +} + +void bch2_trans_unlock_noassert(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(trans, path); +} + +void bch2_trans_unlock(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + __bch2_btree_path_unlock(trans, path); + + /* + * bch2_gc_btree_init_recurse() doesn't use btree iterators for walking + * btree nodes, it implements its own walking: + */ + if (!trans->is_initial_gc) + bch2_assert_btree_nodes_not_locked(); +} + +bool bch2_trans_locked(struct btree_trans *trans) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + if (path->nodes_locked) + return true; + return false; +} + +int __bch2_trans_mutex_lock(struct btree_trans *trans, + struct mutex *lock) +{ + int ret = drop_locks_do(trans, (mutex_lock(lock), 0)); + + if (ret) + mutex_unlock(lock); + return ret; +} + +/* Debug */ + +#ifdef CONFIG_BCACHEFS_DEBUG + +void bch2_btree_path_verify_locks(struct btree_path *path) +{ + unsigned l; + + if (!path->nodes_locked) { + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && + btree_path_node(path, path->level)); + return; + } + + for (l = 0; l < BTREE_MAX_DEPTH; l++) { + int want = btree_lock_want(path, l); + int have = btree_node_locked_type(path, l); + + BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); + + BUG_ON(is_btree_node(path, l) && + (want == BTREE_NODE_UNLOCKED || + have != BTREE_NODE_WRITE_LOCKED) && + want != have); + } +} + +void bch2_trans_verify_locks(struct btree_trans *trans) +{ + 
struct btree_path *path; + + trans_for_each_path(trans, path) + bch2_btree_path_verify_locks(path); +} + +#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h new file mode 100644 index 000000000..f3e58aa27 --- /dev/null +++ b/fs/bcachefs/btree_locking.h @@ -0,0 +1,424 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_LOCKING_H +#define _BCACHEFS_BTREE_LOCKING_H + +/* + * Only for internal btree use: + * + * The btree iterator tracks what locks it wants to take, and what locks it + * currently has - here we have wrappers for locking/unlocking btree nodes and + * updating the iterator state + */ + +#include + +#include "btree_iter.h" + +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); + +#ifdef CONFIG_LOCKDEP +void bch2_assert_btree_nodes_not_locked(void); +#else +static inline void bch2_assert_btree_nodes_not_locked(void) {} +#endif + +void bch2_trans_unlock_noassert(struct btree_trans *); + +static inline bool is_btree_node(struct btree_path *path, unsigned l) +{ + return l < BTREE_MAX_DEPTH && !IS_ERR_OR_NULL(path->l[l].b); +} + +static inline struct btree_transaction_stats *btree_trans_stats(struct btree_trans *trans) +{ + return trans->fn_idx < ARRAY_SIZE(trans->c->btree_transaction_stats) + ? &trans->c->btree_transaction_stats[trans->fn_idx] + : NULL; +} + +/* matches six lock types */ +enum btree_node_locked_type { + BTREE_NODE_UNLOCKED = -1, + BTREE_NODE_READ_LOCKED = SIX_LOCK_read, + BTREE_NODE_INTENT_LOCKED = SIX_LOCK_intent, + BTREE_NODE_WRITE_LOCKED = SIX_LOCK_write, +}; + +static inline int btree_node_locked_type(struct btree_path *path, + unsigned level) +{ + return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); +} + +static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) +{ + return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; +} + +static inline bool btree_node_intent_locked(struct btree_path *path, unsigned l) +{ + return btree_node_locked_type(path, l) == BTREE_NODE_INTENT_LOCKED; +} + +static inline bool btree_node_read_locked(struct btree_path *path, unsigned l) +{ + return btree_node_locked_type(path, l) == BTREE_NODE_READ_LOCKED; +} + +static inline bool btree_node_locked(struct btree_path *path, unsigned level) +{ + return btree_node_locked_type(path, level) != BTREE_NODE_UNLOCKED; +} + +static inline void mark_btree_node_locked_noreset(struct btree_path *path, + unsigned level, + enum btree_node_locked_type type) +{ + /* relying on this to avoid a branch */ + BUILD_BUG_ON(SIX_LOCK_read != 0); + BUILD_BUG_ON(SIX_LOCK_intent != 1); + + path->nodes_locked &= ~(3U << (level << 1)); + path->nodes_locked |= (type + 1) << (level << 1); +} + +static inline void mark_btree_node_unlocked(struct btree_path *path, + unsigned level) +{ + EBUG_ON(btree_node_write_locked(path, level)); + mark_btree_node_locked_noreset(path, level, BTREE_NODE_UNLOCKED); +} + +static inline void mark_btree_node_locked(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + enum six_lock_type type) +{ + mark_btree_node_locked_noreset(path, level, (enum btree_node_locked_type) type); +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[level].lock_taken_time = local_clock(); +#endif +} + +static inline enum six_lock_type __btree_lock_want(struct btree_path *path, int level) +{ + return level < path->locks_want + ? 
SIX_LOCK_intent + : SIX_LOCK_read; +} + +static inline enum btree_node_locked_type +btree_lock_want(struct btree_path *path, int level) +{ + if (level < path->level) + return BTREE_NODE_UNLOCKED; + if (level < path->locks_want) + return BTREE_NODE_INTENT_LOCKED; + if (level == path->level) + return BTREE_NODE_READ_LOCKED; + return BTREE_NODE_UNLOCKED; +} + +static void btree_trans_lock_hold_time_update(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + struct btree_transaction_stats *s = btree_trans_stats(trans); + + if (s) + __bch2_time_stats_update(&s->lock_hold_times, + path->l[level].lock_taken_time, + local_clock()); +#endif +} + +/* unlock: */ + +static inline void btree_node_unlock(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + int lock_type = btree_node_locked_type(path, level); + + EBUG_ON(level >= BTREE_MAX_DEPTH); + + if (lock_type != BTREE_NODE_UNLOCKED) { + six_unlock_type(&path->l[level].b->c.lock, lock_type); + btree_trans_lock_hold_time_update(trans, path, level); + } + mark_btree_node_unlocked(path, level); +} + +static inline int btree_path_lowest_level_locked(struct btree_path *path) +{ + return __ffs(path->nodes_locked) >> 1; +} + +static inline int btree_path_highest_level_locked(struct btree_path *path) +{ + return __fls(path->nodes_locked) >> 1; +} + +static inline void __bch2_btree_path_unlock(struct btree_trans *trans, + struct btree_path *path) +{ + btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); + + while (path->nodes_locked) + btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); +} + +/* + * Updates the saved lock sequence number, so that bch2_btree_node_relock() will + * succeed: + */ +static inline void +bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_path *path, + struct btree *b) +{ + struct btree_path *linked; + + EBUG_ON(path->l[b->c.level].b != b); + EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock)); + EBUG_ON(btree_node_locked_type(path, b->c.level) != SIX_LOCK_write); + + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); + + trans_for_each_path_with_node(trans, b, linked) + linked->l[b->c.level].lock_seq++; + + six_unlock_write(&b->c.lock); +} + +void bch2_btree_node_unlock_write(struct btree_trans *, + struct btree_path *, struct btree *); + +int bch2_six_check_for_deadlock(struct six_lock *lock, void *p); + +/* lock: */ + +static inline int __btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type, + bool lock_may_not_fail, + unsigned long ip) +{ + int ret; + + trans->lock_may_not_fail = lock_may_not_fail; + trans->lock_must_abort = false; + trans->locking = b; + + ret = six_lock_ip_waiter(&b->lock, type, &trans->locking_wait, + bch2_six_check_for_deadlock, trans, ip); + WRITE_ONCE(trans->locking, NULL); + WRITE_ONCE(trans->locking_wait.start_time, 0); + return ret; +} + +static inline int __must_check +btree_node_lock_nopath(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type, + unsigned long ip) +{ + return __btree_node_lock_nopath(trans, b, type, false, ip); +} + +static inline void btree_node_lock_nopath_nofail(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + enum six_lock_type type) +{ + int ret = __btree_node_lock_nopath(trans, b, type, true, _THIS_IP_); + + BUG_ON(ret); +} + +/* + * Lock a btree node if we already have it locked on one of our linked + * 
iterators: + */ +static inline bool btree_node_lock_increment(struct btree_trans *trans, + struct btree_bkey_cached_common *b, + unsigned level, + enum btree_node_locked_type want) +{ + struct btree_path *path; + + trans_for_each_path(trans, path) + if (&path->l[level].b->c == b && + btree_node_locked_type(path, level) >= want) { + six_lock_increment(&b->lock, (enum six_lock_type) want); + return true; + } + + return false; +} + +static inline int btree_node_lock(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b, + unsigned level, + enum six_lock_type type, + unsigned long ip) +{ + int ret = 0; + + EBUG_ON(level >= BTREE_MAX_DEPTH); + EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx))); + + if (likely(six_trylock_type(&b->lock, type)) || + btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) || + !(ret = btree_node_lock_nopath(trans, b, type, btree_path_ip_allocated(path)))) { +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + path->l[b->level].lock_taken_time = local_clock(); +#endif + } + + return ret; +} + +int __bch2_btree_node_lock_write(struct btree_trans *, struct btree_path *, + struct btree_bkey_cached_common *b, bool); + +static inline int __btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b, + bool lock_may_not_fail) +{ + EBUG_ON(&path->l[b->level].b->c != b); + EBUG_ON(path->l[b->level].lock_seq != six_lock_seq(&b->lock)); + EBUG_ON(!btree_node_intent_locked(path, b->level)); + + /* + * six locks are unfair, and read locks block while a thread wants a + * write lock: thus, we need to tell the cycle detector we have a write + * lock _before_ taking the lock: + */ + mark_btree_node_locked_noreset(path, b->level, SIX_LOCK_write); + + return likely(six_trylock_write(&b->lock)) + ? 0 + : __bch2_btree_node_lock_write(trans, path, b, lock_may_not_fail); +} + +static inline int __must_check +bch2_btree_node_lock_write(struct btree_trans *trans, + struct btree_path *path, + struct btree_bkey_cached_common *b) +{ + return __btree_node_lock_write(trans, path, b, false); +} + +void bch2_btree_node_lock_write_nofail(struct btree_trans *, + struct btree_path *, + struct btree_bkey_cached_common *); + +/* relock: */ + +bool bch2_btree_path_relock_norestart(struct btree_trans *, + struct btree_path *, unsigned long); +int __bch2_btree_path_relock(struct btree_trans *, + struct btree_path *, unsigned long); + +static inline int bch2_btree_path_relock(struct btree_trans *trans, + struct btree_path *path, unsigned long trace_ip) +{ + return btree_node_locked(path, path->level) + ? 
0 + : __bch2_btree_path_relock(trans, path, trace_ip); +} + +bool __bch2_btree_node_relock(struct btree_trans *, struct btree_path *, unsigned, bool trace); + +static inline bool bch2_btree_node_relock(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + EBUG_ON(btree_node_locked(path, level) && + !btree_node_write_locked(path, level) && + btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + + return likely(btree_node_locked(path, level)) || + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level, true)); +} + +static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, + struct btree_path *path, unsigned level) +{ + EBUG_ON(btree_node_locked(path, level) && + !btree_node_write_locked(path, level) && + btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + + return likely(btree_node_locked(path, level)) || + (!IS_ERR_OR_NULL(path->l[level].b) && + __bch2_btree_node_relock(trans, path, level, false)); +} + +/* upgrade */ + +bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, + struct btree_path *, unsigned); +bool __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); + +static inline int bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + unsigned old_locks_want = path->locks_want; + + new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); + + if (path->locks_want < new_locks_want + ? __bch2_btree_path_upgrade(trans, path, new_locks_want) + : path->uptodate == BTREE_ITER_UPTODATE) + return 0; + + trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, + old_locks_want, new_locks_want); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); +} + +/* misc: */ + +static inline void btree_path_set_should_be_locked(struct btree_path *path) +{ + EBUG_ON(!btree_node_locked(path, path->level)); + EBUG_ON(path->uptodate); + + path->should_be_locked = true; +} + +static inline void __btree_path_set_level_up(struct btree_trans *trans, + struct btree_path *path, + unsigned l) +{ + btree_node_unlock(trans, path, l); + path->l[l].b = ERR_PTR(-BCH_ERR_no_btree_node_up); +} + +static inline void btree_path_set_level_up(struct btree_trans *trans, + struct btree_path *path) +{ + __btree_path_set_level_up(trans, path, path->level++); + btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); +} + +/* debug */ + +struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, + struct btree_path *, + struct btree_bkey_cached_common *b, + unsigned); + +int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_btree_path_verify_locks(struct btree_path *); +void bch2_trans_verify_locks(struct btree_trans *); +#else +static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} +static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} +#endif + +#endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h new file mode 100644 index 000000000..4efc69492 --- /dev/null +++ b/fs/bcachefs/btree_types.h @@ -0,0 +1,742 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_TYPES_H +#define _BCACHEFS_BTREE_TYPES_H + +#include +#include +#include + +//#include "bkey_methods.h" +#include "buckets_types.h" +#include "darray.h" +#include "errcode.h" +#include "journal_types.h" +#include "replicas_types.h" + +struct open_bucket; +struct 
btree_update; +struct btree_trans; + +#define MAX_BSETS 3U + +struct btree_nr_keys { + + /* + * Amount of live metadata (i.e. size of node after a compaction) in + * units of u64s + */ + u16 live_u64s; + u16 bset_u64s[MAX_BSETS]; + + /* live keys only: */ + u16 packed_keys; + u16 unpacked_keys; +}; + +struct bset_tree { + /* + * We construct a binary tree in an array as if the array + * started at 1, so that things line up on the same cachelines + * better: see comments in bset.c at cacheline_to_bkey() for + * details + */ + + /* size of the binary tree and prev array */ + u16 size; + + /* function of size - precalculated for to_inorder() */ + u16 extra; + + u16 data_offset; + u16 aux_data_offset; + u16 end_offset; +}; + +struct btree_write { + struct journal_entry_pin journal; +}; + +struct btree_alloc { + struct open_buckets ob; + __BKEY_PADDED(k, BKEY_BTREE_PTR_VAL_U64s_MAX); +}; + +struct btree_bkey_cached_common { + struct six_lock lock; + u8 level; + u8 btree_id; + bool cached; +}; + +struct btree { + struct btree_bkey_cached_common c; + + struct rhash_head hash; + u64 hash_val; + + unsigned long flags; + u16 written; + u8 nsets; + u8 nr_key_bits; + u16 version_ondisk; + + struct bkey_format format; + + struct btree_node *data; + void *aux_data; + + /* + * Sets of sorted keys - the real btree node - plus a binary search tree + * + * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point + * to the memory we have allocated for this btree node. Additionally, + * set[0]->data points to the entire btree node as it exists on disk. + */ + struct bset_tree set[MAX_BSETS]; + + struct btree_nr_keys nr; + u16 sib_u64s[2]; + u16 whiteout_u64s; + u8 byte_order; + u8 unpack_fn_len; + + struct btree_write writes[2]; + + /* Key/pointer for this btree node */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + + /* + * XXX: add a delete sequence number, so when bch2_btree_node_relock() + * fails because the lock sequence number has changed - i.e. the + * contents were modified - we can still relock the node if it's still + * the one we want, without redoing the traversal + */ + + /* + * For asynchronous splits/interior node updates: + * When we do a split, we allocate new child nodes and update the parent + * node to point to them: we update the parent in memory immediately, + * but then we must wait until the children have been written out before + * the update to the parent can be written - this is a list of the + * btree_updates that are blocking this node from being + * written: + */ + struct list_head write_blocked; + + /* + * Also for asynchronous splits/interior node updates: + * If a btree node isn't reachable yet, we don't want to kick off + * another write - because that write also won't yet be reachable and + * marking it as completed before it's reachable would be incorrect: + */ + unsigned long will_make_reachable; + + struct open_buckets ob; + + /* lru list */ + struct list_head list; +}; + +struct btree_cache { + struct rhashtable table; + bool table_init_done; + /* + * We never free a struct btree, except on shutdown - we just put it on + * the btree_cache_freed list and reuse it later. This simplifies the + * code, and it doesn't cost us much memory as the memory usage is + * dominated by buffers that hold the actual btree node data and those + * can be freed - and the number of struct btrees allocated is + * effectively bounded. 
+ * + * btree_cache_freeable effectively is a small cache - we use it because + * high order page allocations can be rather expensive, and it's quite + * common to delete and allocate btree nodes in quick succession. It + * should never grow past ~2-3 nodes in practice. + */ + struct mutex lock; + struct list_head live; + struct list_head freeable; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; + + /* Number of elements in live + freeable lists */ + unsigned used; + unsigned reserve; + unsigned freed; + unsigned not_freed_lock_intent; + unsigned not_freed_lock_write; + unsigned not_freed_dirty; + unsigned not_freed_read_in_flight; + unsigned not_freed_write_in_flight; + unsigned not_freed_noevict; + unsigned not_freed_write_blocked; + unsigned not_freed_will_make_reachable; + unsigned not_freed_access_bit; + atomic_t dirty; + struct shrinker shrink; + + /* + * If we need to allocate memory for a new btree node and that + * allocation fails, we can cannibalize another node in the btree cache + * to satisfy the allocation - lock to guarantee only one thread does + * this at a time: + */ + struct task_struct *alloc_lock; + struct closure_waitlist alloc_wait; +}; + +struct btree_node_iter { + struct btree_node_iter_set { + u16 k, end; + } data[MAX_BSETS]; +}; + +/* + * Iterate over all possible positions, synthesizing deleted keys for holes: + */ +static const u16 BTREE_ITER_SLOTS = 1 << 0; +static const u16 BTREE_ITER_ALL_LEVELS = 1 << 1; +/* + * Indicates that intent locks should be taken on leaf nodes, because we expect + * to be doing updates: + */ +static const u16 BTREE_ITER_INTENT = 1 << 2; +/* + * Causes the btree iterator code to prefetch additional btree nodes from disk: + */ +static const u16 BTREE_ITER_PREFETCH = 1 << 3; +/* + * Used in bch2_btree_iter_traverse(), to indicate whether we're searching for + * @pos or the first key strictly greater than @pos + */ +static const u16 BTREE_ITER_IS_EXTENTS = 1 << 4; +static const u16 BTREE_ITER_NOT_EXTENTS = 1 << 5; +static const u16 BTREE_ITER_CACHED = 1 << 6; +static const u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7; +static const u16 BTREE_ITER_WITH_UPDATES = 1 << 8; +static const u16 BTREE_ITER_WITH_JOURNAL = 1 << 9; +static const u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10; +static const u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11; +static const u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12; +static const u16 BTREE_ITER_NOPRESERVE = 1 << 13; +static const u16 BTREE_ITER_CACHED_NOFILL = 1 << 14; +static const u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15; +#define __BTREE_ITER_FLAGS_END 16 + +enum btree_path_uptodate { + BTREE_ITER_UPTODATE = 0, + BTREE_ITER_NEED_RELOCK = 1, + BTREE_ITER_NEED_TRAVERSE = 2, +}; + +#if defined(CONFIG_BCACHEFS_LOCK_TIME_STATS) || defined(CONFIG_BCACHEFS_DEBUG) +#define TRACK_PATH_ALLOCATED +#endif + +struct btree_path { + u8 idx; + u8 sorted_idx; + u8 ref; + u8 intent_ref; + + /* btree_iter_copy starts here: */ + struct bpos pos; + + enum btree_id btree_id:5; + bool cached:1; + bool preserve:1; + enum btree_path_uptodate uptodate:2; + /* + * When true, failing to relock this path will cause the transaction to + * restart: + */ + bool should_be_locked:1; + unsigned level:3, + locks_want:3; + u8 nodes_locked; + + struct btree_path_level { + struct btree *b; + struct btree_node_iter iter; + u32 lock_seq; +#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS + u64 lock_taken_time; +#endif + } l[BTREE_MAX_DEPTH]; +#ifdef TRACK_PATH_ALLOCATED + unsigned long ip_allocated; +#endif +}; + +static inline struct btree_path_level 
*path_l(struct btree_path *path) +{ + return path->l + path->level; +} + +static inline unsigned long btree_path_ip_allocated(struct btree_path *path) +{ +#ifdef TRACK_PATH_ALLOCATED + return path->ip_allocated; +#else + return _THIS_IP_; +#endif +} + +/* + * @pos - iterator's current position + * @level - current btree depth + * @locks_want - btree level below which we start taking intent locks + * @nodes_locked - bitmask indicating which nodes in @nodes are locked + * @nodes_intent_locked - bitmask indicating which locks are intent locks + */ +struct btree_iter { + struct btree_trans *trans; + struct btree_path *path; + struct btree_path *update_path; + struct btree_path *key_cache_path; + + enum btree_id btree_id:8; + unsigned min_depth:3; + unsigned advanced:1; + + /* btree_iter_copy starts here: */ + u16 flags; + + /* When we're filtering by snapshot, the snapshot ID we're looking for: */ + unsigned snapshot; + + struct bpos pos; + /* + * Current unpacked key - so that bch2_btree_iter_next()/ + * bch2_btree_iter_next_slot() can correctly advance pos. + */ + struct bkey k; + + /* BTREE_ITER_WITH_JOURNAL: */ + size_t journal_idx; + struct bpos journal_pos; +#ifdef TRACK_PATH_ALLOCATED + unsigned long ip_allocated; +#endif +}; + +struct btree_key_cache_freelist { + struct bkey_cached *objs[16]; + unsigned nr; +}; + +struct btree_key_cache { + struct mutex lock; + struct rhashtable table; + bool table_init_done; + struct list_head freed_pcpu; + struct list_head freed_nonpcpu; + struct shrinker shrink; + unsigned shrink_iter; + struct btree_key_cache_freelist __percpu *pcpu_freed; + + atomic_long_t nr_freed; + atomic_long_t nr_keys; + atomic_long_t nr_dirty; +}; + +struct bkey_cached_key { + u32 btree_id; + struct bpos pos; +} __packed __aligned(4); + +#define BKEY_CACHED_ACCESSED 0 +#define BKEY_CACHED_DIRTY 1 + +struct bkey_cached { + struct btree_bkey_cached_common c; + + unsigned long flags; + u16 u64s; + bool valid; + u32 btree_trans_barrier_seq; + struct bkey_cached_key key; + + struct rhash_head hash; + struct list_head list; + + struct journal_preres res; + struct journal_entry_pin journal; + u64 seq; + + struct bkey_i *k; +}; + +static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) +{ + return !b->cached + ? 
container_of(b, struct btree, c)->key.k.p + : container_of(b, struct bkey_cached, c)->key.pos; +} + +struct btree_insert_entry { + unsigned flags; + u8 bkey_type; + enum btree_id btree_id:8; + u8 level:4; + bool cached:1; + bool insert_trigger_run:1; + bool overwrite_trigger_run:1; + bool key_cache_already_flushed:1; + /* + * @old_k may be a key from the journal; @old_btree_u64s always refers + * to the size of the key being overwritten in the btree: + */ + u8 old_btree_u64s; + struct bkey_i *k; + struct btree_path *path; + /* key being overwritten: */ + struct bkey old_k; + const struct bch_val *old_v; + unsigned long ip_allocated; +}; + +#define BTREE_ITER_MAX 64 + +struct btree_trans_commit_hook; +typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *); + +struct btree_trans_commit_hook { + btree_trans_commit_hook_fn *fn; + struct btree_trans_commit_hook *next; +}; + +#define BTREE_TRANS_MEM_MAX (1U << 16) + +#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000 + +struct btree_trans { + struct bch_fs *c; + const char *fn; + struct closure ref; + struct list_head list; + u64 last_begin_time; + + u8 lock_may_not_fail; + u8 lock_must_abort; + struct btree_bkey_cached_common *locking; + struct six_lock_waiter locking_wait; + + int srcu_idx; + + u8 fn_idx; + u8 nr_sorted; + u8 nr_updates; + u8 nr_wb_updates; + u8 wb_updates_size; + bool used_mempool:1; + bool in_traverse_all:1; + bool paths_sorted:1; + bool memory_allocation_failure:1; + bool journal_transaction_names:1; + bool journal_replay_not_finished:1; + bool is_initial_gc:1; + bool notrace_relock_fail:1; + enum bch_errcode restarted:16; + u32 restart_count; + unsigned long last_begin_ip; + unsigned long last_restarted_ip; + unsigned long srcu_lock_time; + + /* + * For when bch2_trans_update notices we'll be splitting a compressed + * extent: + */ + unsigned extra_journal_res; + unsigned nr_max_paths; + + u64 paths_allocated; + + unsigned mem_top; + unsigned mem_max; + unsigned mem_bytes; + void *mem; + + u8 sorted[BTREE_ITER_MAX + 8]; + struct btree_path *paths; + struct btree_insert_entry *updates; + struct btree_write_buffered_key *wb_updates; + + /* update path: */ + struct btree_trans_commit_hook *hooks; + darray_u64 extra_journal_entries; + struct journal_entry_pin *journal_pin; + + struct journal_res journal_res; + struct journal_preres journal_preres; + u64 *journal_seq; + struct disk_reservation *disk_res; + unsigned journal_u64s; + unsigned journal_preres_u64s; + struct replicas_delta_list *fs_usage_deltas; +}; + +#define BCH_BTREE_WRITE_TYPES() \ + x(initial, 0) \ + x(init_next_bset, 1) \ + x(cache_reclaim, 2) \ + x(journal_reclaim, 3) \ + x(interior, 4) + +enum btree_write_type { +#define x(t, n) BTREE_WRITE_##t, + BCH_BTREE_WRITE_TYPES() +#undef x + BTREE_WRITE_TYPE_NR, +}; + +#define BTREE_WRITE_TYPE_MASK (roundup_pow_of_two(BTREE_WRITE_TYPE_NR) - 1) +#define BTREE_WRITE_TYPE_BITS ilog2(roundup_pow_of_two(BTREE_WRITE_TYPE_NR)) + +#define BTREE_FLAGS() \ + x(read_in_flight) \ + x(read_error) \ + x(dirty) \ + x(need_write) \ + x(write_blocked) \ + x(will_make_reachable) \ + x(noevict) \ + x(write_idx) \ + x(accessed) \ + x(write_in_flight) \ + x(write_in_flight_inner) \ + x(just_written) \ + x(dying) \ + x(fake) \ + x(need_rewrite) \ + x(never_write) + +enum btree_flags { + /* First bits for btree node write type */ + BTREE_NODE_FLAGS_START = BTREE_WRITE_TYPE_BITS - 1, +#define x(flag) BTREE_NODE_##flag, + BTREE_FLAGS() +#undef x +}; + +#define x(flag) \ +static inline bool btree_node_ ## 
flag(struct btree *b) \ +{ return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void set_btree_node_ ## flag(struct btree *b) \ +{ set_bit(BTREE_NODE_ ## flag, &b->flags); } \ + \ +static inline void clear_btree_node_ ## flag(struct btree *b) \ +{ clear_bit(BTREE_NODE_ ## flag, &b->flags); } + +BTREE_FLAGS() +#undef x + +static inline struct btree_write *btree_current_write(struct btree *b) +{ + return b->writes + btree_node_write_idx(b); +} + +static inline struct btree_write *btree_prev_write(struct btree *b) +{ + return b->writes + (btree_node_write_idx(b) ^ 1); +} + +static inline struct bset_tree *bset_tree_last(struct btree *b) +{ + EBUG_ON(!b->nsets); + return b->set + b->nsets - 1; +} + +static inline void * +__btree_node_offset_to_ptr(const struct btree *b, u16 offset) +{ + return (void *) ((u64 *) b->data + 1 + offset); +} + +static inline u16 +__btree_node_ptr_to_offset(const struct btree *b, const void *p) +{ + u16 ret = (u64 *) p - 1 - (u64 *) b->data; + + EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); + return ret; +} + +static inline struct bset *bset(const struct btree *b, + const struct bset_tree *t) +{ + return __btree_node_offset_to_ptr(b, t->data_offset); +} + +static inline void set_btree_bset_end(struct btree *b, struct bset_tree *t) +{ + t->end_offset = + __btree_node_ptr_to_offset(b, vstruct_last(bset(b, t))); +} + +static inline void set_btree_bset(struct btree *b, struct bset_tree *t, + const struct bset *i) +{ + t->data_offset = __btree_node_ptr_to_offset(b, i); + set_btree_bset_end(b, t); +} + +static inline struct bset *btree_bset_first(struct btree *b) +{ + return bset(b, b->set); +} + +static inline struct bset *btree_bset_last(struct btree *b) +{ + return bset(b, bset_tree_last(b)); +} + +static inline u16 +__btree_node_key_to_offset(const struct btree *b, const struct bkey_packed *k) +{ + return __btree_node_ptr_to_offset(b, k); +} + +static inline struct bkey_packed * +__btree_node_offset_to_key(const struct btree *b, u16 k) +{ + return __btree_node_offset_to_ptr(b, k); +} + +static inline unsigned btree_bkey_first_offset(const struct bset_tree *t) +{ + return t->data_offset + offsetof(struct bset, _data) / sizeof(u64); +} + +#define btree_bkey_first(_b, _t) \ +({ \ + EBUG_ON(bset(_b, _t)->start != \ + __btree_node_offset_to_key(_b, btree_bkey_first_offset(_t)));\ + \ + bset(_b, _t)->start; \ +}) + +#define btree_bkey_last(_b, _t) \ +({ \ + EBUG_ON(__btree_node_offset_to_key(_b, (_t)->end_offset) != \ + vstruct_last(bset(_b, _t))); \ + \ + __btree_node_offset_to_key(_b, (_t)->end_offset); \ +}) + +static inline unsigned bset_u64s(struct bset_tree *t) +{ + return t->end_offset - t->data_offset - + sizeof(struct bset) / sizeof(u64); +} + +static inline unsigned bset_dead_u64s(struct btree *b, struct bset_tree *t) +{ + return bset_u64s(t) - b->nr.bset_u64s[t - b->set]; +} + +static inline unsigned bset_byte_offset(struct btree *b, void *i) +{ + return i - (void *) b->data; +} + +enum btree_node_type { +#define x(kwd, val) BKEY_TYPE_##kwd = val, + BCH_BTREE_IDS() +#undef x + BKEY_TYPE_btree, +}; + +/* Type of a key in btree @id at level @level: */ +static inline enum btree_node_type __btree_node_type(unsigned level, enum btree_id id) +{ + return level ? 
BKEY_TYPE_btree : (enum btree_node_type) id; +} + +/* Type of keys @b contains: */ +static inline enum btree_node_type btree_node_type(struct btree *b) +{ + return __btree_node_type(b->c.level, b->c.btree_id); +} + +#define BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS \ + ((1U << BKEY_TYPE_extents)| \ + (1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_reflink)| \ + (1U << BKEY_TYPE_btree)) + +#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \ + ((1U << BKEY_TYPE_alloc)| \ + (1U << BKEY_TYPE_inodes)| \ + (1U << BKEY_TYPE_stripes)| \ + (1U << BKEY_TYPE_snapshots)) + +#define BTREE_NODE_TYPE_HAS_TRIGGERS \ + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \ + BTREE_NODE_TYPE_HAS_MEM_TRIGGERS) + +#define BTREE_ID_IS_EXTENTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)| \ + (1U << BTREE_ID_freespace)) + +static inline bool btree_node_type_is_extents(enum btree_node_type type) +{ + return (1U << type) & BTREE_ID_IS_EXTENTS; +} + +static inline bool btree_id_is_extents(enum btree_id btree) +{ + return btree_node_type_is_extents((enum btree_node_type) btree); +} + +#define BTREE_ID_HAS_SNAPSHOTS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_inodes)| \ + (1U << BTREE_ID_dirents)| \ + (1U << BTREE_ID_xattrs)) + +#define BTREE_ID_HAS_PTRS \ + ((1U << BTREE_ID_extents)| \ + (1U << BTREE_ID_reflink)) + +static inline bool btree_type_has_snapshots(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_SNAPSHOTS; +} + +static inline bool btree_type_has_ptrs(enum btree_id id) +{ + return (1 << id) & BTREE_ID_HAS_PTRS; +} + +static inline bool btree_node_type_needs_gc(enum btree_node_type type) +{ + return BTREE_NODE_TYPE_HAS_TRIGGERS & (1U << type); +} + +struct btree_root { + struct btree *b; + + /* On disk root - see async splits: */ + __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX); + u8 level; + u8 alive; + s8 error; +}; + +enum btree_gc_coalesce_fail_reason { + BTREE_GC_COALESCE_FAIL_RESERVE_GET, + BTREE_GC_COALESCE_FAIL_KEYLIST_REALLOC, + BTREE_GC_COALESCE_FAIL_FORMAT_FITS, +}; + +enum btree_node_sibling { + btree_prev_sib, + btree_next_sib, +}; + +#endif /* _BCACHEFS_BTREE_TYPES_H */ diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h new file mode 100644 index 000000000..f794c9d10 --- /dev/null +++ b/fs/bcachefs/btree_update.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_H +#define _BCACHEFS_BTREE_UPDATE_H + +#include "btree_iter.h" +#include "journal.h" +#include "journal.h" + +struct bch_fs; +struct btree; + +void bch2_btree_node_prep_for_write(struct btree_trans *, + struct btree_path *, struct btree *); +bool bch2_btree_bset_insert_key(struct btree_trans *, struct btree_path *, + struct btree *, struct btree_node_iter *, + struct bkey_i *); + +int bch2_btree_node_flush0(struct journal *, struct journal_entry_pin *, u64); +int bch2_btree_node_flush1(struct journal *, struct journal_entry_pin *, u64); +void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64); + +void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *, + struct bkey_i *, u64); + +enum btree_insert_flags { + /* First bits for bch_watermark: */ + __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS, + __BTREE_INSERT_NOCHECK_RW, + __BTREE_INSERT_LAZY_RW, + __BTREE_INSERT_JOURNAL_REPLAY, + __BTREE_INSERT_JOURNAL_RECLAIM, + __BTREE_INSERT_NOWAIT, + __BTREE_INSERT_GC_LOCK_HELD, + __BCH_HASH_SET_MUST_CREATE, + __BCH_HASH_SET_MUST_REPLACE, +}; + +/* Don't check for -ENOSPC: */ +#define BTREE_INSERT_NOFAIL 
BIT(__BTREE_INSERT_NOFAIL) + +#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW) +#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW) + +/* Insert is for journal replay - don't get journal reservations: */ +#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY) + +/* Insert is being called from journal reclaim path: */ +#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM) + +/* Don't block on allocation failure (for new btree nodes: */ +#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT) +#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD) + +#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE) +#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE) + +int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *, + unsigned, unsigned); +int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned); +int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos); + +int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id, + struct bkey_i *, enum btree_update_flags); + +int __bch2_btree_insert(struct btree_trans *, enum btree_id, struct bkey_i *, + enum btree_update_flags); +int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *, + struct disk_reservation *, u64 *, int flags); + +int bch2_btree_delete_range_trans(struct btree_trans *, enum btree_id, + struct bpos, struct bpos, unsigned, u64 *); +int bch2_btree_delete_range(struct bch_fs *, enum btree_id, + struct bpos, struct bpos, unsigned, u64 *); + +int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, + struct btree *, unsigned); +void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); +int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, + struct btree *, struct bkey_i *, + unsigned, bool); +int bch2_btree_node_update_key_get_iter(struct btree_trans *, struct btree *, + struct bkey_i *, unsigned, bool); + +int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, + struct bpos, struct bpos); + +/* + * For use when splitting extents in existing snapshots: + * + * If @old_pos is an interior snapshot node, iterate over descendent snapshot + * nodes: for every descendent snapshot in whiche @old_pos is overwritten and + * not visible, emit a whiteout at @new_pos. + */ +static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id btree, + struct bpos old_pos, + struct bpos new_pos) +{ + if (!btree_type_has_snapshots(btree) || + bkey_eq(old_pos, new_pos)) + return 0; + + return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); +} + +int bch2_trans_update_extent(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); + +int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, + enum btree_id, struct bpos); + +int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_update_flags); +int __must_check bch2_trans_update_buffered(struct btree_trans *, + enum btree_id, struct bkey_i *); + +void bch2_trans_commit_hook(struct btree_trans *, + struct btree_trans_commit_hook *); +int __bch2_trans_commit(struct btree_trans *, unsigned); + +int bch2_fs_log_msg(struct bch_fs *, const char *, ...); +int bch2_journal_log_msg(struct bch_fs *, const char *, ...); + +/** + * bch2_trans_commit - insert keys at given iterator positions + * + * This is main entry point for btree updates. 
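+ * Callers normally go through the commit_do()/bch2_trans_do() helpers defined + * below, which combine an update with the commit and retry the transaction on + * restart.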
+ * + * Return values: + * -EROFS: filesystem read only + * -EIO: journal or btree node IO error + */ +static inline int bch2_trans_commit(struct btree_trans *trans, + struct disk_reservation *disk_res, + u64 *journal_seq, + unsigned flags) +{ + trans->disk_res = disk_res; + trans->journal_seq = journal_seq; + + return __bch2_trans_commit(trans, flags); +} + +#define commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + +#define nested_commit_do(_trans, _disk_res, _journal_seq, _flags, _do) \ + nested_lockrestart_do(_trans, _do ?: bch2_trans_commit(_trans, (_disk_res),\ + (_journal_seq), (_flags))) + +#define bch2_trans_do(_c, _disk_res, _journal_seq, _flags, _do) \ +({ \ + struct btree_trans trans; \ + int _ret; \ + \ + bch2_trans_init(&trans, (_c), 0, 0); \ + _ret = commit_do(&trans, _disk_res, _journal_seq, _flags, _do); \ + bch2_trans_exit(&trans); \ + \ + _ret; \ +}) + +#define bch2_trans_run(_c, _do) \ +({ \ + struct btree_trans trans; \ + int _ret; \ + \ + bch2_trans_init(&trans, (_c), 0, 0); \ + _ret = (_do); \ + bch2_trans_exit(&trans); \ + \ + _ret; \ +}) + +#define trans_for_each_update(_trans, _i) \ + for ((_i) = (_trans)->updates; \ + (_i) < (_trans)->updates + (_trans)->nr_updates; \ + (_i)++) + +#define trans_for_each_wb_update(_trans, _i) \ + for ((_i) = (_trans)->wb_updates; \ + (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \ + (_i)++) + +static inline void bch2_trans_reset_updates(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) + bch2_path_put(trans, i->path, true); + + trans->extra_journal_res = 0; + trans->nr_updates = 0; + trans->nr_wb_updates = 0; + trans->wb_updates = NULL; + trans->hooks = NULL; + trans->extra_journal_entries.nr = 0; + + if (trans->fs_usage_deltas) { + trans->fs_usage_deltas->used = 0; + memset((void *) trans->fs_usage_deltas + + offsetof(struct replicas_delta_list, memset_start), 0, + (void *) &trans->fs_usage_deltas->memset_end - + (void *) &trans->fs_usage_deltas->memset_start); + } +} + +static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, + unsigned type, unsigned min_bytes) +{ + unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); + struct bkey_i *mut; + + if (type && k.k->type != type) + return ERR_PTR(-ENOENT); + + mut = bch2_trans_kmalloc_nomemzero(trans, bytes); + if (!IS_ERR(mut)) { + bkey_reassemble(mut, k); + + if (unlikely(bytes > bkey_bytes(k.k))) { + memset((void *) mut + bkey_bytes(k.k), 0, + bytes - bkey_bytes(k.k)); + mut->k.u64s = DIV_ROUND_UP(bytes, sizeof(u64)); + } + } + return mut; +} + +static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) +{ + return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); +} + +#define bch2_bkey_make_mut_noupdate_typed(_trans, _k, _type) \ + bkey_i_to_##_type(__bch2_bkey_make_mut_noupdate(_trans, _k, \ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k, unsigned flags, + unsigned type, unsigned min_bytes) +{ + struct bkey_i *mut = __bch2_bkey_make_mut_noupdate(trans, *k, type, min_bytes); + int ret; + + if (IS_ERR(mut)) + return mut; + + ret = bch2_trans_update(trans, iter, mut, flags); + if (ret) + return ERR_PTR(ret); + + *k = bkey_i_to_s_c(mut); + return mut; +} + +static inline struct bkey_i 
*bch2_bkey_make_mut(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c *k, unsigned flags) +{ + return __bch2_bkey_make_mut(trans, iter, k, flags, 0, 0); +} + +#define bch2_bkey_make_mut_typed(_trans, _iter, _k, _flags, _type) \ + bkey_i_to_##_type(__bch2_bkey_make_mut(_trans, _iter, _k, _flags,\ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_get_mut_noupdate(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, unsigned min_bytes) +{ + struct bkey_s_c k = __bch2_bkey_get_iter(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type); + struct bkey_i *ret = unlikely(IS_ERR(k.k)) + ? ERR_CAST(k.k) + : __bch2_bkey_make_mut_noupdate(trans, k, 0, min_bytes); + if (unlikely(IS_ERR(ret))) + bch2_trans_iter_exit(trans, iter); + return ret; +} + +static inline struct bkey_i *bch2_bkey_get_mut_noupdate(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_mut_noupdate(trans, iter, btree_id, pos, flags, 0, 0); +} + +static inline struct bkey_i *__bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned type, unsigned min_bytes) +{ + struct bkey_i *mut = __bch2_bkey_get_mut_noupdate(trans, iter, + btree_id, pos, flags|BTREE_ITER_INTENT, type, min_bytes); + int ret; + + if (IS_ERR(mut)) + return mut; + + ret = bch2_trans_update(trans, iter, mut, flags); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ERR_PTR(ret); + } + + return mut; +} + +static inline struct bkey_i *bch2_bkey_get_mut_minsize(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags, unsigned min_bytes) +{ + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, min_bytes); +} + +static inline struct bkey_i *bch2_bkey_get_mut(struct btree_trans *trans, + struct btree_iter *iter, + unsigned btree_id, struct bpos pos, + unsigned flags) +{ + return __bch2_bkey_get_mut(trans, iter, btree_id, pos, flags, 0, 0); +} + +#define bch2_bkey_get_mut_typed(_trans, _iter, _btree_id, _pos, _flags, _type)\ + bkey_i_to_##_type(__bch2_bkey_get_mut(_trans, _iter, \ + _btree_id, _pos, _flags, \ + KEY_TYPE_##_type, sizeof(struct bkey_i_##_type))) + +static inline struct bkey_i *__bch2_bkey_alloc(struct btree_trans *trans, struct btree_iter *iter, + unsigned flags, unsigned type, unsigned val_size) +{ + struct bkey_i *k = bch2_trans_kmalloc(trans, sizeof(*k) + val_size); + int ret; + + if (IS_ERR(k)) + return k; + + bkey_init(&k->k); + k->k.p = iter->pos; + k->k.type = type; + set_bkey_val_bytes(&k->k, val_size); + + ret = bch2_trans_update(trans, iter, k, flags); + if (unlikely(ret)) + return ERR_PTR(ret); + return k; +} + +#define bch2_bkey_alloc(_trans, _iter, _flags, _type) \ + bkey_i_to_##_type(__bch2_bkey_alloc(_trans, _iter, _flags, \ + KEY_TYPE_##_type, sizeof(struct bch_##_type))) + +#endif /* _BCACHEFS_BTREE_UPDATE_H */ diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c new file mode 100644 index 000000000..3659b2c08 --- /dev/null +++ b/fs/bcachefs/btree_update_interior.c @@ -0,0 +1,2488 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include 
"btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "extents.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" +#include "recovery.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" + +#include + +static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, + struct btree_path *, struct btree *, + struct keylist *, unsigned); +static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); + +static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans, + enum btree_id btree_id, + unsigned level, + struct bpos pos) +{ + struct btree_path *path; + + path = bch2_path_get(trans, btree_id, pos, level + 1, level, + BTREE_ITER_NOPRESERVE| + BTREE_ITER_INTENT, _RET_IP_); + path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_); + bch2_btree_path_downgrade(trans, path); + __bch2_btree_path_unlock(trans, path); + return path; +} + +/* Debug code: */ + +/* + * Verify that child nodes correctly span parent node's range: + */ +static void btree_node_interior_verify(struct bch_fs *c, struct btree *b) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bpos next_node = b->data->min_key; + struct btree_node_iter iter; + struct bkey_s_c k; + struct bkey_s_c_btree_ptr_v2 bp; + struct bkey unpacked; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + BUG_ON(!b->c.level); + + if (!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) + return; + + bch2_btree_node_iter_init_from_start(&iter, b); + + while (1) { + k = bch2_btree_node_iter_peek_unpack(&iter, b, &unpacked); + if (k.k->type != KEY_TYPE_btree_ptr_v2) + break; + bp = bkey_s_c_to_btree_ptr_v2(k); + + if (!bpos_eq(next_node, bp.v->min_key)) { + bch2_dump_btree_node(c, b); + bch2_bpos_to_text(&buf1, next_node); + bch2_bpos_to_text(&buf2, bp.v->min_key); + panic("expected next min_key %s got %s\n", buf1.buf, buf2.buf); + } + + bch2_btree_node_iter_advance(&iter, b); + + if (bch2_btree_node_iter_end(&iter)) { + if (!bpos_eq(k.k->p, b->key.k.p)) { + bch2_dump_btree_node(c, b); + bch2_bpos_to_text(&buf1, b->key.k.p); + bch2_bpos_to_text(&buf2, k.k->p); + panic("expected end %s got %s\n", buf1.buf, buf2.buf); + } + break; + } + + next_node = bpos_successor(k.k->p); + } +#endif +} + +/* Calculate ideal packed bkey format for new btree nodes: */ + +void __bch2_btree_calc_format(struct bkey_format_state *s, struct btree *b) +{ + struct bkey_packed *k; + struct bset_tree *t; + struct bkey uk; + + for_each_bset(b, t) + bset_tree_for_each_key(b, t, k) + if (!bkey_deleted(k)) { + uk = bkey_unpack_key(b, k); + bch2_bkey_format_add_key(s, &uk); + } +} + +static struct bkey_format bch2_btree_calc_format(struct btree *b) +{ + struct bkey_format_state s; + + bch2_bkey_format_init(&s); + bch2_bkey_format_add_pos(&s, b->data->min_key); + bch2_bkey_format_add_pos(&s, b->data->max_key); + __bch2_btree_calc_format(&s, b); + + return bch2_bkey_format_done(&s); +} + +static size_t btree_node_u64s_with_format(struct btree *b, + struct bkey_format *new_f) +{ + struct bkey_format *old_f = &b->format; + + /* stupid integer promotion rules */ + ssize_t delta = + (((int) new_f->key_u64s - old_f->key_u64s) * + (int) b->nr.packed_keys) + + (((int) new_f->key_u64s - BKEY_U64s) * + (int) b->nr.unpacked_keys); + + BUG_ON(delta + b->nr.live_u64s < 0); + + return b->nr.live_u64s + delta; +} + +/** + * btree_node_format_fits - check if we could rewrite node with a new format + * + * This assumes all keys can pack 
with the new format -- it just checks if + * the re-packed keys would fit inside the node itself. + */ +bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b, + struct bkey_format *new_f) +{ + size_t u64s = btree_node_u64s_with_format(b, new_f); + + return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c); +} + +/* Btree node freeing/allocation: */ + +static void __btree_node_free(struct bch_fs *c, struct btree *b) +{ + trace_and_count(c, btree_node_free, c, b); + + BUG_ON(btree_node_write_blocked(b)); + BUG_ON(btree_node_dirty(b)); + BUG_ON(btree_node_need_write(b)); + BUG_ON(b == btree_node_root(c, b)); + BUG_ON(b->ob.nr); + BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON(b->will_make_reachable); + + clear_btree_node_noevict(b); + + mutex_lock(&c->btree_cache.lock); + list_move(&b->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); +} + +static void bch2_btree_node_free_inmem(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = trans->c; + unsigned level = b->c.level; + + bch2_btree_node_lock_write_nofail(trans, path, &b->c); + bch2_btree_node_hash_remove(&c->btree_cache, b); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); + mark_btree_node_locked_noreset(path, level, SIX_LOCK_intent); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } +} + +static void bch2_btree_node_free_never_used(struct btree_update *as, + struct btree_trans *trans, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL]; + struct btree_path *path; + unsigned level = b->c.level; + + BUG_ON(!list_empty(&b->write_blocked)); + BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as)); + + b->will_make_reachable = 0; + closure_put(&as->cl); + + clear_btree_node_will_make_reachable(b); + clear_btree_node_accessed(b); + clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + + mutex_lock(&c->btree_cache.lock); + list_del_init(&b->list); + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + + BUG_ON(p->nr >= ARRAY_SIZE(p->b)); + p->b[p->nr++] = b; + + six_unlock_intent(&b->c.lock); + + trans_for_each_path(trans, path) + if (path->l[level].b == b) { + btree_node_unlock(trans, path, level); + path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init); + } +} + +static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, + struct disk_reservation *res, + struct closure *cl, + bool interior_node, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct write_point *wp; + struct btree *b; + BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp; + struct open_buckets ob = { .nr = 0 }; + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + unsigned nr_reserve = watermark > BCH_WATERMARK_reclaim + ? 
BTREE_NODE_RESERVE + : 0; + int ret; + + mutex_lock(&c->btree_reserve_cache_lock); + if (c->btree_reserve_cache_nr > nr_reserve) { + struct btree_alloc *a = + &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; + + ob = a->ob; + bkey_copy(&tmp.k, &a->k); + mutex_unlock(&c->btree_reserve_cache_lock); + goto mem_alloc; + } + mutex_unlock(&c->btree_reserve_cache_lock); + +retry: + ret = bch2_alloc_sectors_start_trans(trans, + c->opts.metadata_target ?: + c->opts.foreground_target, + 0, + writepoint_ptr(&c->btree_write_point), + &devs_have, + res->nr_replicas, + c->opts.metadata_replicas_required, + watermark, 0, cl, &wp); + if (unlikely(ret)) + return ERR_PTR(ret); + + if (wp->sectors_free < btree_sectors(c)) { + struct open_bucket *ob; + unsigned i; + + open_bucket_for_each(c, &wp->ptrs, ob, i) + if (ob->sectors_free < btree_sectors(c)) + ob->sectors_free = 0; + + bch2_alloc_sectors_done(c, wp); + goto retry; + } + + bkey_btree_ptr_v2_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, &tmp.k, btree_sectors(c), false); + + bch2_open_bucket_get(c, wp, &ob); + bch2_alloc_sectors_done(c, wp); +mem_alloc: + b = bch2_btree_node_mem_alloc(trans, interior_node); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + + /* we hold cannibalize_lock: */ + BUG_ON(IS_ERR(b)); + BUG_ON(b->ob.nr); + + bkey_copy(&b->key, &tmp.k); + b->ob = ob; + + return b; +} + +static struct btree *bch2_btree_node_alloc(struct btree_update *as, + struct btree_trans *trans, + unsigned level) +{ + struct bch_fs *c = as->c; + struct btree *b; + struct prealloc_nodes *p = &as->prealloc_nodes[!!level]; + int ret; + + BUG_ON(level >= BTREE_MAX_DEPTH); + BUG_ON(!p->nr); + + b = p->b[--p->nr]; + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + + set_btree_node_accessed(b); + set_btree_node_dirty_acct(c, b); + set_btree_node_need_write(b); + + bch2_bset_init_first(b, &b->data->keys); + b->c.level = level; + b->c.btree_id = as->btree_id; + b->version_ondisk = c->sb.version; + + memset(&b->nr, 0, sizeof(b->nr)); + b->data->magic = cpu_to_le64(bset_magic(c)); + memset(&b->data->_ptr, 0, sizeof(b->data->_ptr)); + b->data->flags = 0; + SET_BTREE_NODE_ID(b->data, as->btree_id); + SET_BTREE_NODE_LEVEL(b->data, level); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + struct bkey_i_btree_ptr_v2 *bp = bkey_i_to_btree_ptr_v2(&b->key); + + bp->v.mem_ptr = 0; + bp->v.seq = b->data->keys.seq; + bp->v.sectors_written = 0; + } + + SET_BTREE_NODE_NEW_EXTENT_OVERWRITE(b->data, true); + + bch2_btree_build_aux_trees(b); + + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id); + BUG_ON(ret); + + trace_and_count(c, btree_node_alloc, c, b); + bch2_increment_clock(c, btree_sectors(c), WRITE); + return b; +} + +static void btree_set_min(struct btree *b, struct bpos pos) +{ + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(&b->key)->v.min_key = pos; + b->data->min_key = pos; +} + +static void btree_set_max(struct btree *b, struct bpos pos) +{ + b->key.k.p = pos; + b->data->max_key = pos; +} + +static struct btree *bch2_btree_node_alloc_replacement(struct btree_update *as, + struct btree_trans *trans, + struct btree *b) +{ + struct btree *n = bch2_btree_node_alloc(as, trans, b->c.level); + struct bkey_format format = bch2_btree_calc_format(b); + + /* + * The keys might expand with the new format - if they wouldn't fit in + * the btree node anymore, use the old format for now: + */ + if 
(!bch2_btree_node_format_fits(as->c, b, &format)) + format = b->format; + + SET_BTREE_NODE_SEQ(n->data, BTREE_NODE_SEQ(b->data) + 1); + + btree_set_min(n, b->data->min_key); + btree_set_max(n, b->data->max_key); + + n->data->format = format; + btree_node_set_format(n, format); + + bch2_btree_sort_into(as->c, n, b); + + btree_node_reset_sib_u64s(n); + return n; +} + +static struct btree *__btree_root_alloc(struct btree_update *as, + struct btree_trans *trans, unsigned level) +{ + struct btree *b = bch2_btree_node_alloc(as, trans, level); + + btree_set_min(b, POS_MIN); + btree_set_max(b, SPOS_MAX); + b->data->format = bch2_btree_calc_format(b); + + btree_node_set_format(b, b->data->format); + bch2_btree_build_aux_trees(b); + + return b; +} + +static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *trans) +{ + struct bch_fs *c = as->c; + struct prealloc_nodes *p; + + for (p = as->prealloc_nodes; + p < as->prealloc_nodes + ARRAY_SIZE(as->prealloc_nodes); + p++) { + while (p->nr) { + struct btree *b = p->b[--p->nr]; + + mutex_lock(&c->btree_reserve_cache_lock); + + if (c->btree_reserve_cache_nr < + ARRAY_SIZE(c->btree_reserve_cache)) { + struct btree_alloc *a = + &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; + + a->ob = b->ob; + b->ob.nr = 0; + bkey_copy(&a->k, &b->key); + } else { + bch2_open_buckets_put(c, &b->ob); + } + + mutex_unlock(&c->btree_reserve_cache_lock); + + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write); + __btree_node_free(c, b); + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + } + } +} + +static int bch2_btree_reserve_get(struct btree_trans *trans, + struct btree_update *as, + unsigned nr_nodes[2], + unsigned flags, + struct closure *cl) +{ + struct bch_fs *c = as->c; + struct btree *b; + unsigned interior; + int ret = 0; + + BUG_ON(nr_nodes[0] + nr_nodes[1] > BTREE_RESERVE_MAX); + + /* + * Protects reaping from the btree node cache and using the btree node + * open bucket reserve: + * + * BTREE_INSERT_NOWAIT only applies to btree node allocation, not + * blocking on this lock: + */ + ret = bch2_btree_cache_cannibalize_lock(c, cl); + if (ret) + return ret; + + for (interior = 0; interior < 2; interior++) { + struct prealloc_nodes *p = as->prealloc_nodes + interior; + + while (p->nr < nr_nodes[interior]) { + b = __bch2_btree_node_alloc(trans, &as->disk_res, + flags & BTREE_INSERT_NOWAIT ? 
NULL : cl, + interior, flags); + if (IS_ERR(b)) { + ret = PTR_ERR(b); + goto err; + } + + p->b[p->nr++] = b; + } + } +err: + bch2_btree_cache_cannibalize_unlock(c); + return ret; +} + +/* Asynchronous interior node update machinery */ + +static void bch2_btree_update_free(struct btree_update *as, struct btree_trans *trans) +{ + struct bch_fs *c = as->c; + + if (as->took_gc_lock) + up_read(&c->gc_lock); + as->took_gc_lock = false; + + bch2_journal_preres_put(&c->journal, &as->journal_preres); + + bch2_journal_pin_drop(&c->journal, &as->journal); + bch2_journal_pin_flush(&c->journal, &as->journal); + bch2_disk_reservation_put(c, &as->disk_res); + bch2_btree_reserve_put(as, trans); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_total], + as->start_time); + + mutex_lock(&c->btree_interior_update_lock); + list_del(&as->unwritten_list); + list_del(&as->list); + + closure_debug_destroy(&as->cl); + mempool_free(as, &c->btree_interior_update_pool); + + /* + * Have to do the wakeup with btree_interior_update_lock still held, + * since being on btree_interior_update_list is our ref on @c: + */ + closure_wake_up(&c->btree_interior_update_wait); + + mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_add_key(struct btree_update *as, + struct keylist *keys, struct btree *b) +{ + struct bkey_i *k = &b->key; + + BUG_ON(bch2_keylist_u64s(keys) + k->k.u64s > + ARRAY_SIZE(as->_old_keys)); + + bkey_copy(keys->top, k); + bkey_i_to_btree_ptr_v2(keys->top)->v.mem_ptr = b->c.level + 1; + + bch2_keylist_push(keys); +} + +/* + * The transactional part of an interior btree node update, where we journal the + * update we did to the interior node and update alloc info: + */ +static int btree_update_nodes_written_trans(struct btree_trans *trans, + struct btree_update *as) +{ + struct bkey_i *k; + int ret; + + ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s); + if (ret) + return ret; + + memcpy(&darray_top(trans->extra_journal_entries), + as->journal_entries, + as->journal_u64s * sizeof(u64)); + trans->extra_journal_entries.nr += as->journal_u64s; + + trans->journal_pin = &as->journal; + + for_each_keylist_key(&as->old_keys, k) { + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0); + if (ret) + return ret; + } + + for_each_keylist_key(&as->new_keys, k) { + unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr; + + ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0); + if (ret) + return ret; + } + + return 0; +} + +static void btree_update_nodes_written(struct btree_update *as) +{ + struct bch_fs *c = as->c; + struct btree *b; + struct btree_trans trans; + u64 journal_seq = 0; + unsigned i; + int ret; + + bch2_trans_init(&trans, c, 0, 512); + /* + * If we're already in an error state, it might be because a btree node + * was never written, and we might be trying to free that same btree + * node here, but it won't have been marked as allocated and we'll see + * spurious disk usage inconsistencies in the transactional part below + * if we don't skip it: + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + + /* + * Wait for any in flight writes to finish before we free the old nodes + * on disk: + */ + for (i = 0; i < as->nr_old_nodes; i++) { + __le64 seq; + + b = as->old_nodes[i]; + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + seq = b->data ? 
b->data->keys.seq : 0; + six_unlock_read(&b->c.lock); + + if (seq == as->old_nodes_seq[i]) + wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, + TASK_UNINTERRUPTIBLE); + } + + /* + * We did an update to a parent node where the pointers we added pointed + * to child nodes that weren't written yet: now, the child nodes have + * been written so we can write out the update to the interior node. + */ + + /* + * We can't call into journal reclaim here: we'd block on the journal + * reclaim lock, but we may need to release the open buckets we have + * pinned in order for other btree updates to make forward progress, and + * journal reclaim does btree updates when flushing bkey_cached entries, + * which may require allocations as well. + */ + ret = commit_do(&trans, &as->disk_res, &journal_seq, + BCH_WATERMARK_reclaim| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_JOURNAL_RECLAIM, + btree_update_nodes_written_trans(&trans, as)); + bch2_trans_unlock(&trans); + + bch2_fs_fatal_err_on(ret && !bch2_journal_error(&c->journal), c, + "%s(): error %s", __func__, bch2_err_str(ret)); +err: + if (as->b) { + struct btree_path *path; + + b = as->b; + path = get_unlocked_mut_path(&trans, as->btree_id, b->c.level, b->key.k.p); + /* + * @b is the node we did the final insert into: + * + * On failure to get a journal reservation, we still have to + * unblock the write and allow most of the write path to happen + * so that shutdown works, but the i->journal_seq mechanism + * won't work to prevent the btree write from being visible (we + * didn't get a journal sequence number) - instead + * __bch2_btree_node_write() doesn't do the actual write if + * we're in journal error state: + */ + + /* + * Ensure transaction is unlocked before using + * btree_node_lock_nopath() (the use of which is always suspect, + * we need to work on removing this in the future) + * + * It should be, but get_unlocked_mut_path() -> bch2_path_get() + * calls bch2_path_upgrade(), before we call path_make_mut(), so + * we may rarely end up with a locked path besides the one we + * have here: + */ + bch2_trans_unlock(&trans); + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_intent); + mark_btree_node_locked(&trans, path, b->c.level, SIX_LOCK_intent); + path->l[b->c.level].lock_seq = six_lock_seq(&b->c.lock); + path->l[b->c.level].b = b; + + bch2_btree_node_lock_write_nofail(&trans, path, &b->c); + + mutex_lock(&c->btree_interior_update_lock); + + list_del(&as->write_blocked_list); + if (list_empty(&b->write_blocked)) + clear_btree_node_write_blocked(b); + + /* + * Node might have been freed, recheck under + * btree_interior_update_lock: + */ + if (as->b == b) { + struct bset *i = btree_bset_last(b); + + BUG_ON(!b->c.level); + BUG_ON(!btree_node_dirty(b)); + + if (!ret) { + i->journal_seq = cpu_to_le64( + max(journal_seq, + le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + } else { + /* + * If we didn't get a journal sequence number we + * can't write this btree node, because recovery + * won't know to ignore this write: + */ + set_btree_node_never_write(b); + } + } + + mutex_unlock(&c->btree_interior_update_lock); + + mark_btree_node_locked_noreset(path, b->c.level, SIX_LOCK_intent); + six_unlock_write(&b->c.lock); + + btree_node_write_if_need(c, b, SIX_LOCK_intent); + btree_node_unlock(&trans, path, b->c.level); + bch2_path_put(&trans, path, true); + } + + bch2_journal_pin_drop(&c->journal, &as->journal); + + bch2_journal_preres_put(&c->journal, &as->journal_preres); + + 
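/* + * The new nodes are now reachable on disk: clear will_make_reachable and + * write them out: + */ +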
mutex_lock(&c->btree_interior_update_lock); + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; + + BUG_ON(b->will_make_reachable != (unsigned long) as); + b->will_make_reachable = 0; + clear_btree_node_will_make_reachable(b); + } + mutex_unlock(&c->btree_interior_update_lock); + + for (i = 0; i < as->nr_new_nodes; i++) { + b = as->new_nodes[i]; + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + } + + for (i = 0; i < as->nr_open_buckets; i++) + bch2_open_bucket_put(c, c->open_buckets + as->open_buckets[i]); + + bch2_btree_update_free(as, &trans); + bch2_trans_exit(&trans); +} + +static void btree_interior_update_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, btree_interior_update_work); + struct btree_update *as; + + while (1) { + mutex_lock(&c->btree_interior_update_lock); + as = list_first_entry_or_null(&c->btree_interior_updates_unwritten, + struct btree_update, unwritten_list); + if (as && !as->nodes_written) + as = NULL; + mutex_unlock(&c->btree_interior_update_lock); + + if (!as) + break; + + btree_update_nodes_written(as); + } +} + +static void btree_update_set_nodes_written(struct closure *cl) +{ + struct btree_update *as = container_of(cl, struct btree_update, cl); + struct bch_fs *c = as->c; + + mutex_lock(&c->btree_interior_update_lock); + as->nodes_written = true; + mutex_unlock(&c->btree_interior_update_lock); + + queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); +} + +/* + * We're updating @b with pointers to nodes that haven't finished writing yet: + * block @b from being written until @as completes + */ +static void btree_update_updated_node(struct btree_update *as, struct btree *b) +{ + struct bch_fs *c = as->c; + + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + BUG_ON(!btree_node_dirty(b)); + BUG_ON(!b->c.level); + + as->mode = BTREE_INTERIOR_UPDATING_NODE; + as->b = b; + + set_btree_node_write_blocked(b); + list_add(&as->write_blocked_list, &b->write_blocked); + + mutex_unlock(&c->btree_interior_update_lock); +} + +static void btree_update_reparent(struct btree_update *as, + struct btree_update *child) +{ + struct bch_fs *c = as->c; + + lockdep_assert_held(&c->btree_interior_update_lock); + + child->b = NULL; + child->mode = BTREE_INTERIOR_UPDATING_AS; + + bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL); +} + +static void btree_update_updated_root(struct btree_update *as, struct btree *b) +{ + struct bkey_i *insert = &b->key; + struct bch_fs *c = as->c; + + BUG_ON(as->mode != BTREE_INTERIOR_NO_UPDATE); + + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); + + as->journal_u64s += + journal_entry_set((void *) &as->journal_entries[as->journal_u64s], + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + insert, insert->k.u64s); + + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->unwritten_list, &c->btree_interior_updates_unwritten); + + as->mode = BTREE_INTERIOR_UPDATING_ROOT; + mutex_unlock(&c->btree_interior_update_lock); +} + +/* + * bch2_btree_update_add_new_node: + * + * This causes @as to wait on @b to be written, before it gets to + * bch2_btree_update_nodes_written + * + * Additionally, it sets b->will_make_reachable to prevent any additional writes + * to @b from happening 
besides the first until @b is reachable on disk + * + * And it adds @b to the list of @as's new nodes, so that we can update sector + * counts in bch2_btree_update_nodes_written: + */ +static void bch2_btree_update_add_new_node(struct btree_update *as, struct btree *b) +{ + struct bch_fs *c = as->c; + + closure_get(&as->cl); + + mutex_lock(&c->btree_interior_update_lock); + BUG_ON(as->nr_new_nodes >= ARRAY_SIZE(as->new_nodes)); + BUG_ON(b->will_make_reachable); + + as->new_nodes[as->nr_new_nodes++] = b; + b->will_make_reachable = 1UL|(unsigned long) as; + set_btree_node_will_make_reachable(b); + + mutex_unlock(&c->btree_interior_update_lock); + + btree_update_add_key(as, &as->new_keys, b); + + if (b->key.k.type == KEY_TYPE_btree_ptr_v2) { + unsigned bytes = vstruct_end(&b->data->keys) - (void *) b->data; + unsigned sectors = round_up(bytes, block_bytes(c)) >> 9; + + bkey_i_to_btree_ptr_v2(&b->key)->v.sectors_written = + cpu_to_le16(sectors); + } +} + +/* + * returns true if @b was a new node + */ +static void btree_update_drop_new_node(struct bch_fs *c, struct btree *b) +{ + struct btree_update *as; + unsigned long v; + unsigned i; + + mutex_lock(&c->btree_interior_update_lock); + /* + * When b->will_make_reachable != 0, it owns a ref on as->cl that's + * dropped when it gets written by bch2_btree_complete_write - the + * xchg() is for synchronization with bch2_btree_complete_write: + */ + v = xchg(&b->will_make_reachable, 0); + clear_btree_node_will_make_reachable(b); + as = (struct btree_update *) (v & ~1UL); + + if (!as) { + mutex_unlock(&c->btree_interior_update_lock); + return; + } + + for (i = 0; i < as->nr_new_nodes; i++) + if (as->new_nodes[i] == b) + goto found; + + BUG(); +found: + array_remove_item(as->new_nodes, as->nr_new_nodes, i); + mutex_unlock(&c->btree_interior_update_lock); + + if (v & 1) + closure_put(&as->cl); +} + +static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct btree *b) +{ + while (b->ob.nr) + as->open_buckets[as->nr_open_buckets++] = + b->ob.v[--b->ob.nr]; +} + +/* + * @b is being split/rewritten: it may have pointers to not-yet-written btree + * nodes and thus outstanding btree_updates - redirect @b's + * btree_updates to point to this btree_update: + */ +static void bch2_btree_interior_update_will_free_node(struct btree_update *as, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct btree_update *p, *n; + struct btree_write *w; + + set_btree_node_dying(b); + + if (btree_node_fake(b)) + return; + + mutex_lock(&c->btree_interior_update_lock); + + /* + * Does this node have any btree_update operations preventing + * it from being written? + * + * If so, redirect them to point to this btree_update: we can + * write out our new nodes, but we won't make them visible until those + * operations complete + */ + list_for_each_entry_safe(p, n, &b->write_blocked, write_blocked_list) { + list_del_init(&p->write_blocked_list); + btree_update_reparent(as, p); + + /* + * for flush_held_btree_writes() waiting on updates to flush or + * nodes to be writeable: + */ + closure_wake_up(&c->btree_interior_update_wait); + } + + clear_btree_node_dirty_acct(c, b); + clear_btree_node_need_write(b); + clear_btree_node_write_blocked(b); + + /* + * Does this node have unwritten data that has a pin on the journal? + * + * If so, transfer that pin to the btree_update operation - + * note that if we're freeing multiple nodes, we only need to keep the + * oldest pin of any of the nodes we're freeing. 
We'll release the pin + * when the new nodes are persistent and reachable on disk: + */ + w = btree_current_write(b); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_drop(&c->journal, &w->journal); + + w = btree_prev_write(b); + bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL); + bch2_journal_pin_drop(&c->journal, &w->journal); + + mutex_unlock(&c->btree_interior_update_lock); + + /* + * Is this a node that isn't reachable on disk yet? + * + * Nodes that aren't reachable yet have writes blocked until they're + * reachable - now that we've cancelled any pending writes and moved + * things waiting on that write to wait on this update, we can drop this + * node from the list of nodes that the other update is making + * reachable, prior to freeing it: + */ + btree_update_drop_new_node(c, b); + + btree_update_add_key(as, &as->old_keys, b); + + as->old_nodes[as->nr_old_nodes] = b; + as->old_nodes_seq[as->nr_old_nodes] = b->data->keys.seq; + as->nr_old_nodes++; +} + +static void bch2_btree_update_done(struct btree_update *as, struct btree_trans *trans) +{ + struct bch_fs *c = as->c; + u64 start_time = as->start_time; + + BUG_ON(as->mode == BTREE_INTERIOR_NO_UPDATE); + + if (as->took_gc_lock) + up_read(&as->c->gc_lock); + as->took_gc_lock = false; + + bch2_btree_reserve_put(as, trans); + + continue_at(&as->cl, btree_update_set_nodes_written, + as->c->btree_interior_update_worker); + + bch2_time_stats_update(&c->times[BCH_TIME_btree_interior_update_foreground], + start_time); +} + +static struct btree_update * +bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, + unsigned level, bool split, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_update *as; + u64 start_time = local_clock(); + int disk_res_flags = (flags & BTREE_INSERT_NOFAIL) + ? BCH_DISK_RESERVATION_NOFAIL : 0; + unsigned nr_nodes[2] = { 0, 0 }; + unsigned update_level = level; + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + unsigned journal_flags = 0; + int ret = 0; + u32 restart_count = trans->restart_count; + + BUG_ON(!path->should_be_locked); + + if (watermark == BCH_WATERMARK_copygc) + watermark = BCH_WATERMARK_btree_copygc; + if (watermark < BCH_WATERMARK_btree) + watermark = BCH_WATERMARK_btree; + + flags &= ~BCH_WATERMARK_MASK; + flags |= watermark; + + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) + journal_flags |= JOURNAL_RES_GET_NONBLOCK; + journal_flags |= watermark; + + while (1) { + nr_nodes[!!update_level] += 1 + split; + update_level++; + + ret = bch2_btree_path_upgrade(trans, path, update_level + 1); + if (ret) + return ERR_PTR(ret); + + if (!btree_path_node(path, update_level)) { + /* Allocating new root? 
*/ + nr_nodes[1] += split; + update_level = BTREE_MAX_DEPTH; + break; + } + + if (bch2_btree_node_insert_fits(c, path->l[update_level].b, + BKEY_BTREE_PTR_U64s_MAX * (1 + split))) + break; + + split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c); + } + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + else if (!down_read_trylock(&c->gc_lock)) { + ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0)); + if (ret) { + up_read(&c->gc_lock); + return ERR_PTR(ret); + } + } + + as = mempool_alloc(&c->btree_interior_update_pool, GFP_NOFS); + memset(as, 0, sizeof(*as)); + closure_init(&as->cl, NULL); + as->c = c; + as->start_time = start_time; + as->mode = BTREE_INTERIOR_NO_UPDATE; + as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD); + as->btree_id = path->btree_id; + as->update_level = update_level; + INIT_LIST_HEAD(&as->list); + INIT_LIST_HEAD(&as->unwritten_list); + INIT_LIST_HEAD(&as->write_blocked_list); + bch2_keylist_init(&as->old_keys, as->_old_keys); + bch2_keylist_init(&as->new_keys, as->_new_keys); + bch2_keylist_init(&as->parent_keys, as->inline_keys); + + mutex_lock(&c->btree_interior_update_lock); + list_add_tail(&as->list, &c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + /* + * We don't want to allocate if we're in an error state, that can cause + * deadlock on emergency shutdown due to open buckets getting stuck in + * the btree_reserve_cache after allocator shutdown has cleared it out. + * This check needs to come after adding us to the btree_interior_update + * list but before calling bch2_btree_reserve_get, to synchronize with + * __bch2_fs_read_only(). + */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + + ret = bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, + journal_flags|JOURNAL_RES_GET_NONBLOCK); + if (ret) { + if (flags & BTREE_INSERT_JOURNAL_RECLAIM) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + goto err; + } + + ret = drop_locks_do(trans, + bch2_journal_preres_get(&c->journal, &as->journal_preres, + BTREE_UPDATE_JOURNAL_RES, + journal_flags)); + if (ret == -BCH_ERR_journal_preres_get_blocked) { + trace_and_count(c, trans_restart_journal_preres_get, trans, _RET_IP_, journal_flags); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_journal_preres_get); + } + if (ret) + goto err; + } + + ret = bch2_disk_reservation_get(c, &as->disk_res, + (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), + c->opts.metadata_replicas, + disk_res_flags); + if (ret) + goto err; + + ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + if (bch2_err_matches(ret, ENOSPC) || + bch2_err_matches(ret, ENOMEM)) { + struct closure cl; + + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if (bch2_err_matches(ret, ENOSPC) && + (flags & BTREE_INSERT_JOURNAL_RECLAIM) && + watermark != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + goto err; + } + + closure_init_stack(&cl); + + do { + ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); + + bch2_trans_unlock(trans); + closure_sync(&cl); + } while (bch2_err_matches(ret, BCH_ERR_operation_blocked)); + } + + if (ret) { + trace_and_count(c, btree_reserve_get_fail, trans->fn, + _RET_IP_, nr_nodes[0] + nr_nodes[1], ret); + goto err; + } + + ret = bch2_trans_relock(trans); + if (ret) + goto err; + + bch2_trans_verify_not_restarted(trans, restart_count); + return as; +err: + bch2_btree_update_free(as, trans); + 
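
bch2_btree_update_start() sizes its disk reservation and node reserve for the worst case: a split at the starting level that cascades all the way up and finally allocates a new root. A standalone sketch of that worst-case count, assuming every level needs a replacement node plus a sibling; split_reserve() is an invented helper, and the real loop stops early as soon as an insert fits, but the worst case agrees with btree_update_reserve_required() in the header below, (depth - level) * 2 + 1.

#include <stdbool.h>
#include <stdio.h>

static unsigned split_reserve(unsigned level, unsigned depth, bool split)
{
	unsigned nr = 0;

	while (level < depth) {
		nr += 1 + split;	/* replacement node, plus a sibling if this level splits */
		level++;
		split = true;		/* inserting two new pointers may split the parent too */
	}

	nr += 1;			/* worst case the split reaches the root: brand new root */
	return nr;
}

int main(void)
{
	/* leaf split in a three level tree, worst case: 3 * 2 + 1 = 7 new nodes */
	printf("%u\n", split_reserve(0, 3, true));
	return 0;
}
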
return ERR_PTR(ret); +} + +/* Btree root updates: */ + +static void bch2_btree_set_root_inmem(struct bch_fs *c, struct btree *b) +{ + /* Root nodes cannot be reaped */ + mutex_lock(&c->btree_cache.lock); + list_del_init(&b->list); + mutex_unlock(&c->btree_cache.lock); + + mutex_lock(&c->btree_root_lock); + BUG_ON(btree_node_root(c, b) && + (b->c.level < btree_node_root(c, b)->c.level || + !btree_node_dying(btree_node_root(c, b)))); + + bch2_btree_id_root(c, b->c.btree_id)->b = b; + mutex_unlock(&c->btree_root_lock); + + bch2_recalc_btree_reserve(c); +} + +/** + * bch_btree_set_root - update the root in memory and on disk + * + * To ensure forward progress, the current task must not be holding any + * btree node write locks. However, you must hold an intent lock on the + * old root. + * + * Note: This allocates a journal entry but doesn't add any keys to + * it. All the btree roots are part of every journal write, so there + * is nothing new to be done. This just guarantees that there is a + * journal write. + */ +static void bch2_btree_set_root(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = as->c; + struct btree *old; + + trace_and_count(c, btree_node_set_root, c, b); + + old = btree_node_root(c, b); + + /* + * Ensure no one is using the old root while we switch to the + * new root: + */ + bch2_btree_node_lock_write_nofail(trans, path, &old->c); + + bch2_btree_set_root_inmem(c, b); + + btree_update_updated_root(as, b); + + /* + * Unlock old root after new root is visible: + * + * The new root isn't persistent, but that's ok: we still have + * an intent lock on the new root, and any updates that would + * depend on the new root would have to update the new root. + */ + bch2_btree_node_unlock_write(trans, path, old); +} + +/* Interior node updates: */ + +static void bch2_insert_fixup_btree_ptr(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + struct bch_fs *c = as->c; + struct bkey_packed *k; + struct printbuf buf = PRINTBUF; + unsigned long old, new, v; + + BUG_ON(insert->k.type == KEY_TYPE_btree_ptr_v2 && + !btree_ptr_sectors_written(insert)); + + if (unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags))) + bch2_journal_key_overwritten(c, b->c.btree_id, b->c.level, insert->k.p); + + if (bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf) ?: + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf)) { + printbuf_reset(&buf); + prt_printf(&buf, "inserting invalid bkey\n "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + prt_printf(&buf, "\n "); + bch2_bkey_invalid(c, bkey_i_to_s_c(insert), + btree_node_type(b), WRITE, &buf); + bch2_bkey_in_btree_node(b, bkey_i_to_s_c(insert), &buf); + + bch2_fs_inconsistent(c, "%s", buf.buf); + dump_stack(); + } + + BUG_ON(as->journal_u64s + jset_u64s(insert->k.u64s) > + ARRAY_SIZE(as->journal_entries)); + + as->journal_u64s += + journal_entry_set((void *) &as->journal_entries[as->journal_u64s], + BCH_JSET_ENTRY_btree_keys, + b->c.btree_id, b->c.level, + insert, insert->k.u64s); + + while ((k = bch2_btree_node_iter_peek_all(node_iter, b)) && + bkey_iter_pos_cmp(b, k, &insert->k.p) < 0) + bch2_btree_node_iter_advance(node_iter, b); + + bch2_btree_bset_insert_key(trans, path, b, node_iter, insert); + set_btree_node_dirty_acct(c, b); + + v = READ_ONCE(b->flags); + do { + old = new = v; + + new &= ~BTREE_WRITE_TYPE_MASK; + new 
|= BTREE_WRITE_interior; + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + printbuf_exit(&buf); +} + +static void +__bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter node_iter, + struct keylist *keys) +{ + struct bkey_i *insert = bch2_keylist_front(keys); + struct bkey_packed *k; + + BUG_ON(btree_node_type(b) != BKEY_TYPE_btree); + + while ((k = bch2_btree_node_iter_prev_all(&node_iter, b)) && + (bkey_cmp_left_packed(b, k, &insert->k.p) >= 0)) + ; + + while (!bch2_keylist_empty(keys)) { + struct bkey_i *k = bch2_keylist_front(keys); + + if (bpos_gt(k->k.p, b->key.k.p)) + break; + + bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, k); + bch2_keylist_pop_front(keys); + } +} + +/* + * Move keys from n1 (original replacement node, now lower node) to n2 (higher + * node) + */ +static void __btree_split_node(struct btree_update *as, + struct btree_trans *trans, + struct btree *b, + struct btree *n[2]) +{ + struct bkey_packed *k; + struct bpos n1_pos = POS_MIN; + struct btree_node_iter iter; + struct bset *bsets[2]; + struct bkey_format_state format[2]; + struct bkey_packed *out[2]; + struct bkey uk; + unsigned u64s, n1_u64s = (b->nr.live_u64s * 3) / 5; + int i; + + for (i = 0; i < 2; i++) { + BUG_ON(n[i]->nsets != 1); + + bsets[i] = btree_bset_first(n[i]); + out[i] = bsets[i]->start; + + SET_BTREE_NODE_SEQ(n[i]->data, BTREE_NODE_SEQ(b->data) + 1); + bch2_bkey_format_init(&format[i]); + } + + u64s = 0; + for_each_btree_node_key(b, k, &iter) { + if (bkey_deleted(k)) + continue; + + i = u64s >= n1_u64s; + u64s += k->u64s; + uk = bkey_unpack_key(b, k); + if (!i) + n1_pos = uk.p; + bch2_bkey_format_add_key(&format[i], &uk); + } + + btree_set_min(n[0], b->data->min_key); + btree_set_max(n[0], n1_pos); + btree_set_min(n[1], bpos_successor(n1_pos)); + btree_set_max(n[1], b->data->max_key); + + for (i = 0; i < 2; i++) { + bch2_bkey_format_add_pos(&format[i], n[i]->data->min_key); + bch2_bkey_format_add_pos(&format[i], n[i]->data->max_key); + + n[i]->data->format = bch2_bkey_format_done(&format[i]); + btree_node_set_format(n[i], n[i]->data->format); + } + + u64s = 0; + for_each_btree_node_key(b, k, &iter) { + if (bkey_deleted(k)) + continue; + + i = u64s >= n1_u64s; + u64s += k->u64s; + + if (bch2_bkey_transform(&n[i]->format, out[i], bkey_packed(k) + ? &b->format: &bch2_bkey_format_current, k)) + out[i]->format = KEY_FORMAT_LOCAL_BTREE; + else + bch2_bkey_unpack(b, (void *) out[i], k); + + out[i]->needs_whiteout = false; + + btree_keys_account_key_add(&n[i]->nr, 0, out[i]); + out[i] = bkey_p_next(out[i]); + } + + for (i = 0; i < 2; i++) { + bsets[i]->u64s = cpu_to_le16((u64 *) out[i] - bsets[i]->_data); + + BUG_ON(!bsets[i]->u64s); + + set_btree_bset_end(n[i], n[i]->set); + + btree_node_reset_sib_u64s(n[i]); + + bch2_verify_btree_nr_keys(n[i]); + + if (b->c.level) + btree_node_interior_verify(as->c, n[i]); + } +} + +/* + * For updates to interior nodes, we've got to do the insert before we split + * because the stuff we're inserting has to be inserted atomically. Post split, + * the keys might have to go in different nodes and the split would no longer be + * atomic. 
+ * + * Worse, if the insert is from btree node coalescing, if we do the insert after + * we do the split (and pick the pivot) - the pivot we pick might be between + * nodes that were coalesced, and thus in the middle of a child node post + * coalescing: + */ +static void btree_split_insert_keys(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct keylist *keys) +{ + if (!bch2_keylist_empty(keys) && + bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) { + struct btree_node_iter node_iter; + + bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); + + __bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + + btree_node_interior_verify(as->c, b); + } +} + +static int btree_split(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) +{ + struct bch_fs *c = as->c; + struct btree *parent = btree_node_parent(path, b); + struct btree *n1, *n2 = NULL, *n3 = NULL; + struct btree_path *path1 = NULL, *path2 = NULL; + u64 start_time = local_clock(); + int ret = 0; + + BUG_ON(!parent && (b != btree_node_root(c, b))); + BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1)); + + bch2_btree_interior_update_will_free_node(as, b); + + if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) { + struct btree *n[2]; + + trace_and_count(c, btree_node_split, c, b); + + n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level); + n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level); + + __btree_split_node(as, trans, b, n); + + if (keys) { + btree_split_insert_keys(as, trans, path, n1, keys); + btree_split_insert_keys(as, trans, path, n2, keys); + BUG_ON(!bch2_keylist_empty(keys)); + } + + bch2_btree_build_aux_trees(n2); + bch2_btree_build_aux_trees(n1); + + bch2_btree_update_add_new_node(as, n1); + bch2_btree_update_add_new_node(as, n2); + six_unlock_write(&n2->c.lock); + six_unlock_write(&n1->c.lock); + + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); + + path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p); + six_lock_increment(&n2->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n2->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n2); + + /* + * Note that on recursive parent_keys == keys, so we + * can't start adding new keys to parent_keys before emptying it + * out (which we did with btree_split_insert_keys() above) + */ + bch2_keylist_add(&as->parent_keys, &n1->key); + bch2_keylist_add(&as->parent_keys, &n2->key); + + if (!parent) { + /* Depth increases, make a new root */ + n3 = __btree_root_alloc(as, trans, b->c.level + 1); + + bch2_btree_update_add_new_node(as, n3); + six_unlock_write(&n3->c.lock); + + path2->locks_want++; + BUG_ON(btree_node_locked(path2, n3->c.level)); + six_lock_increment(&n3->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path2, n3->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path2, n3); + + n3->sib_u64s[0] = U16_MAX; + n3->sib_u64s[1] = U16_MAX; + + btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); + } + } else { + trace_and_count(c, btree_node_compact, c, b); + + n1 = bch2_btree_node_alloc_replacement(as, trans, b); + + if (keys) { + btree_split_insert_keys(as, trans, path, n1, keys); + 
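
Earlier in btree_split(), __btree_split_node() picks the pivot by cumulative key size rather than key count: keys go to the lower node until roughly 3/5 of the live u64s have been used, so the lower node ends up slightly fuller. A minimal sketch of that pivot choice on a plain array, not bcachefs code: struct key and split_point() are invented stand-ins for packed bkeys, and format and whiteout handling are omitted.

#include <stdio.h>

struct key { unsigned u64s; };

/* return the index of the first key that goes to the upper node */
static unsigned split_point(const struct key *keys, unsigned nr, unsigned total_u64s)
{
	unsigned n1_u64s = total_u64s * 3 / 5;
	unsigned u64s = 0, i;

	for (i = 0; i < nr; i++) {
		if (u64s >= n1_u64s)
			break;
		u64s += keys[i].u64s;
	}
	return i;
}

int main(void)
{
	struct key keys[] = { {4}, {6}, {3}, {5}, {7}, {5} };	/* 30 u64s total */
	unsigned first_in_n2 = split_point(keys, 6, 30);

	/* prints: keys [0..4) -> n1 (18 u64s), keys [4..6) -> n2 (12 u64s) */
	printf("keys [0..%u) -> n1, keys [%u..6) -> n2\n", first_in_n2, first_in_n2);
	return 0;
}
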
BUG_ON(!bch2_keylist_empty(keys)); + } + + bch2_btree_build_aux_trees(n1); + bch2_btree_update_add_new_node(as, n1); + six_unlock_write(&n1->c.lock); + + path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p); + six_lock_increment(&n1->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, path1, n1->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, path1, n1); + + if (parent) + bch2_keylist_add(&as->parent_keys, &n1->key); + } + + /* New nodes all written, now make them visible: */ + + if (parent) { + /* Split a non root node */ + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + if (ret) + goto err; + } else if (n3) { + bch2_btree_set_root(as, trans, path, n3); + } else { + /* Root filled up but didn't need to be split */ + bch2_btree_set_root(as, trans, path, n1); + } + + if (n3) { + bch2_btree_update_get_open_buckets(as, n3); + bch2_btree_node_write(c, n3, SIX_LOCK_intent, 0); + } + if (n2) { + bch2_btree_update_get_open_buckets(as, n2); + bch2_btree_node_write(c, n2, SIX_LOCK_intent, 0); + } + bch2_btree_update_get_open_buckets(as, n1); + bch2_btree_node_write(c, n1, SIX_LOCK_intent, 0); + + /* + * The old node must be freed (in memory) _before_ unlocking the new + * nodes - else another thread could re-acquire a read lock on the old + * node after another thread has locked and updated the new node, thus + * seeing stale data: + */ + bch2_btree_node_free_inmem(trans, path, b); + + if (n3) + bch2_trans_node_add(trans, n3); + if (n2) + bch2_trans_node_add(trans, n2); + bch2_trans_node_add(trans, n1); + + if (n3) + six_unlock_intent(&n3->c.lock); + if (n2) + six_unlock_intent(&n2->c.lock); + six_unlock_intent(&n1->c.lock); +out: + if (path2) { + __bch2_btree_path_unlock(trans, path2); + bch2_path_put(trans, path2, true); + } + if (path1) { + __bch2_btree_path_unlock(trans, path1); + bch2_path_put(trans, path1, true); + } + + bch2_trans_verify_locks(trans); + + bch2_time_stats_update(&c->times[n2 + ? BCH_TIME_btree_node_split + : BCH_TIME_btree_node_compact], + start_time); + return ret; +err: + if (n3) + bch2_btree_node_free_never_used(as, trans, n3); + if (n2) + bch2_btree_node_free_never_used(as, trans, n2); + bch2_btree_node_free_never_used(as, trans, n1); + goto out; +} + +static void +bch2_btree_insert_keys_interior(struct btree_update *as, + struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct keylist *keys) +{ + struct btree_path *linked; + + __bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); + + btree_update_updated_node(as, b); + + trans_for_each_path_with_node(trans, b, linked) + bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); + + bch2_trans_verify_paths(trans); +} + +/** + * bch_btree_insert_node - insert bkeys into a given btree node + * + * @iter: btree iterator + * @keys: list of keys to insert + * @hook: insert callback + * @persistent: if not null, @persistent will wait on journal write + * + * Inserts as many keys as it can into a given btree node, splitting it if full. + * If a split occurred, this function will return early. This can only happen + * for leaf nodes -- inserts into interior nodes have to be atomic. 
+ */ +static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans, + struct btree_path *path, struct btree *b, + struct keylist *keys, unsigned flags) +{ + struct bch_fs *c = as->c; + int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + int ret; + + lockdep_assert_held(&c->gc_lock); + BUG_ON(!btree_node_intent_locked(path, b->c.level)); + BUG_ON(!b->c.level); + BUG_ON(!as || as->b); + bch2_verify_keylist_sorted(keys); + + ret = bch2_btree_node_lock_write(trans, path, &b->c); + if (ret) + return ret; + + bch2_btree_node_prep_for_write(trans, path, b); + + if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) { + bch2_btree_node_unlock_write(trans, path, b); + goto split; + } + + btree_node_interior_verify(c, b); + + bch2_btree_insert_keys_interior(as, trans, path, b, keys); + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) le16_to_cpu(btree_bset_last(b)->u64s) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); + + bch2_btree_node_unlock_write(trans, path, b); + + btree_node_interior_verify(c, b); + return 0; +split: + /* + * We could attempt to avoid the transaction restart, by calling + * bch2_btree_path_upgrade() and allocating more nodes: + */ + if (b->c.level >= as->update_level) { + trace_and_count(c, trans_restart_split_race, trans, _THIS_IP_, b); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race); + } + + return btree_split(as, trans, path, b, keys, flags); +} + +int bch2_btree_split_leaf(struct btree_trans *trans, + struct btree_path *path, + unsigned flags) +{ + struct btree *b = path_l(path)->b; + struct btree_update *as; + unsigned l; + int ret = 0; + + as = bch2_btree_update_start(trans, path, path->level, + true, flags); + if (IS_ERR(as)) + return PTR_ERR(as); + + ret = btree_split(as, trans, path, b, NULL, flags); + if (ret) { + bch2_btree_update_free(as, trans); + return ret; + } + + bch2_btree_update_done(as, trans); + + for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++) + ret = bch2_foreground_maybe_merge(trans, path, l, flags); + + return ret; +} + +int __bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + unsigned flags, + enum btree_node_sibling sib) +{ + struct bch_fs *c = trans->c; + struct btree_path *sib_path = NULL, *new_path = NULL; + struct btree_update *as; + struct bkey_format_state new_s; + struct bkey_format new_f; + struct bkey_i delete; + struct btree *b, *m, *n, *prev, *next, *parent; + struct bpos sib_pos; + size_t sib_u64s; + u64 start_time = local_clock(); + int ret = 0; + + BUG_ON(!path->should_be_locked); + BUG_ON(!btree_node_locked(path, level)); + + b = path->l[level].b; + + if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) || + (sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) { + b->sib_u64s[sib] = U16_MAX; + return 0; + } + + sib_pos = sib == btree_prev_sib + ? 
bpos_predecessor(b->data->min_key) + : bpos_successor(b->data->max_key); + + sib_path = bch2_path_get(trans, path->btree_id, sib_pos, + U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, sib_path, false); + if (ret) + goto err; + + btree_path_set_should_be_locked(sib_path); + + m = sib_path->l[level].b; + + if (btree_node_parent(path, b) != + btree_node_parent(sib_path, m)) { + b->sib_u64s[sib] = U16_MAX; + goto out; + } + + if (sib == btree_prev_sib) { + prev = m; + next = b; + } else { + prev = b; + next = m; + } + + if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + bch2_bpos_to_text(&buf1, prev->data->max_key); + bch2_bpos_to_text(&buf2, next->data->min_key); + bch_err(c, + "%s(): btree topology error:\n" + " prev ends at %s\n" + " next starts at %s", + __func__, buf1.buf, buf2.buf); + printbuf_exit(&buf1); + printbuf_exit(&buf2); + bch2_topology_error(c); + ret = -EIO; + goto err; + } + + bch2_bkey_format_init(&new_s); + bch2_bkey_format_add_pos(&new_s, prev->data->min_key); + __bch2_btree_calc_format(&new_s, prev); + __bch2_btree_calc_format(&new_s, next); + bch2_bkey_format_add_pos(&new_s, next->data->max_key); + new_f = bch2_bkey_format_done(&new_s); + + sib_u64s = btree_node_u64s_with_format(b, &new_f) + + btree_node_u64s_with_format(m, &new_f); + + if (sib_u64s > BTREE_FOREGROUND_MERGE_HYSTERESIS(c)) { + sib_u64s -= BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + sib_u64s /= 2; + sib_u64s += BTREE_FOREGROUND_MERGE_HYSTERESIS(c); + } + + sib_u64s = min(sib_u64s, btree_max_u64s(c)); + sib_u64s = min(sib_u64s, (size_t) U16_MAX - 1); + b->sib_u64s[sib] = sib_u64s; + + if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold) + goto out; + + parent = btree_node_parent(path, b); + as = bch2_btree_update_start(trans, path, level, false, + BTREE_INSERT_NOFAIL|flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto err; + + trace_and_count(c, btree_node_merge, c, b); + + bch2_btree_interior_update_will_free_node(as, b); + bch2_btree_interior_update_will_free_node(as, m); + + n = bch2_btree_node_alloc(as, trans, b->c.level); + + SET_BTREE_NODE_SEQ(n->data, + max(BTREE_NODE_SEQ(b->data), + BTREE_NODE_SEQ(m->data)) + 1); + + btree_set_min(n, prev->data->min_key); + btree_set_max(n, next->data->max_key); + + n->data->format = new_f; + btree_node_set_format(n, new_f); + + bch2_btree_sort_into(c, n, prev); + bch2_btree_sort_into(c, n, next); + + bch2_btree_build_aux_trees(n); + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); + + bkey_init(&delete.k); + delete.k.p = prev->key.k.p; + bch2_keylist_add(&as->parent_keys, &delete); + bch2_keylist_add(&as->parent_keys, &n->key); + + bch2_trans_verify_paths(trans); + + ret = bch2_btree_insert_node(as, trans, path, parent, &as->parent_keys, flags); + if (ret) + goto err_free_update; + + bch2_trans_verify_paths(trans); + + bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, path, b); + bch2_btree_node_free_inmem(trans, sib_path, m); + + bch2_trans_node_add(trans, n); + + bch2_trans_verify_paths(trans); + + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as, trans); + + 
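
The sib_u64s calculation above damps the recorded combined size: anything over BTREE_FOREGROUND_MERGE_HYSTERESIS(c) is pulled halfway back towards that threshold before being compared against btree_foreground_merge_threshold. A small sketch of just that arithmetic; merge_candidate_u64s() is an invented name and the constants are made up.

#include <stdio.h>

/* values above the hysteresis threshold are pulled halfway back towards it */
static unsigned merge_candidate_u64s(unsigned combined_u64s, unsigned hysteresis)
{
	if (combined_u64s > hysteresis)
		combined_u64s = hysteresis + (combined_u64s - hysteresis) / 2;
	return combined_u64s;
}

int main(void)
{
	unsigned hysteresis = 1000;

	printf("%u\n", merge_candidate_u64s(600, hysteresis));	/* 600: reported as is */
	printf("%u\n", merge_candidate_u64s(1800, hysteresis));	/* 1400: halfway back to 1000 */
	return 0;
}

Recording the damped value keeps a pair of siblings that hovers around the merge threshold from flipping between "merge" and "don't merge" on every small insert or delete.
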
bch2_time_stats_update(&c->times[BCH_TIME_btree_node_merge], start_time); +out: +err: + if (new_path) + bch2_path_put(trans, new_path, true); + bch2_path_put(trans, sib_path, true); + bch2_trans_verify_locks(trans); + return ret; +err_free_update: + bch2_btree_node_free_never_used(as, trans, n); + bch2_btree_update_free(as, trans); + goto out; +} + +/** + * bch_btree_node_rewrite - Rewrite/move a btree node + */ +int bch2_btree_node_rewrite(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_path *new_path = NULL; + struct btree *n, *parent; + struct btree_update *as; + int ret; + + flags |= BTREE_INSERT_NOFAIL; + + parent = btree_node_parent(iter->path, b); + as = bch2_btree_update_start(trans, iter->path, b->c.level, + false, flags); + ret = PTR_ERR_OR_ZERO(as); + if (ret) + goto out; + + bch2_btree_interior_update_will_free_node(as, b); + + n = bch2_btree_node_alloc_replacement(as, trans, b); + + bch2_btree_build_aux_trees(n); + bch2_btree_update_add_new_node(as, n); + six_unlock_write(&n->c.lock); + + new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p); + six_lock_increment(&n->c.lock, SIX_LOCK_intent); + mark_btree_node_locked(trans, new_path, n->c.level, SIX_LOCK_intent); + bch2_btree_path_level_init(trans, new_path, n); + + trace_and_count(c, btree_node_rewrite, c, b); + + if (parent) { + bch2_keylist_add(&as->parent_keys, &n->key); + ret = bch2_btree_insert_node(as, trans, iter->path, parent, + &as->parent_keys, flags); + if (ret) + goto err; + } else { + bch2_btree_set_root(as, trans, iter->path, n); + } + + bch2_btree_update_get_open_buckets(as, n); + bch2_btree_node_write(c, n, SIX_LOCK_intent, 0); + + bch2_btree_node_free_inmem(trans, iter->path, b); + + bch2_trans_node_add(trans, n); + six_unlock_intent(&n->c.lock); + + bch2_btree_update_done(as, trans); +out: + if (new_path) + bch2_path_put(trans, new_path, true); + bch2_btree_path_downgrade(trans, iter->path); + return ret; +err: + bch2_btree_node_free_never_used(as, trans, n); + bch2_btree_update_free(as, trans); + goto out; +} + +struct async_btree_rewrite { + struct bch_fs *c; + struct work_struct work; + struct list_head list; + enum btree_id btree_id; + unsigned level; + struct bpos pos; + __le64 seq; +}; + +static int async_btree_node_rewrite_trans(struct btree_trans *trans, + struct async_btree_rewrite *a) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct btree *b; + int ret; + + bch2_trans_node_iter_init(trans, &iter, a->btree_id, a->pos, + BTREE_MAX_DEPTH, a->level, 0); + b = bch2_btree_iter_peek_node(&iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto out; + + if (!b || b->data->keys.seq != a->seq) { + struct printbuf buf = PRINTBUF; + + if (b) + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + else + prt_str(&buf, "(null"); + bch_info(c, "%s: node to rewrite not found:, searching for seq %llu, got\n%s", + __func__, a->seq, buf.buf); + printbuf_exit(&buf); + goto out; + } + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); +out: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static void async_btree_node_rewrite_work(struct work_struct *work) +{ + struct async_btree_rewrite *a = + container_of(work, struct async_btree_rewrite, work); + struct bch_fs *c = a->c; + int ret; + + ret = bch2_trans_do(c, NULL, NULL, 0, + async_btree_node_rewrite_trans(&trans, a)); + if (ret) + bch_err(c, "%s: error %s", __func__, bch2_err_str(ret)); + bch2_write_ref_put(c, 
BCH_WRITE_REF_node_rewrite); + kfree(a); +} + +void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) +{ + struct async_btree_rewrite *a; + int ret; + + a = kmalloc(sizeof(*a), GFP_NOFS); + if (!a) { + bch_err(c, "%s: error allocating memory", __func__); + return; + } + + a->c = c; + a->btree_id = b->c.btree_id; + a->level = b->c.level; + a->pos = b->key.k.p; + a->seq = b->data->keys.seq; + INIT_WORK(&a->work, async_btree_node_rewrite_work); + + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + mutex_lock(&c->pending_node_rewrites_lock); + list_add(&a->list, &c->pending_node_rewrites); + mutex_unlock(&c->pending_node_rewrites_lock); + return; + } + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (test_bit(BCH_FS_STARTED, &c->flags)) { + bch_err(c, "%s: error getting c->writes ref", __func__); + kfree(a); + return; + } + + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err(c, "%s: error going read-write: %s", + __func__, bch2_err_str(ret)); + kfree(a); + return; + } + + bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + } + + queue_work(c->btree_interior_update_worker, &a->work); +} + +void bch2_do_pending_node_rewrites(struct bch_fs *c) +{ + struct async_btree_rewrite *a, *n; + + mutex_lock(&c->pending_node_rewrites_lock); + list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { + list_del(&a->list); + + bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + queue_work(c->btree_interior_update_worker, &a->work); + } + mutex_unlock(&c->pending_node_rewrites_lock); +} + +void bch2_free_pending_node_rewrites(struct bch_fs *c) +{ + struct async_btree_rewrite *a, *n; + + mutex_lock(&c->pending_node_rewrites_lock); + list_for_each_entry_safe(a, n, &c->pending_node_rewrites, list) { + list_del(&a->list); + + kfree(a); + } + mutex_unlock(&c->pending_node_rewrites_lock); +} + +static int __bch2_btree_node_update_key(struct btree_trans *trans, + struct btree_iter *iter, + struct btree *b, struct btree *new_hash, + struct bkey_i *new_key, + unsigned commit_flags, + bool skip_triggers) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter2 = { NULL }; + struct btree *parent; + int ret; + + if (!skip_triggers) { + ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1, + bkey_i_to_s_c(&b->key), 0); + if (ret) + return ret; + + ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1, + new_key, 0); + if (ret) + return ret; + } + + if (new_hash) { + bkey_copy(&new_hash->key, new_key); + ret = bch2_btree_node_hash_insert(&c->btree_cache, + new_hash, b->c.level, b->c.btree_id); + BUG_ON(ret); + } + + parent = btree_node_parent(iter->path, b); + if (parent) { + bch2_trans_copy_iter(&iter2, iter); + + iter2.path = bch2_btree_path_make_mut(trans, iter2.path, + iter2.flags & BTREE_ITER_INTENT, + _THIS_IP_); + + BUG_ON(iter2.path->level != b->c.level); + BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p)); + + btree_path_set_level_up(trans, iter2.path); + + trans->paths_sorted = false; + + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_NORUN); + if (ret) + goto err; + } else { + BUG_ON(btree_node_root(c, b) != b); + + ret = darray_make_room(&trans->extra_journal_entries, + jset_u64s(new_key->k.u64s)); + if (ret) + return ret; + + journal_entry_set((void *) &darray_top(trans->extra_journal_entries), + BCH_JSET_ENTRY_btree_root, + b->c.btree_id, b->c.level, + new_key, new_key->k.u64s); + trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s); + } + + ret = 
bch2_trans_commit(trans, NULL, NULL, commit_flags); + if (ret) + goto err; + + bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c); + + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, new_hash); + bch2_btree_node_hash_remove(&c->btree_cache, b); + + bkey_copy(&b->key, new_key); + ret = __bch2_btree_node_hash_insert(&c->btree_cache, b); + BUG_ON(ret); + mutex_unlock(&c->btree_cache.lock); + } else { + bkey_copy(&b->key, new_key); + } + + bch2_btree_node_unlock_write(trans, iter->path, b); +out: + bch2_trans_iter_exit(trans, &iter2); + return ret; +err: + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + bch2_btree_node_hash_remove(&c->btree_cache, b); + mutex_unlock(&c->btree_cache.lock); + } + goto out; +} + +int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, struct bkey_i *new_key, + unsigned commit_flags, bool skip_triggers) +{ + struct bch_fs *c = trans->c; + struct btree *new_hash = NULL; + struct btree_path *path = iter->path; + struct closure cl; + int ret = 0; + + ret = bch2_btree_path_upgrade(trans, path, b->c.level + 1); + if (ret) + return ret; + + closure_init_stack(&cl); + + /* + * check btree_ptr_hash_val() after @b is locked by + * btree_iter_traverse(): + */ + if (btree_ptr_hash_val(new_key) != b->hash_val) { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + if (ret) { + ret = drop_locks_do(trans, (closure_sync(&cl), 0)); + if (ret) + return ret; + } + + new_hash = bch2_btree_node_mem_alloc(trans, false); + } + + path->intent_ref++; + ret = __bch2_btree_node_update_key(trans, iter, b, new_hash, new_key, + commit_flags, skip_triggers); + --path->intent_ref; + + if (new_hash) { + mutex_lock(&c->btree_cache.lock); + list_move(&new_hash->list, &c->btree_cache.freeable); + mutex_unlock(&c->btree_cache.lock); + + six_unlock_write(&new_hash->c.lock); + six_unlock_intent(&new_hash->c.lock); + } + closure_sync(&cl); + bch2_btree_cache_cannibalize_unlock(c); + return ret; +} + +int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, + struct btree *b, struct bkey_i *new_key, + unsigned commit_flags, bool skip_triggers) +{ + struct btree_iter iter; + int ret; + + bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter); + if (ret) + goto out; + + /* has node been freed? 
*/ + if (iter.path->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + goto out; + } + + BUG_ON(!btree_node_hashed(b)); + + ret = bch2_btree_node_update_key(trans, &iter, b, new_key, + commit_flags, skip_triggers); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* Init code: */ + +/* + * Only for filesystem bringup, when first reading the btree roots or allocating + * btree roots when initializing a new filesystem: + */ +void bch2_btree_set_root_for_read(struct bch_fs *c, struct btree *b) +{ + BUG_ON(btree_node_root(c, b)); + + bch2_btree_set_root_inmem(c, b); +} + +static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id) +{ + struct bch_fs *c = trans->c; + struct closure cl; + struct btree *b; + int ret; + + closure_init_stack(&cl); + + do { + ret = bch2_btree_cache_cannibalize_lock(c, &cl); + closure_sync(&cl); + } while (ret); + + b = bch2_btree_node_mem_alloc(trans, false); + bch2_btree_cache_cannibalize_unlock(c); + + set_btree_node_fake(b); + set_btree_node_need_rewrite(b); + b->c.level = 0; + b->c.btree_id = id; + + bkey_btree_ptr_init(&b->key); + b->key.k.p = SPOS_MAX; + *((u64 *) bkey_i_to_btree_ptr(&b->key)->v.start) = U64_MAX - id; + + bch2_bset_init_first(b, &b->data->keys); + bch2_btree_build_aux_trees(b); + + b->data->flags = 0; + btree_set_min(b, POS_MIN); + btree_set_max(b, SPOS_MAX); + b->data->format = bch2_btree_calc_format(b); + btree_node_set_format(b, b->data->format); + + ret = bch2_btree_node_hash_insert(&c->btree_cache, b, + b->c.level, b->c.btree_id); + BUG_ON(ret); + + bch2_btree_set_root_inmem(c, b); + + six_unlock_write(&b->c.lock); + six_unlock_intent(&b->c.lock); + return 0; +} + +void bch2_btree_root_alloc(struct bch_fs *c, enum btree_id id) +{ + bch2_trans_run(c, __bch2_btree_root_alloc(&trans, id)); +} + +void bch2_btree_updates_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct btree_update *as; + + mutex_lock(&c->btree_interior_update_lock); + list_for_each_entry(as, &c->btree_interior_update_list, list) + prt_printf(out, "%p m %u w %u r %u j %llu\n", + as, + as->mode, + as->nodes_written, + atomic_read(&as->cl.remaining) & CLOSURE_REMAINING_MASK, + as->journal.seq); + mutex_unlock(&c->btree_interior_update_lock); +} + +static bool bch2_btree_interior_updates_pending(struct bch_fs *c) +{ + bool ret; + + mutex_lock(&c->btree_interior_update_lock); + ret = !list_empty(&c->btree_interior_update_list); + mutex_unlock(&c->btree_interior_update_lock); + + return ret; +} + +bool bch2_btree_interior_updates_flush(struct bch_fs *c) +{ + bool ret = bch2_btree_interior_updates_pending(c); + + if (ret) + closure_wait_event(&c->btree_interior_update_wait, + !bch2_btree_interior_updates_pending(c)); + return ret; +} + +void bch2_journal_entry_to_btree_root(struct bch_fs *c, struct jset_entry *entry) +{ + struct btree_root *r = bch2_btree_id_root(c, entry->btree_id); + + mutex_lock(&c->btree_root_lock); + + r->level = entry->level; + r->alive = true; + bkey_copy(&r->key, &entry->start[0]); + + mutex_unlock(&c->btree_root_lock); +} + +struct jset_entry * +bch2_btree_roots_to_journal_entries(struct bch_fs *c, + struct jset_entry *start, + struct jset_entry *end) +{ + struct jset_entry *entry; + unsigned long have = 0; + unsigned i; + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root) + __set_bit(entry->btree_id, &have); + + mutex_lock(&c->btree_root_lock); + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r 
= bch2_btree_id_root(c, i); + + if (r->alive && !test_bit(i, &have)) { + journal_entry_set(end, BCH_JSET_ENTRY_btree_root, + i, r->level, &r->key, r->key.k.u64s); + end = vstruct_next(end); + } + } + + mutex_unlock(&c->btree_root_lock); + + return end; +} + +void bch2_fs_btree_interior_update_exit(struct bch_fs *c) +{ + if (c->btree_interior_update_worker) + destroy_workqueue(c->btree_interior_update_worker); + mempool_exit(&c->btree_interior_update_pool); +} + +void bch2_fs_btree_interior_update_init_early(struct bch_fs *c) +{ + mutex_init(&c->btree_reserve_cache_lock); + INIT_LIST_HEAD(&c->btree_interior_update_list); + INIT_LIST_HEAD(&c->btree_interior_updates_unwritten); + mutex_init(&c->btree_interior_update_lock); + INIT_WORK(&c->btree_interior_update_work, btree_interior_update_work); + + INIT_LIST_HEAD(&c->pending_node_rewrites); + mutex_init(&c->pending_node_rewrites_lock); +} + +int bch2_fs_btree_interior_update_init(struct bch_fs *c) +{ + c->btree_interior_update_worker = + alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 1); + if (!c->btree_interior_update_worker) + return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + + if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, + sizeof(struct btree_update))) + return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; + + return 0; +} diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h new file mode 100644 index 000000000..221b7ad5d --- /dev/null +++ b/fs/bcachefs/btree_update_interior.h @@ -0,0 +1,328 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_UPDATE_INTERIOR_H +#define _BCACHEFS_BTREE_UPDATE_INTERIOR_H + +#include "btree_cache.h" +#include "btree_locking.h" +#include "btree_update.h" + +void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *); +bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *, + struct bkey_format *); + +#define BTREE_UPDATE_NODES_MAX ((BTREE_MAX_DEPTH - 2) * 2 + GC_MERGE_NODES) + +#define BTREE_UPDATE_JOURNAL_RES (BTREE_UPDATE_NODES_MAX * (BKEY_BTREE_PTR_U64s_MAX + 1)) + +/* + * Tracks an in progress split/rewrite of a btree node and the update to the + * parent node: + * + * When we split/rewrite a node, we do all the updates in memory without + * waiting for any writes to complete - we allocate the new node(s) and update + * the parent node, possibly recursively up to the root. + * + * The end result is that we have one or more new nodes being written - + * possibly several, if there were multiple splits - and then a write (updating + * an interior node) which will make all these new nodes visible. + * + * Additionally, as we split/rewrite nodes we free the old nodes - but the old + * nodes can't be freed (their space on disk can't be reclaimed) until the + * update to the interior node that makes the new node visible completes - + * until then, the old nodes are still reachable on disk. + * + */ +struct btree_update { + struct closure cl; + struct bch_fs *c; + u64 start_time; + + struct list_head list; + struct list_head unwritten_list; + + /* What kind of update are we doing? 
*/ + enum { + BTREE_INTERIOR_NO_UPDATE, + BTREE_INTERIOR_UPDATING_NODE, + BTREE_INTERIOR_UPDATING_ROOT, + BTREE_INTERIOR_UPDATING_AS, + } mode; + + unsigned nodes_written:1; + unsigned took_gc_lock:1; + + enum btree_id btree_id; + unsigned update_level; + + struct disk_reservation disk_res; + struct journal_preres journal_preres; + + /* + * BTREE_INTERIOR_UPDATING_NODE: + * The update that made the new nodes visible was a regular update to an + * existing interior node - @b. We can't write out the update to @b + * until the new nodes we created are finished writing, so we block @b + * from writing by putting this btree_interior update on the + * @b->write_blocked list with @write_blocked_list: + */ + struct btree *b; + struct list_head write_blocked_list; + + /* + * We may be freeing nodes that were dirty, and thus had journal entries + * pinned: we need to transfer the oldest of those pins to the + * btree_update operation, and release it when the new node(s) + * are all persistent and reachable: + */ + struct journal_entry_pin journal; + + /* Preallocated nodes we reserve when we start the update: */ + struct prealloc_nodes { + struct btree *b[BTREE_UPDATE_NODES_MAX]; + unsigned nr; + } prealloc_nodes[2]; + + /* Nodes being freed: */ + struct keylist old_keys; + u64 _old_keys[BTREE_UPDATE_NODES_MAX * + BKEY_BTREE_PTR_U64s_MAX]; + + /* Nodes being added: */ + struct keylist new_keys; + u64 _new_keys[BTREE_UPDATE_NODES_MAX * + BKEY_BTREE_PTR_U64s_MAX]; + + /* New nodes, that will be made reachable by this update: */ + struct btree *new_nodes[BTREE_UPDATE_NODES_MAX]; + unsigned nr_new_nodes; + + struct btree *old_nodes[BTREE_UPDATE_NODES_MAX]; + __le64 old_nodes_seq[BTREE_UPDATE_NODES_MAX]; + unsigned nr_old_nodes; + + open_bucket_idx_t open_buckets[BTREE_UPDATE_NODES_MAX * + BCH_REPLICAS_MAX]; + open_bucket_idx_t nr_open_buckets; + + unsigned journal_u64s; + u64 journal_entries[BTREE_UPDATE_JOURNAL_RES]; + + /* Only here to reduce stack usage on recursive splits: */ + struct keylist parent_keys; + /* + * Enough room for btree_split's keys without realloc - btree node + * pointers never have crc/compression info, so we only need to acount + * for the pointers for three keys + */ + u64 inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3]; +}; + +struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *, + struct btree_trans *, + struct btree *, + struct bkey_format); + +int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned); + +int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *, + unsigned, unsigned, enum btree_node_sibling); + +static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, + struct btree_path *path, + unsigned level, unsigned flags, + enum btree_node_sibling sib) +{ + struct btree *b; + + EBUG_ON(!btree_node_locked(path, level)); + + b = path->l[level].b; + if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold) + return 0; + + return __bch2_foreground_maybe_merge(trans, path, level, flags, sib); +} + +static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, + struct btree_path *path, + unsigned level, + unsigned flags) +{ + return bch2_foreground_maybe_merge_sibling(trans, path, level, flags, + btree_prev_sib) ?: + bch2_foreground_maybe_merge_sibling(trans, path, level, flags, + btree_next_sib); +} + +void bch2_btree_set_root_for_read(struct bch_fs *, struct btree *); +void bch2_btree_root_alloc(struct bch_fs *, enum btree_id); + +static inline unsigned 
btree_update_reserve_required(struct bch_fs *c, + struct btree *b) +{ + unsigned depth = btree_node_root(c, b)->c.level + 1; + + /* + * Number of nodes we might have to allocate in a worst case btree + * split operation - we split all the way up to the root, then allocate + * a new root, unless we're already at max depth: + */ + if (depth < BTREE_MAX_DEPTH) + return (depth - b->c.level) * 2 + 1; + else + return (depth - b->c.level) * 2 - 1; +} + +static inline void btree_node_reset_sib_u64s(struct btree *b) +{ + b->sib_u64s[0] = b->nr.live_u64s; + b->sib_u64s[1] = b->nr.live_u64s; +} + +static inline void *btree_data_end(struct bch_fs *c, struct btree *b) +{ + return (void *) b->data + btree_bytes(c); +} + +static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c, + struct btree *b) +{ + return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s); +} + +static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c, + struct btree *b) +{ + return btree_data_end(c, b); +} + +static inline void *write_block(struct btree *b) +{ + return (void *) b->data + (b->written << 9); +} + +static inline bool __btree_addr_written(struct btree *b, void *p) +{ + return p < write_block(b); +} + +static inline bool bset_written(struct btree *b, struct bset *i) +{ + return __btree_addr_written(b, i); +} + +static inline bool bkey_written(struct btree *b, struct bkey_packed *k) +{ + return __btree_addr_written(b, k); +} + +static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c, + struct btree *b, + void *end) +{ + ssize_t used = bset_byte_offset(b, end) / sizeof(u64) + + b->whiteout_u64s; + ssize_t total = c->opts.btree_node_size >> 3; + + /* Always leave one extra u64 for bch2_varint_decode: */ + used++; + + return total - used; +} + +static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c, + struct btree *b) +{ + ssize_t remaining = __bch_btree_u64s_remaining(c, b, + btree_bkey_last(b, bset_tree_last(b))); + + BUG_ON(remaining < 0); + + if (bset_written(b, btree_bset_last(b))) + return 0; + + return remaining; +} + +#define BTREE_WRITE_SET_U64s_BITS 9 + +static inline unsigned btree_write_set_buffer(struct btree *b) +{ + /* + * Could buffer up larger amounts of keys for btrees with larger keys, + * pending benchmarking: + */ + return 8 << BTREE_WRITE_SET_U64s_BITS; +} + +static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, + struct btree *b) +{ + struct bset_tree *t = bset_tree_last(b); + struct btree_node_entry *bne = max(write_block(b), + (void *) btree_bkey_last(b, bset_tree_last(b))); + ssize_t remaining_space = + __bch_btree_u64s_remaining(c, b, &bne->keys.start[0]); + + if (unlikely(bset_written(b, bset(b, t)))) { + if (remaining_space > (ssize_t) (block_bytes(c) >> 3)) + return bne; + } else { + if (unlikely(bset_u64s(t) * sizeof(u64) > btree_write_set_buffer(b)) && + remaining_space > (ssize_t) (btree_write_set_buffer(b) >> 3)) + return bne; + } + + return NULL; +} + +static inline void push_whiteout(struct bch_fs *c, struct btree *b, + struct bpos pos) +{ + struct bkey_packed k; + + BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s); + EBUG_ON(btree_node_just_written(b)); + + if (!bkey_pack_pos(&k, pos, b)) { + struct bkey *u = (void *) &k; + + bkey_init(u); + u->p = pos; + } + + k.needs_whiteout = true; + + b->whiteout_u64s += k.u64s; + bkey_copy(unwritten_whiteouts_start(c, b), &k); +} + +/* + * write lock must be held on @b (else the dirty bset that we were going to + * insert into could be written out from under 
us) + */ +static inline bool bch2_btree_node_insert_fits(struct bch_fs *c, + struct btree *b, unsigned u64s) +{ + if (unlikely(btree_node_need_rewrite(b))) + return false; + + return u64s <= bch_btree_keys_u64s_remaining(c, b); +} + +void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *); + +bool bch2_btree_interior_updates_flush(struct bch_fs *); + +void bch2_journal_entry_to_btree_root(struct bch_fs *, struct jset_entry *); +struct jset_entry *bch2_btree_roots_to_journal_entries(struct bch_fs *, + struct jset_entry *, struct jset_entry *); + +void bch2_do_pending_node_rewrites(struct bch_fs *); +void bch2_free_pending_node_rewrites(struct bch_fs *); + +void bch2_fs_btree_interior_update_exit(struct bch_fs *); +void bch2_fs_btree_interior_update_init_early(struct bch_fs *); +int bch2_fs_btree_interior_update_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_UPDATE_INTERIOR_H */ diff --git a/fs/bcachefs/btree_update_leaf.c b/fs/bcachefs/btree_update_leaf.c new file mode 100644 index 000000000..3638cef21 --- /dev/null +++ b/fs/bcachefs/btree_update_leaf.c @@ -0,0 +1,2065 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_key_cache.h" +#include "btree_locking.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "debug.h" +#include "errcode.h" +#include "error.h" +#include "extent_update.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "keylist.h" +#include "recovery.h" +#include "subvolume.h" +#include "replicas.h" +#include "trace.h" + +#include +#include + +/* + * bch2_btree_path_peek_slot() for a cached iterator might return a key in a + * different snapshot: + */ +static struct bkey_s_c bch2_btree_path_peek_slot_exact(struct btree_path *path, struct bkey *u) +{ + struct bkey_s_c k = bch2_btree_path_peek_slot(path, u); + + if (k.k && bpos_eq(path->pos, k.k->p)) + return k; + + bkey_init(u); + u->p = path->pos; + return (struct bkey_s_c) { u, NULL }; +} + +static void verify_update_old_key(struct btree_trans *trans, struct btree_insert_entry *i) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_fs *c = trans->c; + struct bkey u; + struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u); + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, i->btree_id, i->level, i->k->k.p); + + if (j_k) + k = bkey_i_to_s_c(j_k); + } + + u = *k.k; + u.needs_whiteout = i->old_k.needs_whiteout; + + BUG_ON(memcmp(&i->old_k, &u, sizeof(struct bkey))); + BUG_ON(i->old_v != k.v); +#endif +} + +static int __must_check +bch2_trans_update_by_path(struct btree_trans *, struct btree_path *, + struct bkey_i *, enum btree_update_flags); + +static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, + const struct btree_insert_entry *r) +{ + return cmp_int(l->btree_id, r->btree_id) ?: + cmp_int(l->cached, r->cached) ?: + -cmp_int(l->level, r->level) ?: + bpos_cmp(l->k->k.p, r->k->k.p); +} + +static inline struct btree_path_level *insert_l(struct btree_insert_entry *i) +{ + return i->path->l + i->level; +} + +static inline bool same_leaf_as_prev(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i != trans->updates && + insert_l(&i[0])->b == insert_l(&i[-1])->b; +} + +static inline bool same_leaf_as_next(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + return i + 1 < trans->updates + trans->nr_updates 
&& + insert_l(&i[0])->b == insert_l(&i[1])->b; +} + +inline void bch2_btree_node_prep_for_write(struct btree_trans *trans, + struct btree_path *path, + struct btree *b) +{ + struct bch_fs *c = trans->c; + + if (unlikely(btree_node_just_written(b)) && + bch2_btree_post_write_cleanup(c, b)) + bch2_trans_node_reinit_iter(trans, b); + + /* + * If the last bset has been written, or if it's gotten too big - start + * a new bset to insert into: + */ + if (want_new_bset(c, b)) + bch2_btree_init_next(trans, b); +} + +/* Inserting into a given leaf node (last stage of insert): */ + +/* Handle overwrites and do insert, for non extents: */ +bool bch2_btree_bset_insert_key(struct btree_trans *trans, + struct btree_path *path, + struct btree *b, + struct btree_node_iter *node_iter, + struct bkey_i *insert) +{ + struct bkey_packed *k; + unsigned clobber_u64s = 0, new_u64s = 0; + + EBUG_ON(btree_node_just_written(b)); + EBUG_ON(bset_written(b, btree_bset_last(b))); + EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k)); + EBUG_ON(bpos_lt(insert->k.p, b->data->min_key)); + EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); + EBUG_ON(insert->k.u64s > + bch_btree_keys_u64s_remaining(trans->c, b)); + + k = bch2_btree_node_iter_peek_all(node_iter, b); + if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) + k = NULL; + + /* @k is the key being overwritten/deleted, if any: */ + EBUG_ON(k && bkey_deleted(k)); + + /* Deleting, but not found? nothing to do: */ + if (bkey_deleted(&insert->k) && !k) + return false; + + if (bkey_deleted(&insert->k)) { + /* Deleting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + if (k->needs_whiteout) + push_whiteout(trans->c, b, insert->k.p); + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + bch2_bset_delete(b, k, clobber_u64s); + goto fix_iter; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + + return true; + } + + if (k) { + /* Overwriting: */ + btree_account_key_drop(b, k); + k->type = KEY_TYPE_deleted; + + insert->k.needs_whiteout = k->needs_whiteout; + k->needs_whiteout = false; + + if (k >= btree_bset_last(b)->start) { + clobber_u64s = k->u64s; + goto overwrite; + } else { + bch2_btree_path_fix_key_modified(trans, b, k); + } + } + + k = bch2_btree_node_iter_bset_pos(node_iter, b, bset_tree_last(b)); +overwrite: + bch2_bset_insert(b, node_iter, k, insert, clobber_u64s); + new_u64s = k->u64s; +fix_iter: + if (clobber_u64s != new_u64s) + bch2_btree_node_iter_fix(trans, path, b, node_iter, k, + clobber_u64s, new_u64s); + return true; +} + +static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin, + unsigned i, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write *w = container_of(pin, struct btree_write, journal); + struct btree *b = container_of(w, struct btree, writes[i]); + struct btree_trans trans; + unsigned long old, new, v; + unsigned idx = w - b->writes; + + bch2_trans_init(&trans, c, 0, 0); + + btree_node_lock_nopath_nofail(&trans, &b->c, SIX_LOCK_read); + v = READ_ONCE(b->flags); + + do { + old = new = v; + + if (!(old & (1 << BTREE_NODE_dirty)) || + !!(old & (1 << BTREE_NODE_write_idx)) != idx || + w->journal.seq != seq) + break; + + new &= ~BTREE_WRITE_TYPE_MASK; + new |= BTREE_WRITE_journal_reclaim; + new |= 1 << BTREE_NODE_need_write; + } while ((v = cmpxchg(&b->flags, old, new)) != old); + + btree_node_write_if_need(c, b, SIX_LOCK_read); + six_unlock_read(&b->c.lock); + + bch2_trans_exit(&trans); + return 0; +} 
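The helpers above are the building blocks of the leaf-insert fast path: the caller takes the node write lock, bch2_btree_node_prep_for_write() makes sure there is an open bset to append to, and the actual bset insert plus journal pinning then happen under that lock. The sketch below shows one way these pieces compose, loosely modelled on bch2_btree_write_buffer_flush_one() later in this patch; it is illustrative only — the function name example_insert_one_key is invented, and journal reservation, transaction-restart handling and the fall-back split/commit path are all omitted.

/*
 * Illustrative sketch: example_insert_one_key is an invented name showing how
 * bch2_btree_node_prep_for_write() and bch2_btree_insert_key_leaf() (defined
 * just below) compose into a minimal leaf insert under the node write lock.
 */
static int example_insert_one_key(struct btree_trans *trans,
				  struct btree_iter *iter,
				  struct bkey_i *insert, u64 journal_seq)
{
	struct btree_path *path;
	struct btree *b;
	int ret;

	/* Walk the iterator down to the leaf that covers insert->k.p: */
	ret = bch2_btree_iter_traverse(iter);
	if (ret)
		return ret;

	path = iter->path;
	b = path->l[0].b;

	/* Take the node write lock and make sure there is an open bset: */
	ret = bch2_btree_node_lock_write(trans, path, &b->c);
	if (ret)
		return ret;
	bch2_btree_node_prep_for_write(trans, path, b);

	/* If the key does not fit, the real callers split the node or commit: */
	if (!bch2_btree_node_insert_fits(trans->c, b, insert->k.u64s)) {
		bch2_btree_node_unlock_write(trans, path, b);
		return -BCH_ERR_btree_insert_btree_node_full;
	}

	/* Do the bset insert and pin journal_seq until the node is written: */
	bch2_btree_insert_key_leaf(trans, path, insert, journal_seq);

	bch2_btree_node_unlock_write(trans, path, b);
	return 0;
}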
+ +int bch2_btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 0, seq); +} + +int bch2_btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq) +{ + return __btree_node_flush(j, pin, 1, seq); +} + +inline void bch2_btree_add_journal_pin(struct bch_fs *c, + struct btree *b, u64 seq) +{ + struct btree_write *w = btree_current_write(b); + + bch2_journal_pin_add(&c->journal, seq, &w->journal, + btree_node_write_idx(b) == 0 + ? bch2_btree_node_flush0 + : bch2_btree_node_flush1); +} + +/** + * bch2_btree_insert_key_leaf - insert a key into a leaf node + */ +inline void bch2_btree_insert_key_leaf(struct btree_trans *trans, + struct btree_path *path, + struct bkey_i *insert, + u64 journal_seq) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(path)->b; + struct bset_tree *t = bset_tree_last(b); + struct bset *i = bset(b, t); + int old_u64s = bset_u64s(t); + int old_live_u64s = b->nr.live_u64s; + int live_u64s_added, u64s_added; + + if (unlikely(!bch2_btree_bset_insert_key(trans, path, b, + &path_l(path)->iter, insert))) + return; + + i->journal_seq = cpu_to_le64(max(journal_seq, le64_to_cpu(i->journal_seq))); + + bch2_btree_add_journal_pin(c, b, journal_seq); + + if (unlikely(!btree_node_dirty(b))) { + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + set_btree_node_dirty_acct(c, b); + } + + live_u64s_added = (int) b->nr.live_u64s - old_live_u64s; + u64s_added = (int) bset_u64s(t) - old_u64s; + + if (b->sib_u64s[0] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[0] = max(0, (int) b->sib_u64s[0] + live_u64s_added); + if (b->sib_u64s[1] != U16_MAX && live_u64s_added < 0) + b->sib_u64s[1] = max(0, (int) b->sib_u64s[1] + live_u64s_added); + + if (u64s_added > live_u64s_added && + bch2_maybe_compact_whiteouts(c, b)) + bch2_trans_node_reinit_iter(trans, b); +} + +static void btree_insert_key_leaf(struct btree_trans *trans, + struct btree_insert_entry *insert) +{ + bch2_btree_insert_key_leaf(trans, insert->path, insert->k, trans->journal_res.seq); +} + +/* Cached btree updates: */ + +/* Normal update interface: */ + +static inline void btree_insert_entry_checks(struct btree_trans *trans, + struct btree_insert_entry *i) +{ + BUG_ON(!bpos_eq(i->k->k.p, i->path->pos)); + BUG_ON(i->cached != i->path->cached); + BUG_ON(i->level != i->path->level); + BUG_ON(i->btree_id != i->path->btree_id); + EBUG_ON(!i->level && + !(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) && + test_bit(JOURNAL_REPLAY_DONE, &trans->c->journal.flags) && + i->k->k.p.snapshot && + bch2_snapshot_is_internal_node(trans->c, i->k->k.p.snapshot)); +} + +static noinline int +bch2_trans_journal_preres_get_cold(struct btree_trans *trans, unsigned flags, + unsigned long trace_ip) +{ + return drop_locks_do(trans, + bch2_journal_preres_get(&trans->c->journal, + &trans->journal_preres, + trans->journal_preres_u64s, + (flags & BCH_WATERMARK_MASK))); +} + +static __always_inline int bch2_trans_journal_res_get(struct btree_trans *trans, + unsigned flags) +{ + return bch2_journal_res_get(&trans->c->journal, &trans->journal_res, + trans->journal_u64s, flags); +} + +#define JSET_ENTRY_LOG_U64s 4 + +static noinline void journal_transaction_name(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct jset_entry *entry = + bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_log, 0, 0, + JSET_ENTRY_LOG_U64s); + struct jset_entry_log *l = + container_of(entry, struct jset_entry_log, entry); + +
strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); +} + +static inline int btree_key_can_insert(struct btree_trans *trans, + struct btree *b, unsigned u64s) +{ + struct bch_fs *c = trans->c; + + if (!bch2_btree_node_insert_fits(c, b, u64s)) + return -BCH_ERR_btree_insert_btree_node_full; + + return 0; +} + +static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags, + struct btree_path *path, unsigned u64s) +{ + struct bch_fs *c = trans->c; + struct bkey_cached *ck = (void *) path->l[0].b; + struct btree_insert_entry *i; + unsigned new_u64s; + struct bkey_i *new_k; + + EBUG_ON(path->level); + + if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && + bch2_btree_key_cache_must_wait(c) && + !(flags & BTREE_INSERT_JOURNAL_RECLAIM)) + return -BCH_ERR_btree_insert_need_journal_reclaim; + + /* + * bch2_varint_decode can read past the end of the buffer by at most 7 + * bytes (it won't be used): + */ + u64s += 1; + + if (u64s <= ck->u64s) + return 0; + + new_u64s = roundup_pow_of_two(u64s); + new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOFS); + if (!new_k) { + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", + bch2_btree_ids[path->btree_id], new_u64s); + return -BCH_ERR_ENOMEM_btree_key_cache_insert; + } + + trans_for_each_update(trans, i) + if (i->old_v == &ck->k->v) + i->old_v = &new_k->v; + + ck->u64s = new_u64s; + ck->k = new_k; + return 0; +} + +/* Triggers: */ + +static int run_one_mem_trigger(struct btree_trans *trans, + struct btree_insert_entry *i, + unsigned flags) +{ + struct bkey_s_c old = { &i->old_k, i->old_v }; + struct bkey_i *new = i->k; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + int ret; + + verify_update_old_key(trans, i); + + if (unlikely(flags & BTREE_TRIGGER_NORUN)) + return 0; + + if (!btree_node_type_needs_gc(i->btree_id)) + return 0; + + if (old_ops->atomic_trigger == new_ops->atomic_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + ret = bch2_mark_key(trans, i->btree_id, i->level, + old, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags); + } else { + struct bkey _deleted = KEY(0, 0, 0); + struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL }; + + _deleted.p = i->path->pos; + + ret = bch2_mark_key(trans, i->btree_id, i->level, + deleted, bkey_i_to_s_c(new), + BTREE_TRIGGER_INSERT|flags) ?: + bch2_mark_key(trans, i->btree_id, i->level, + old, deleted, + BTREE_TRIGGER_OVERWRITE|flags); + } + + return ret; +} + +static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_entry *i, + bool overwrite) +{ + /* + * Transactional triggers create new btree_insert_entries, so we can't + * pass them a pointer to a btree_insert_entry, that memory is going to + * move: + */ + struct bkey old_k = i->old_k; + struct bkey_s_c old = { &old_k, i->old_v }; + const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type); + const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type); + + verify_update_old_key(trans, i); + + if ((i->flags & BTREE_TRIGGER_NORUN) || + !(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type))) + return 0; + + if (!i->insert_trigger_run && + !i->overwrite_trigger_run && + old_ops->trans_trigger == new_ops->trans_trigger && + ((1U << old.k->type) & BTREE_TRIGGER_WANTS_OLD_AND_NEW)) { + i->overwrite_trigger_run = true; + i->insert_trigger_run = true; + return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k, + 
BTREE_TRIGGER_INSERT| + BTREE_TRIGGER_OVERWRITE| + i->flags) ?: 1; + } else if (overwrite && !i->overwrite_trigger_run) { + i->overwrite_trigger_run = true; + return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1; + } else if (!overwrite && !i->insert_trigger_run) { + i->insert_trigger_run = true; + return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1; + } else { + return 0; + } +} + +static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, + struct btree_insert_entry *btree_id_start) +{ + struct btree_insert_entry *i; + bool trans_trigger_run; + int ret, overwrite; + + for (overwrite = 1; overwrite >= 0; --overwrite) { + + /* + * Running triggers will append more updates to the list of updates as + * we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = btree_id_start; + i < trans->updates + trans->nr_updates && i->btree_id <= btree_id; + i++) { + if (i->btree_id != btree_id) + continue; + + ret = run_one_trans_trigger(trans, i, overwrite); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; + } + } while (trans_trigger_run); + } + + return 0; +} + +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) +{ + struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates; + unsigned btree_id = 0; + int ret = 0; + + /* + * + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being moved + * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before + * they are re-added. + */ + for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { + if (btree_id == BTREE_ID_alloc) + continue; + + while (btree_id_start < trans->updates + trans->nr_updates && + btree_id_start->btree_id < btree_id) + btree_id_start++; + + ret = run_btree_triggers(trans, btree_id, btree_id_start); + if (ret) + return ret; + } + + trans_for_each_update(trans, i) { + if (i->btree_id > BTREE_ID_alloc) + break; + if (i->btree_id == BTREE_ID_alloc) { + ret = run_btree_triggers(trans, BTREE_ID_alloc, i); + if (ret) + return ret; + break; + } + } + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(!(i->flags & BTREE_TRIGGER_NORUN) && + (BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS & (1U << i->bkey_type)) && + (!i->insert_trigger_run || !i->overwrite_trigger_run)); +#endif + return 0; +} + +static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + /* + * XXX: synchronization of cached update triggers with gc + * XXX: synchronization of interior node updates with gc + */ + BUG_ON(i->cached || i->level); + + if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) { + ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC); + if (ret) + break; + } + } + + return ret; +} + +static inline int +bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + struct btree_trans_commit_hook *h; + unsigned u64s = 0; + bool marking = false; + int ret; + + if (race_fault()) { + trace_and_count(c, trans_restart_fault_inject, trans, trace_ip); + return btree_trans_restart_nounlock(trans, BCH_ERR_transaction_restart_fault_inject); + } + + /* + * Check if the insert will fit in the leaf node with the 
write lock + * held, otherwise another thread could write the node changing the + * amount of space available: + */ + + prefetch(&trans->c->journal.flags); + + trans_for_each_update(trans, i) { + /* Multiple inserts might go to same leaf: */ + if (!same_leaf_as_prev(trans, i)) + u64s = 0; + + u64s += i->k->k.u64s; + ret = !i->cached + ? btree_key_can_insert(trans, insert_l(i)->b, u64s) + : btree_key_can_insert_cached(trans, flags, i->path, u64s); + if (ret) { + *stopped_at = i; + return ret; + } + + if (btree_node_type_needs_gc(i->bkey_type)) + marking = true; + } + + if (trans->nr_wb_updates && + trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size) + return -BCH_ERR_btree_insert_need_flush_buffer; + + /* + * Don't get journal reservation until after we know insert will + * succeed: + */ + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + ret = bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_NONBLOCK); + if (ret) + return ret; + + if (unlikely(trans->journal_transaction_names)) + journal_transaction_name(trans); + } else { + trans->journal_res.seq = c->journal.replay_journal_seq; + } + + /* + * Not allowed to fail after we've gotten our journal reservation - we + * have to use it: + */ + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && + !(flags & BTREE_INSERT_JOURNAL_REPLAY)) { + if (bch2_journal_seq_verify) + trans_for_each_update(trans, i) + i->k->k.version.lo = trans->journal_res.seq; + else if (bch2_inject_invalid_keys) + trans_for_each_update(trans, i) + i->k->k.version = MAX_VERSION; + } + + if (trans->fs_usage_deltas && + bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + if (trans->nr_wb_updates) { + EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY); + + ret = bch2_btree_insert_keys_write_buffer(trans); + if (ret) + goto revert_fs_usage; + } + + h = trans->hooks; + while (h) { + ret = h->fn(trans, h); + if (ret) + goto revert_fs_usage; + h = h->next; + } + + trans_for_each_update(trans, i) + if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) { + ret = run_one_mem_trigger(trans, i, i->flags); + if (ret) + goto fatal_err; + } + + if (unlikely(c->gc_pos.phase)) { + ret = bch2_trans_commit_run_gc_triggers(trans); + if (ret) + goto fatal_err; + } + + if (unlikely(trans->extra_journal_entries.nr)) { + memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), + trans->extra_journal_entries.data, + trans->extra_journal_entries.nr); + + trans->journal_res.offset += trans->extra_journal_entries.nr; + trans->journal_res.u64s -= trans->extra_journal_entries.nr; + } + + if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) { + struct journal *j = &c->journal; + struct jset_entry *entry; + + trans_for_each_update(trans, i) { + if (i->key_cache_already_flushed) + continue; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + verify_update_old_key(trans, i); + + if (trans->journal_transaction_names) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_overwrite, + i->btree_id, i->level, + i->old_k.u64s); + bkey_reassemble(&entry->start[0], + (struct bkey_s_c) { &i->old_k, i->old_v }); + } + + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + i->btree_id, i->level, + i->k->k.u64s); + bkey_copy(&entry->start[0], i->k); + } + + trans_for_each_wb_update(trans, wb) { + entry = bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_btree_keys, + wb->btree, 0, + wb->k.k.u64s); + 
bkey_copy(&entry->start[0], &wb->k); + } + + if (trans->journal_seq) + *trans->journal_seq = trans->journal_res.seq; + } + + trans_for_each_update(trans, i) { + i->k->k.needs_whiteout = false; + + if (!i->cached) + btree_insert_key_leaf(trans, i); + else if (!i->key_cache_already_flushed) + bch2_btree_insert_key_cached(trans, flags, i); + else { + bch2_btree_key_cache_drop(trans, i->path); + btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE); + } + } + + return 0; +fatal_err: + bch2_fatal_error(c); +revert_fs_usage: + if (trans->fs_usage_deltas) + bch2_trans_fs_usage_revert(trans, trans->fs_usage_deltas); + return ret; +} + +static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btree_insert_entry *i) +{ + while (--i >= trans->updates) { + if (same_leaf_as_prev(trans, i)) + continue; + + bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b); + } + + trace_and_count(trans->c, trans_restart_would_deadlock_write, trans); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock_write); +} + +static inline int trans_lock_write(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + + trans_for_each_update(trans, i) { + if (same_leaf_as_prev(trans, i)) + continue; + + if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c)) + return trans_lock_write_fail(trans, i); + + if (!i->cached) + bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b); + } + + return 0; +} + +static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans) +{ + struct btree_insert_entry *i; + struct btree_write_buffered_key *wb; + + trans_for_each_update(trans, i) + bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p); + + trans_for_each_wb_update(trans, wb) + bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry *i, + struct printbuf *err) +{ + struct bch_fs *c = trans->c; + int rw = (flags & BTREE_INSERT_JOURNAL_REPLAY) ? 
READ : WRITE; + + printbuf_reset(err); + prt_printf(err, "invalid bkey on insert from %s -> %ps", + trans->fn, (void *) i->ip_allocated); + prt_newline(err); + printbuf_indent_add(err, 2); + + bch2_bkey_val_to_text(err, c, bkey_i_to_s_c(i->k)); + prt_newline(err); + + bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, rw, err); + bch2_print_string_as_lines(KERN_ERR, err->buf); + + bch2_inconsistent_error(c); + bch2_dump_trans_updates(trans); + printbuf_exit(err); + + return -EINVAL; +} +#endif + +/* + * Get journal reservation, take write locks, and attempt to do btree update(s): + */ +static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry **stopped_at, + unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret, u64s_delta = 0; + +#ifdef CONFIG_BCACHEFS_DEBUG + struct printbuf buf = PRINTBUF; + + trans_for_each_update(trans, i) { + enum bkey_invalid_flags invalid_flags = 0; + + if (!(flags & BTREE_INSERT_JOURNAL_REPLAY)) + invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT; + + if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k), + i->bkey_type, invalid_flags, &buf))) + return bch2_trans_commit_bkey_invalid(trans, flags, i, &buf); + btree_insert_entry_checks(trans, i); + } + printbuf_exit(&buf); +#endif + + trans_for_each_update(trans, i) { + if (i->cached) + continue; + + u64s_delta += !bkey_deleted(&i->k->k) ? i->k->k.u64s : 0; + u64s_delta -= i->old_btree_u64s; + + if (!same_leaf_as_next(trans, i)) { + if (u64s_delta <= 0) { + ret = bch2_foreground_maybe_merge(trans, i->path, + i->level, flags); + if (unlikely(ret)) + return ret; + } + + u64s_delta = 0; + } + } + + ret = bch2_journal_preres_get(&c->journal, + &trans->journal_preres, trans->journal_preres_u64s, + (flags & BCH_WATERMARK_MASK)|JOURNAL_RES_GET_NONBLOCK); + if (unlikely(ret == -BCH_ERR_journal_preres_get_blocked)) + ret = bch2_trans_journal_preres_get_cold(trans, flags, trace_ip); + if (unlikely(ret)) + return ret; + + ret = trans_lock_write(trans); + if (unlikely(ret)) + return ret; + + ret = bch2_trans_commit_write_locked(trans, flags, stopped_at, trace_ip); + + if (!ret && unlikely(trans->journal_replay_not_finished)) + bch2_drop_overwrites_from_journal(trans); + + trans_for_each_update(trans, i) + if (!same_leaf_as_prev(trans, i)) + bch2_btree_node_unlock_write_inlined(trans, i->path, + insert_l(i)->b); + + if (!ret && trans->journal_pin) + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, + trans->journal_pin, NULL); + + /* + * Drop journal reservation after dropping write locks, since dropping + * the journal reservation may kick off a journal write: + */ + bch2_journal_res_put(&c->journal, &trans->journal_res); + + if (unlikely(ret)) + return ret; + + bch2_trans_downgrade(trans); + + return 0; +} + +static int journal_reclaim_wait_done(struct bch_fs *c) +{ + int ret = bch2_journal_error(&c->journal) ?: + !bch2_btree_key_cache_must_wait(c); + + if (!ret) + journal_reclaim_kick(&c->journal); + return ret; +} + +static noinline +int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, + struct btree_insert_entry *i, + int ret, unsigned long trace_ip) +{ + struct bch_fs *c = trans->c; + + switch (ret) { + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path); + break; + case 
-BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + bch2_replicas_delta_list_mark(c, trans->fs_usage_deltas)); + break; + case -BCH_ERR_journal_res_get_blocked: + /* + * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK + * flag + */ + if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) && + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) { + ret = -BCH_ERR_journal_reclaim_would_deadlock; + break; + } + + ret = drop_locks_do(trans, + bch2_trans_journal_res_get(trans, + (flags & BCH_WATERMARK_MASK)| + JOURNAL_RES_GET_CHECK)); + break; + case -BCH_ERR_btree_insert_need_journal_reclaim: + bch2_trans_unlock(trans); + + trace_and_count(c, trans_blocked_journal_reclaim, trans, trace_ip); + + wait_event_freezable(c->journal.reclaim_wait, + (ret = journal_reclaim_wait_done(c))); + if (ret < 0) + break; + + ret = bch2_trans_relock(trans); + break; + case -BCH_ERR_btree_insert_need_flush_buffer: { + struct btree_write_buffer *wb = &c->btree_write_buffer; + + ret = 0; + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_unlock(trans); + mutex_lock(&wb->flush_lock); + + if (wb->state.nr > wb->size * 3 / 4) { + bch2_trans_begin(trans); + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + } else { + mutex_unlock(&wb->flush_lock); + ret = bch2_trans_relock(trans); + } + } + break; + } + default: + BUG_ON(ret >= 0); + break; + } + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && + !(flags & BTREE_INSERT_NOWAIT) && + (flags & BTREE_INSERT_NOFAIL), c, + "%s: incorrectly got %s\n", __func__, bch2_err_str(ret)); + + return ret; +} + +static noinline int +bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + int ret; + + if (likely(!(flags & BTREE_INSERT_LAZY_RW)) || + test_bit(BCH_FS_STARTED, &c->flags)) + return -BCH_ERR_erofs_trans_commit; + + ret = drop_locks_do(trans, bch2_fs_read_write_early(c)); + if (ret) + return ret; + + bch2_write_ref_get(c, BCH_WRITE_REF_trans); + return 0; +} + +/* + * This is for updates done in the early part of fsck - btree_gc - before we've + * gone RW. we only add the new key to the list of keys for journal replay to + * do. 
+ */ +static noinline int +do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i; + int ret = 0; + + trans_for_each_update(trans, i) { + ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->k); + if (ret) + break; + } + + return ret; +} + +int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i = NULL; + struct btree_write_buffered_key *wb; + unsigned u64s; + int ret = 0; + + if (!trans->nr_updates && + !trans->nr_wb_updates && + !trans->extra_journal_entries.nr) + goto out_reset; + + if (flags & BTREE_INSERT_GC_LOCK_HELD) + lockdep_assert_held(&c->gc_lock); + + ret = bch2_trans_commit_run_triggers(trans); + if (ret) + goto out_reset; + + if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) { + ret = do_bch2_trans_commit_to_journal_replay(trans); + goto out_reset; + } + + if (!(flags & BTREE_INSERT_NOCHECK_RW) && + unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + ret = bch2_trans_commit_get_rw_cold(trans, flags); + if (ret) + goto out_reset; + } + + if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 && + mutex_trylock(&c->btree_write_buffer.flush_lock)) { + bch2_trans_begin(trans); + bch2_trans_unlock(trans); + + ret = __bch2_btree_write_buffer_flush(trans, + flags|BTREE_INSERT_NOCHECK_RW, true); + if (!ret) { + trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush); + } + goto out; + } + + EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags)); + + memset(&trans->journal_preres, 0, sizeof(trans->journal_preres)); + + trans->journal_u64s = trans->extra_journal_entries.nr; + trans->journal_preres_u64s = 0; + + trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); + + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); + + trans_for_each_update(trans, i) { + EBUG_ON(!i->path->should_be_locked); + + ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1); + if (unlikely(ret)) + goto out; + + EBUG_ON(!btree_node_intent_locked(i->path, i->level)); + + if (i->key_cache_already_flushed) + continue; + + /* we're going to journal the key being updated: */ + u64s = jset_u64s(i->k->k.u64s); + if (i->cached && + likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) + trans->journal_preres_u64s += u64s; + + if (i->flags & BTREE_UPDATE_NOJOURNAL) + continue; + + trans->journal_u64s += u64s; + + /* and we're also going to log the overwrite: */ + if (trans->journal_transaction_names) + trans->journal_u64s += jset_u64s(i->old_k.u64s); + } + + trans_for_each_wb_update(trans, wb) + trans->journal_u64s += jset_u64s(wb->k.k.u64s); + + if (trans->extra_journal_res) { + ret = bch2_disk_reservation_add(c, trans->disk_res, + trans->extra_journal_res, + (flags & BTREE_INSERT_NOFAIL) + ? 
BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto err; + } +retry: + bch2_trans_verify_not_in_restart(trans); + memset(&trans->journal_res, 0, sizeof(trans->journal_res)); + + ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_); + + /* make sure we didn't drop or screw up locks: */ + bch2_trans_verify_locks(trans); + + if (ret) + goto err; + + trace_and_count(c, transaction_commit, trans, _RET_IP_); +out: + bch2_journal_preres_put(&c->journal, &trans->journal_preres); + + if (likely(!(flags & BTREE_INSERT_NOCHECK_RW))) + bch2_write_ref_put(c, BCH_WRITE_REF_trans); +out_reset: + bch2_trans_reset_updates(trans); + + return ret; +err: + ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_); + if (ret) + goto out; + + goto retry; +} + +static noinline int __check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, id, pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while (1) { + k = bch2_btree_iter_prev(&iter); + ret = bkey_err(k); + if (ret) + break; + + if (!k.k) + break; + + if (!bkey_eq(pos, k.k->p)) + break; + + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) { + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static inline int check_pos_snapshot_overwritten(struct btree_trans *trans, + enum btree_id id, + struct bpos pos) +{ + if (!btree_type_has_snapshots(id) || + bch2_snapshot_is_leaf(trans->c, pos.snapshot)) + return 0; + + return __check_pos_snapshot_overwritten(trans, id, pos); +} + +static noinline int extent_front_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bkey_i **insert, + enum btree_update_flags flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *update; + int ret; + + update = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + return ret; + + if (!bch2_bkey_merge(c, bkey_i_to_s(update), bkey_i_to_s_c(*insert))) + return 0; + + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, (*insert)->k.p); + if (ret < 0) + return ret; + if (ret) + return 0; + + ret = bch2_btree_delete_at(trans, iter, flags); + if (ret) + return ret; + + *insert = update; + return 0; +} + +static noinline int extent_back_merge(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + int ret; + + ret = check_pos_snapshot_overwritten(trans, iter->btree_id, insert->k.p) ?: + check_pos_snapshot_overwritten(trans, iter->btree_id, k.k->p); + if (ret < 0) + return ret; + if (ret) + return 0; + + bch2_bkey_merge(c, bkey_i_to_s(insert), k); + return 0; +} + +/* + * When deleting, check if we need to emit a whiteout (because we're overwriting + * something in an ancestor snapshot) + */ +static int need_whiteout_for_snapshot(struct btree_trans *trans, + enum btree_id btree_id, struct bpos pos) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot = pos.snapshot; + int ret; + + if (!bch2_snapshot_parent(trans->c, pos.snapshot)) + return 0; + + pos.snapshot++; + + for_each_btree_key_norestart(trans, iter, btree_id, pos, + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOPRESERVE, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (bch2_snapshot_is_ancestor(trans->c, snapshot, + k.k->p.snapshot)) { + ret = !bkey_whiteout(k.k); + 
break; + } + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, + enum btree_id id, + struct bpos old_pos, + struct bpos new_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter old_iter, new_iter; + struct bkey_s_c old_k, new_k; + snapshot_id_list s; + struct bkey_i *update; + int ret; + + if (!bch2_snapshot_has_children(c, old_pos.snapshot)) + return 0; + + darray_init(&s); + + bch2_trans_iter_init(trans, &old_iter, id, old_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_ALL_SNAPSHOTS); + while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + !(ret = bkey_err(old_k)) && + bkey_eq(old_pos, old_k.k->p)) { + struct bpos whiteout_pos = + SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot);; + + if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || + snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) + continue; + + new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bkey_err(new_k); + if (ret) + break; + + if (new_k.k->type == KEY_TYPE_deleted) { + update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(update); + if (ret) + break; + + bkey_init(&update->k); + update->k.p = whiteout_pos; + update->k.type = KEY_TYPE_whiteout; + + ret = bch2_trans_update(trans, &new_iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } + bch2_trans_iter_exit(trans, &new_iter); + + ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &old_iter); + darray_exit(&s); + + return ret; +} + +int bch2_trans_update_extent(struct btree_trans *trans, + struct btree_iter *orig_iter, + struct bkey_i *insert, + enum btree_update_flags flags) +{ + struct btree_iter iter; + struct bpos start = bkey_start_pos(&insert->k); + struct bkey_i *update; + struct bkey_s_c k; + enum btree_id btree_id = orig_iter->btree_id; + int ret = 0, compressed_sectors; + + bch2_trans_iter_init(trans, &iter, btree_id, start, + BTREE_ITER_INTENT| + BTREE_ITER_WITH_UPDATES| + BTREE_ITER_NOT_EXTENTS); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + + if (bkey_eq(k.k->p, bkey_start_pos(&insert->k))) { + if (bch2_bkey_maybe_mergable(k.k, &insert->k)) { + ret = extent_front_merge(trans, &iter, k, &insert, flags); + if (ret) + goto err; + } + + goto next; + } + + while (bkey_gt(insert->k.p, bkey_start_pos(k.k))) { + bool front_split = bkey_lt(bkey_start_pos(k.k), start); + bool back_split = bkey_gt(k.k->p, insert->k.p); + + /* + * If we're going to be splitting a compressed extent, note it + * so that __bch2_trans_commit() can increase our disk + * reservation: + */ + if (((front_split && back_split) || + ((front_split || back_split) && k.k->p.snapshot != insert->k.p.snapshot)) && + (compressed_sectors = bch2_bkey_sectors_compressed(k))) + trans->extra_journal_res += compressed_sectors; + + if (front_split) { + update = bch2_bkey_make_mut_noupdate(trans, k); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bch2_cut_back(start, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + k.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + goto err; + } + + if (k.k->p.snapshot != insert->k.p.snapshot && + (front_split || back_split)) { + update = bch2_bkey_make_mut_noupdate(trans, k); + if ((ret = 
PTR_ERR_OR_ZERO(update))) + goto err; + + bch2_cut_front(start, update); + bch2_cut_back(insert->k.p, update); + + ret = bch2_insert_snapshot_whiteouts(trans, btree_id, + k.k->p, update->k.p) ?: + bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + goto err; + } + + if (bkey_le(k.k->p, insert->k.p)) { + update = bch2_trans_kmalloc(trans, sizeof(*update)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bkey_init(&update->k); + update->k.p = k.k->p; + update->k.p.snapshot = insert->k.p.snapshot; + + if (insert->k.p.snapshot != k.k->p.snapshot) { + update->k.type = KEY_TYPE_whiteout; + } else if (btree_type_has_snapshots(btree_id)) { + ret = need_whiteout_for_snapshot(trans, btree_id, update->k.p); + if (ret < 0) + goto err; + if (ret) + update->k.type = KEY_TYPE_whiteout; + } + + ret = bch2_btree_insert_nonextent(trans, btree_id, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|flags); + if (ret) + goto err; + } + + if (back_split) { + update = bch2_bkey_make_mut_noupdate(trans, k); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + + bch2_cut_front(insert->k.p, update); + + ret = bch2_trans_update_by_path(trans, iter.path, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + flags); + if (ret) + goto err; + goto out; + } +next: + bch2_btree_iter_advance(&iter); + k = bch2_btree_iter_peek_upto(&iter, POS(insert->k.p.inode, U64_MAX)); + if ((ret = bkey_err(k))) + goto err; + if (!k.k) + goto out; + } + + if (bch2_bkey_maybe_mergable(&insert->k, k.k)) { + ret = extent_back_merge(trans, &iter, insert, k); + if (ret) + goto err; + } +out: + if (!bkey_deleted(&insert->k)) { + /* + * Rewinding iterators is expensive: get a new one and the one + * that points to the start of insert will be cloned from: + */ + bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, btree_id, insert->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, insert, flags); + } +err: + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int __must_check +bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip); + +static noinline int flush_new_cached_update(struct btree_trans *trans, + struct btree_path *path, + struct btree_insert_entry *i, + enum btree_update_flags flags, + unsigned long ip) +{ + struct btree_path *btree_path; + struct bkey k; + int ret; + + btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT, _THIS_IP_); + ret = bch2_btree_path_traverse(trans, btree_path, 0); + if (ret) + goto out; + + /* + * The old key in the insert entry might actually refer to an existing + * key in the btree that has been deleted from cache and not yet + * flushed. Check for this and skip the flush so we don't run triggers + * against a stale key. 
+ */ + bch2_btree_path_peek_slot_exact(btree_path, &k); + if (!bkey_deleted(&k)) + goto out; + + i->key_cache_already_flushed = true; + i->flags |= BTREE_TRIGGER_NORUN; + + btree_path_set_should_be_locked(btree_path); + ret = bch2_trans_update_by_path_trace(trans, btree_path, i->k, flags, ip); +out: + bch2_path_put(trans, btree_path, true); + return ret; +} + +static int __must_check +bch2_trans_update_by_path_trace(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags, + unsigned long ip) +{ + struct bch_fs *c = trans->c; + struct btree_insert_entry *i, n; + int cmp; + + EBUG_ON(!path->should_be_locked); + EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX); + EBUG_ON(!bpos_eq(k->k.p, path->pos)); + + n = (struct btree_insert_entry) { + .flags = flags, + .bkey_type = __btree_node_type(path->level, path->btree_id), + .btree_id = path->btree_id, + .level = path->level, + .cached = path->cached, + .path = path, + .k = k, + .ip_allocated = ip, + }; + +#ifdef CONFIG_BCACHEFS_DEBUG + trans_for_each_update(trans, i) + BUG_ON(i != trans->updates && + btree_insert_entry_cmp(i - 1, i) >= 0); +#endif + + /* + * Pending updates are kept sorted: first, find position of new update, + * then delete/trim any updates the new update overwrites: + */ + trans_for_each_update(trans, i) { + cmp = btree_insert_entry_cmp(&n, i); + if (cmp <= 0) + break; + } + + if (!cmp && i < trans->updates + trans->nr_updates) { + EBUG_ON(i->insert_trigger_run || i->overwrite_trigger_run); + + bch2_path_put(trans, i->path, true); + i->flags = n.flags; + i->cached = n.cached; + i->k = n.k; + i->path = n.path; + i->ip_allocated = n.ip_allocated; + } else { + array_insert_item(trans->updates, trans->nr_updates, + i - trans->updates, n); + + i->old_v = bch2_btree_path_peek_slot_exact(path, &i->old_k).v; + i->old_btree_u64s = !bkey_deleted(&i->old_k) ? i->old_k.u64s : 0; + + if (unlikely(trans->journal_replay_not_finished)) { + struct bkey_i *j_k = + bch2_journal_keys_peek_slot(c, n.btree_id, n.level, k->k.p); + + if (j_k) { + i->old_k = j_k->k; + i->old_v = &j_k->v; + } + } + } + + __btree_path_get(i->path, true); + + /* + * If a key is present in the key cache, it must also exist in the + * btree - this is necessary for cache coherency. 
When iterating over + * a btree that's cached in the key cache, the btree iter code checks + * the key cache - but the key has to exist in the btree for that to + * work: + */ + if (path->cached && bkey_deleted(&i->old_k)) + return flush_new_cached_update(trans, path, i, flags, ip); + + return 0; +} + +static inline int __must_check +bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path, + struct bkey_i *k, enum btree_update_flags flags) +{ + return bch2_trans_update_by_path_trace(trans, path, k, flags, _RET_IP_); +} + +int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_path *path = iter->update_path ?: iter->path; + struct bkey_cached *ck; + int ret; + + if (iter->flags & BTREE_ITER_IS_EXTENTS) + return bch2_trans_update_extent(trans, iter, k, flags); + + if (bkey_deleted(&k->k) && + !(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)) { + ret = need_whiteout_for_snapshot(trans, iter->btree_id, k->k.p); + if (unlikely(ret < 0)) + return ret; + + if (ret) + k->k.type = KEY_TYPE_whiteout; + } + + /* + * Ensure that updates to cached btrees go to the key cache: + */ + if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) && + !path->cached && + !path->level && + btree_id_cached(trans->c, path->btree_id)) { + if (!iter->key_cache_path || + !iter->key_cache_path->should_be_locked || + !bpos_eq(iter->key_cache_path->pos, k->k.p)) { + if (!iter->key_cache_path) + iter->key_cache_path = + bch2_path_get(trans, path->btree_id, path->pos, 1, 0, + BTREE_ITER_INTENT| + BTREE_ITER_CACHED, _THIS_IP_); + + iter->key_cache_path = + bch2_btree_path_set_pos(trans, iter->key_cache_path, path->pos, + iter->flags & BTREE_ITER_INTENT, + _THIS_IP_); + + ret = bch2_btree_path_traverse(trans, iter->key_cache_path, + BTREE_ITER_CACHED); + if (unlikely(ret)) + return ret; + + ck = (void *) iter->key_cache_path->l[0].b; + + if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) { + trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_); + return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced); + } + + btree_path_set_should_be_locked(iter->key_cache_path); + } + + path = iter->key_cache_path; + } + + return bch2_trans_update_by_path(trans, path, k, flags); +} + +int __must_check bch2_trans_update_buffered(struct btree_trans *trans, + enum btree_id btree, + struct bkey_i *k) +{ + struct btree_write_buffered_key *i; + int ret; + + EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size); + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + trans_for_each_wb_update(trans, i) { + if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) { + bkey_copy(&i->k, k); + return 0; + } + } + + if (!trans->wb_updates || + trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_write_buffered_key *u; + + if (trans->nr_wb_updates == trans->wb_updates_size) { + struct btree_transaction_stats *s = btree_trans_stats(trans); + + BUG_ON(trans->wb_updates_size > U8_MAX / 2); + trans->wb_updates_size = max(1, trans->wb_updates_size * 2); + if (s) + s->wb_updates_size = trans->wb_updates_size; + } + + u = bch2_trans_kmalloc_nomemzero(trans, + trans->wb_updates_size * + sizeof(struct btree_write_buffered_key)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + if (trans->nr_wb_updates) + memcpy(u, trans->wb_updates, trans->nr_wb_updates * + sizeof(struct btree_write_buffered_key)); + trans->wb_updates = u; + } + + 
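+	/* Room is guaranteed above (wb_updates is grown by doubling); append the new buffered key: */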
trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) { + .btree = btree, + }; + + bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k); + trans->nr_wb_updates++; + + return 0; +} + +int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, + enum btree_id btree, struct bpos end) +{ + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, iter, btree, POS_MAX, BTREE_ITER_INTENT); + k = bch2_btree_iter_prev(iter); + ret = bkey_err(k); + if (ret) + goto err; + + bch2_btree_iter_advance(iter); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + BUG_ON(k.k->type != KEY_TYPE_deleted); + + if (bkey_gt(k.k->p, end)) { + ret = -BCH_ERR_ENOSPC_btree_slot; + goto err; + } + + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + +void bch2_trans_commit_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + h->next = trans->hooks; + trans->hooks = h; +} + +int bch2_btree_insert_nonextent(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k, + enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, btree, k->k.p, + BTREE_ITER_NOT_EXTENTS| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int __bch2_btree_insert(struct btree_trans *trans, enum btree_id id, + struct bkey_i *k, enum btree_update_flags flags) +{ + struct btree_iter iter; + int ret; + + bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), + BTREE_ITER_CACHED| + BTREE_ITER_INTENT); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, k, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/** + * bch2_btree_insert - insert a key into a given btree + * @c: pointer to struct bch_fs + * @id: btree to insert into + * @k: key to insert + * @disk_res: disk reservation to use, may be NULL + * @journal_seq: if non-NULL, filled in with the journal sequence number of the commit + * @flags: transaction commit flags + */ +int bch2_btree_insert(struct bch_fs *c, enum btree_id id, + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 *journal_seq, int flags) +{ + return bch2_trans_do(c, disk_res, journal_seq, flags, + __bch2_btree_insert(&trans, id, k, 0)); +} + +int bch2_btree_delete_extent_at(struct btree_trans *trans, struct btree_iter *iter, + unsigned len, unsigned update_flags) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = iter->pos; + bch2_key_resize(&k->k, len); + return bch2_trans_update(trans, iter, k, update_flags); +} + +int bch2_btree_delete_at(struct btree_trans *trans, + struct btree_iter *iter, unsigned update_flags) +{ + return bch2_btree_delete_extent_at(trans, iter, 0, update_flags); +} + +int bch2_btree_delete_at_buffered(struct btree_trans *trans, + enum btree_id btree, struct bpos pos) +{ + struct bkey_i *k; + + k = bch2_trans_kmalloc(trans, sizeof(*k)); + if (IS_ERR(k)) + return PTR_ERR(k); + + bkey_init(&k->k); + k->k.p = pos; + return bch2_trans_update_buffered(trans, btree, k); +} + +int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + u32 restart_count = trans->restart_count; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_INTENT); + while ((k = bch2_btree_iter_peek_upto(&iter, end)).k) { + struct 
disk_reservation disk_res = + bch2_disk_reservation_init(trans->c, 0); + struct bkey_i delete; + + ret = bkey_err(k); + if (ret) + goto err; + + bkey_init(&delete.k); + + /* + * This could probably be more efficient for extents: + */ + + /* + * For extents, iter.pos won't necessarily be the same as + * bkey_start_pos(k.k) (for non extents they always will be the + * same). It's important that we delete starting from iter.pos + * because the range we want to delete could start in the middle + * of k. + * + * (bch2_btree_iter_peek() does guarantee that iter.pos >= + * bkey_start_pos(k.k)). + */ + delete.k.p = iter.pos; + + if (iter.flags & BTREE_ITER_IS_EXTENTS) + bch2_key_resize(&delete.k, + bpos_min(end, k.k->p).offset - + iter.pos.offset); + + ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?: + bch2_trans_commit(trans, &disk_res, journal_seq, + BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(trans->c, &disk_res); +err: + /* + * the bch2_trans_begin() call is in a weird place because we + * need to call it after every transaction commit, to avoid path + * overflow, but don't want to call it if the delete operation + * is a no-op and we have no work to do: + */ + bch2_trans_begin(trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; +} + +/* + * bch_btree_delete_range - delete everything within a given range + * + * Range is a half open interval - [start, end) + */ +int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id, + struct bpos start, struct bpos end, + unsigned update_flags, + u64 *journal_seq) +{ + int ret = bch2_trans_run(c, + bch2_btree_delete_range_trans(&trans, id, start, end, + update_flags, journal_seq)); + if (ret == -BCH_ERR_transaction_restart_nested) + ret = 0; + return ret; +} + +static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args) +{ + struct printbuf buf = PRINTBUF; + struct jset_entry_log *l; + unsigned u64s; + int ret; + + prt_vprintf(&buf, fmt, args); + ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; + if (ret) + goto err; + + u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); + + ret = darray_make_room(entries, jset_u64s(u64s)); + if (ret) + goto err; + + l = (void *) &darray_top(*entries); + l->entry.u64s = cpu_to_le16(u64s); + l->entry.btree_id = 0; + l->entry.level = 1; + l->entry.type = BCH_JSET_ENTRY_log; + l->entry.pad[0] = 0; + l->entry.pad[1] = 0; + l->entry.pad[2] = 0; + memcpy(l->d, buf.buf, buf.pos); + while (buf.pos & 7) + l->d[buf.pos++] = '\0'; + + entries->nr += jset_u64s(u64s); +err: + printbuf_exit(&buf); + return ret; +} + +static int +__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, + va_list args) +{ + int ret; + + if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) { + ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args); + } else { + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_LAZY_RW|commit_flags, + __bch2_trans_log_msg(&trans.extra_journal_entries, fmt, args)); + } + + return ret; +} + +int bch2_fs_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, 0, fmt, args); + va_end(args); + return ret; +} + +/* + * Use for logging messages during recovery to enable reserved space and avoid + * blocking. 
+ */ +int bch2_journal_log_msg(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + int ret; + + va_start(args, fmt); + ret = __bch2_fs_log_msg(c, BCH_WATERMARK_reclaim, fmt, args); + va_end(args); + return ret; +} diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c new file mode 100644 index 000000000..6c30a72e6 --- /dev/null +++ b/fs/bcachefs/btree_write_buffer.c @@ -0,0 +1,346 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "error.h" +#include "journal.h" +#include "journal_reclaim.h" + +#include + +static int btree_write_buffered_key_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->btree, r->btree) ?: + bpos_cmp(l->k.k.p, r->k.k.p) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +static int btree_write_buffered_journal_cmp(const void *_l, const void *_r) +{ + const struct btree_write_buffered_key *l = _l; + const struct btree_write_buffered_key *r = _r; + + return cmp_int(l->journal_seq, r->journal_seq); +} + +static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans, + struct btree_iter *iter, + struct btree_write_buffered_key *wb, + unsigned commit_flags, + bool *write_locked, + size_t *fast) +{ + struct bch_fs *c = trans->c; + struct btree_path *path; + int ret; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + path = iter->path; + + if (!*write_locked) { + ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c); + if (ret) + return ret; + + bch2_btree_node_prep_for_write(trans, path, path->l[0].b); + *write_locked = true; + } + + if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) { + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + *write_locked = false; + goto trans_commit; + } + + bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); + (*fast)++; + + if (path->ref > 1) { + /* + * We can't clone a path that has write locks: if the path is + * shared, unlock before set_pos(), traverse(): + */ + bch2_btree_node_unlock_write(trans, path, path->l[0].b); + *write_locked = false; + } + return 0; +trans_commit: + return bch2_trans_update(trans, iter, &wb->k, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RECLAIM); +} + +static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb) +{ + union btree_write_buffer_state old, new; + u64 v = READ_ONCE(wb->state.v); + + do { + old.v = new.v = v; + + new.nr = 0; + new.idx++; + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + + while (old.idx == 0 ? 
wb->state.ref0 : wb->state.ref1) + cpu_relax(); + + smp_mb(); + + return old; +} + +int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags, + bool locked) +{ + struct bch_fs *c = trans->c; + struct journal *j = &c->journal; + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct journal_entry_pin pin; + struct btree_write_buffered_key *i, *keys; + struct btree_iter iter = { NULL }; + size_t nr = 0, skipped = 0, fast = 0, slowpath = 0; + bool write_locked = false; + union btree_write_buffer_state s; + int ret = 0; + + memset(&pin, 0, sizeof(pin)); + + if (!locked && !mutex_trylock(&wb->flush_lock)) + return 0; + + bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL); + bch2_journal_pin_drop(j, &wb->journal_pin); + + s = btree_write_buffer_switch(wb); + keys = wb->keys[s.idx]; + nr = s.nr; + + if (race_fault()) + goto slowpath; + + /* + * We first sort so that we can detect and skip redundant updates, and + * then we attempt to flush in sorted btree order, as this is most + * efficient. + * + * However, since we're not flushing in the order they appear in the + * journal we won't be able to drop our journal pin until everything is + * flushed - which means this could deadlock the journal if we weren't + * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail + * if it would block taking a journal reservation. + * + * If that happens, simply skip the key so we can optimistically insert + * as many keys as possible in the fast path. + */ + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_key_cmp, NULL); + + for (i = keys; i < keys + nr; i++) { + if (i + 1 < keys + nr && + i[0].btree == i[1].btree && + bpos_eq(i[0].k.k.p, i[1].k.k.p)) { + skipped++; + i->journal_seq = 0; + continue; + } + + if (write_locked && + (iter.path->btree_id != i->btree || + bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) { + bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + write_locked = false; + } + + if (!iter.path || iter.path->btree_id != i->btree) { + bch2_trans_iter_exit(trans, &iter); + bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p, BTREE_ITER_INTENT); + } + + bch2_btree_iter_set_pos(&iter, i->k.k.p); + iter.path->preserve = false; + + do { + ret = bch2_btree_write_buffer_flush_one(trans, &iter, i, + commit_flags, &write_locked, &fast); + if (!write_locked) + bch2_trans_begin(trans); + } while (bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + if (ret == -BCH_ERR_journal_reclaim_would_deadlock) { + slowpath++; + continue; + } + if (ret) + break; + + i->journal_seq = 0; + } + + if (write_locked) + bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b); + bch2_trans_iter_exit(trans, &iter); + + trace_write_buffer_flush(trans, nr, skipped, fast, wb->size); + + if (slowpath) + goto slowpath; + + bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)); +out: + bch2_journal_pin_drop(j, &pin); + mutex_unlock(&wb->flush_lock); + return ret; +slowpath: + trace_write_buffer_flush_slowpath(trans, i - keys, nr); + + /* + * Now sort the rest by journal seq and bump the journal pin as we go. + * The slowpath zapped the seq of keys that were successfully flushed so + * we can skip those here. 
+ */ + sort(keys, nr, sizeof(keys[0]), + btree_write_buffered_journal_cmp, + NULL); + + commit_flags &= ~BCH_WATERMARK_MASK; + commit_flags |= BCH_WATERMARK_reclaim; + + for (i = keys; i < keys + nr; i++) { + if (!i->journal_seq) + continue; + + if (i->journal_seq > pin.seq) { + struct journal_entry_pin pin2; + + memset(&pin2, 0, sizeof(pin2)); + + bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL); + bch2_journal_pin_drop(j, &pin); + bch2_journal_pin_copy(j, &pin, &pin2, NULL); + bch2_journal_pin_drop(j, &pin2); + } + + ret = commit_do(trans, NULL, NULL, + commit_flags| + BTREE_INSERT_NOFAIL| + BTREE_INSERT_JOURNAL_RECLAIM, + __bch2_btree_insert(trans, i->btree, &i->k, 0)); + if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret))) + break; + } + + goto out; +} + +int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans) +{ + bch2_trans_unlock(trans); + mutex_lock(&trans->c->btree_write_buffer.flush_lock); + return __bch2_btree_write_buffer_flush(trans, 0, true); +} + +int bch2_btree_write_buffer_flush(struct btree_trans *trans) +{ + return __bch2_btree_write_buffer_flush(trans, 0, false); +} + +static int bch2_btree_write_buffer_journal_flush(struct journal *j, + struct journal_entry_pin *_pin, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct btree_write_buffer *wb = &c->btree_write_buffer; + + mutex_lock(&wb->flush_lock); + + return bch2_trans_run(c, + __bch2_btree_write_buffer_flush(&trans, BTREE_INSERT_NOCHECK_RW, true)); +} + +static inline u64 btree_write_buffer_ref(int idx) +{ + return ((union btree_write_buffer_state) { + .ref0 = idx == 0, + .ref1 = idx == 1, + }).v; +} + +int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct btree_write_buffer *wb = &c->btree_write_buffer; + struct btree_write_buffered_key *i; + union btree_write_buffer_state old, new; + int ret = 0; + u64 v; + + trans_for_each_wb_update(trans, i) { + EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + i->journal_seq = trans->journal_res.seq; + i->journal_offset = trans->journal_res.offset; + } + + preempt_disable(); + v = READ_ONCE(wb->state.v); + do { + old.v = new.v = v; + + new.v += btree_write_buffer_ref(new.idx); + new.nr += trans->nr_wb_updates; + if (new.nr > wb->size) { + ret = -BCH_ERR_btree_insert_need_flush_buffer; + goto out; + } + } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v); + + memcpy(wb->keys[new.idx] + old.nr, + trans->wb_updates, + sizeof(trans->wb_updates[0]) * trans->nr_wb_updates); + + bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin, + bch2_btree_write_buffer_journal_flush); + + atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter); +out: + preempt_enable(); + return ret; +} + +void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal)); + + kvfree(wb->keys[1]); + kvfree(wb->keys[0]); +} + +int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; + + mutex_init(&wb->flush_lock); + wb->size = c->opts.btree_write_buffer_size; + + wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL); + wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL); + if (!wb->keys[0] || !wb->keys[1]) + return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init; + + return 0; +} diff --git 
a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h new file mode 100644 index 000000000..322df1c83 --- /dev/null +++ b/fs/bcachefs/btree_write_buffer.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H +#define _BCACHEFS_BTREE_WRITE_BUFFER_H + +int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool); +int bch2_btree_write_buffer_flush_sync(struct btree_trans *); +int bch2_btree_write_buffer_flush(struct btree_trans *); + +int bch2_btree_insert_keys_write_buffer(struct btree_trans *); + +void bch2_fs_btree_write_buffer_exit(struct bch_fs *); +int bch2_fs_btree_write_buffer_init(struct bch_fs *); + +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h new file mode 100644 index 000000000..99993ba77 --- /dev/null +++ b/fs/bcachefs/btree_write_buffer_types.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H +#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H + +#include "journal_types.h" + +#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4 +#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX) + +struct btree_write_buffered_key { + u64 journal_seq; + unsigned journal_offset; + enum btree_id btree; + __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX); +}; + +union btree_write_buffer_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 nr:23; + u64 idx:1; + u64 ref0:20; + u64 ref1:20; + }; +}; + +struct btree_write_buffer { + struct mutex flush_lock; + struct journal_entry_pin journal_pin; + + union btree_write_buffer_state state; + size_t size; + + struct btree_write_buffered_key *keys[2]; +}; + +#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c new file mode 100644 index 000000000..797ef5ece --- /dev/null +++ b/fs/bcachefs/buckets.c @@ -0,0 +1,2171 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for manipulating bucket marks for garbage collection. + * + * Copyright 2014 Datera, Inc. 
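+ *
+ * (Besides the GC mark functions, this file also carries filesystem and
+ * device usage accounting, the transactional trigger paths used by
+ * btree updates, and the disk reservation machinery.)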
+ */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "backpointers.h" +#include "bset.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "buckets_waiting_for_journal.h" +#include "ec.h" +#include "error.h" +#include "inode.h" +#include "movinggc.h" +#include "recovery.h" +#include "reflink.h" +#include "replicas.h" +#include "subvolume.h" +#include "trace.h" + +#include + +static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage, + enum bch_data_type data_type, + s64 sectors) +{ + switch (data_type) { + case BCH_DATA_btree: + fs_usage->btree += sectors; + break; + case BCH_DATA_user: + case BCH_DATA_parity: + fs_usage->data += sectors; + break; + case BCH_DATA_cached: + fs_usage->cached += sectors; + break; + default: + break; + } +} + +void bch2_fs_usage_initialize(struct bch_fs *c) +{ + struct bch_fs_usage *usage; + struct bch_dev *ca; + unsigned i; + + percpu_down_write(&c->mark_lock); + usage = c->usage_base; + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + usage->reserved += usage->persistent_reserved[i]; + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]); + } + + for_each_member_device(ca, c, i) { + struct bch_dev_usage dev = bch2_dev_usage_read(ca); + + usage->hidden += (dev.d[BCH_DATA_sb].buckets + + dev.d[BCH_DATA_journal].buckets) * + ca->mi.bucket_size; + } + + percpu_up_write(&c->mark_lock); +} + +static inline struct bch_dev_usage *dev_usage_ptr(struct bch_dev *ca, + unsigned journal_seq, + bool gc) +{ + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc + ? ca->usage_gc + : ca->usage[journal_seq & JOURNAL_BUF_MASK]); +} + +void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) +{ + struct bch_fs *c = ca->fs; + unsigned seq, i, u64s = dev_usage_u64s(); + + do { + seq = read_seqcount_begin(&c->usage_lock); + memcpy(usage, ca->usage_base, u64s * sizeof(u64)); + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage[i], u64s); + } while (read_seqcount_retry(&c->usage_lock, seq)); +} + +static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c, + unsigned journal_seq, + bool gc) +{ + percpu_rwsem_assert_held(&c->mark_lock); + BUG_ON(!gc && !journal_seq); + + return this_cpu_ptr(gc + ? 
c->usage_gc + : c->usage[journal_seq & JOURNAL_BUF_MASK]); +} + +u64 bch2_fs_usage_read_one(struct bch_fs *c, u64 *v) +{ + ssize_t offset = v - (u64 *) c->usage_base; + unsigned i, seq; + u64 ret; + + BUG_ON(offset < 0 || offset >= fs_usage_u64s(c)); + percpu_rwsem_assert_held(&c->mark_lock); + + do { + seq = read_seqcount_begin(&c->usage_lock); + ret = *v; + + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + ret += percpu_u64_get((u64 __percpu *) c->usage[i] + offset); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +} + +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *c) +{ + struct bch_fs_usage_online *ret; + unsigned nr_replicas = READ_ONCE(c->replicas.nr); + unsigned seq, i; +retry: + ret = kmalloc(__fs_usage_online_u64s(nr_replicas) * sizeof(u64), GFP_KERNEL); + if (unlikely(!ret)) + return NULL; + + percpu_down_read(&c->mark_lock); + + if (nr_replicas != c->replicas.nr) { + nr_replicas = c->replicas.nr; + percpu_up_read(&c->mark_lock); + kfree(ret); + goto retry; + } + + ret->online_reserved = percpu_u64_get(c->online_reserved); + + do { + seq = read_seqcount_begin(&c->usage_lock); + unsafe_memcpy(&ret->u, c->usage_base, + __fs_usage_u64s(nr_replicas) * sizeof(u64), + "embedded variable length struct"); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + acc_u64s_percpu((u64 *) &ret->u, (u64 __percpu *) c->usage[i], + __fs_usage_u64s(nr_replicas)); + } while (read_seqcount_retry(&c->usage_lock, seq)); + + return ret; +} + +void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx) +{ + struct bch_dev *ca; + unsigned i, u64s = fs_usage_u64s(c); + + BUG_ON(idx >= ARRAY_SIZE(c->usage)); + + preempt_disable(); + write_seqcount_begin(&c->usage_lock); + + acc_u64s_percpu((u64 *) c->usage_base, + (u64 __percpu *) c->usage[idx], u64s); + percpu_memset(c->usage[idx], 0, u64s * sizeof(u64)); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) { + u64s = dev_usage_u64s(); + + acc_u64s_percpu((u64 *) ca->usage_base, + (u64 __percpu *) ca->usage[idx], u64s); + percpu_memset(ca->usage[idx], 0, u64s * sizeof(u64)); + } + rcu_read_unlock(); + + write_seqcount_end(&c->usage_lock); + preempt_enable(); +} + +void bch2_fs_usage_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_fs_usage_online *fs_usage) +{ + unsigned i; + + prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity); + + prt_printf(out, "hidden:\t\t\t\t%llu\n", + fs_usage->u.hidden); + prt_printf(out, "data:\t\t\t\t%llu\n", + fs_usage->u.data); + prt_printf(out, "cached:\t\t\t\t%llu\n", + fs_usage->u.cached); + prt_printf(out, "reserved:\t\t\t%llu\n", + fs_usage->u.reserved); + prt_printf(out, "nr_inodes:\t\t\t%llu\n", + fs_usage->u.nr_inodes); + prt_printf(out, "online reserved:\t\t%llu\n", + fs_usage->online_reserved); + + for (i = 0; + i < ARRAY_SIZE(fs_usage->u.persistent_reserved); + i++) { + prt_printf(out, "%u replicas:\n", i + 1); + prt_printf(out, "\treserved:\t\t%llu\n", + fs_usage->u.persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + prt_printf(out, "\t"); + bch2_replicas_entry_to_text(out, e); + prt_printf(out, ":\t%llu\n", fs_usage->u.replicas[i]); + } +} + +static u64 reserve_factor(u64 r) +{ + return r + (round_up(r, (1 << RESERVE_FACTOR)) >> RESERVE_FACTOR); +} + +u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage) +{ + return min(fs_usage->u.hidden + + fs_usage->u.btree + + fs_usage->u.data + + reserve_factor(fs_usage->u.reserved + + 
fs_usage->online_reserved), + c->capacity); +} + +static struct bch_fs_usage_short +__bch2_fs_usage_read_short(struct bch_fs *c) +{ + struct bch_fs_usage_short ret; + u64 data, reserved; + + ret.capacity = c->capacity - + bch2_fs_usage_read_one(c, &c->usage_base->hidden); + + data = bch2_fs_usage_read_one(c, &c->usage_base->data) + + bch2_fs_usage_read_one(c, &c->usage_base->btree); + reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) + + percpu_u64_get(c->online_reserved); + + ret.used = min(ret.capacity, data + reserve_factor(reserved)); + ret.free = ret.capacity - ret.used; + + ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes); + + return ret; +} + +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *c) +{ + struct bch_fs_usage_short ret; + + percpu_down_read(&c->mark_lock); + ret = __bch2_fs_usage_read_short(c); + percpu_up_read(&c->mark_lock); + + return ret; +} + +void bch2_dev_usage_init(struct bch_dev *ca) +{ + ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket; +} + +static inline int bucket_sectors_fragmented(struct bch_dev *ca, + struct bch_alloc_v4 a) +{ + return a.dirty_sectors + ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors) + : 0; +} + +static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca, + struct bch_alloc_v4 old, + struct bch_alloc_v4 new, + u64 journal_seq, bool gc) +{ + struct bch_fs_usage *fs_usage; + struct bch_dev_usage *u; + + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + + if (data_type_is_hidden(old.data_type)) + fs_usage->hidden -= ca->mi.bucket_size; + if (data_type_is_hidden(new.data_type)) + fs_usage->hidden += ca->mi.bucket_size; + + u = dev_usage_ptr(ca, journal_seq, gc); + + u->d[old.data_type].buckets--; + u->d[new.data_type].buckets++; + + u->buckets_ec -= (int) !!old.stripe; + u->buckets_ec += (int) !!new.stripe; + + u->d[old.data_type].sectors -= old.dirty_sectors; + u->d[new.data_type].sectors += new.dirty_sectors; + + u->d[BCH_DATA_cached].sectors += new.cached_sectors; + u->d[BCH_DATA_cached].sectors -= old.cached_sectors; + + u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old); + u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new); + + preempt_enable(); +} + +static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca, + struct bucket old, struct bucket new, + u64 journal_seq, bool gc) +{ + struct bch_alloc_v4 old_a = { + .gen = old.gen, + .data_type = old.data_type, + .dirty_sectors = old.dirty_sectors, + .cached_sectors = old.cached_sectors, + .stripe = old.stripe, + }; + struct bch_alloc_v4 new_a = { + .gen = new.gen, + .data_type = new.data_type, + .dirty_sectors = new.dirty_sectors, + .cached_sectors = new.cached_sectors, + .stripe = new.stripe, + }; + + bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc); +} + +static inline int __update_replicas(struct bch_fs *c, + struct bch_fs_usage *fs_usage, + struct bch_replicas_entry *r, + s64 sectors) +{ + int idx = bch2_replicas_entry_idx(c, r); + + if (idx < 0) + return -1; + + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + return 0; +} + +static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k, + struct bch_replicas_entry *r, s64 sectors, + unsigned journal_seq, bool gc) +{ + struct bch_fs_usage *fs_usage; + int idx, ret = 0; + struct printbuf buf = PRINTBUF; + + percpu_down_read(&c->mark_lock); + buf.atomic++; + + idx = bch2_replicas_entry_idx(c, r); + 
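+ /*
+  * If there's no replicas entry for this key yet, fsck_err() below
+  * offers the repair of marking the entry now (mark_lock is dropped
+  * around bch2_mark_replicas()) and then retrying the lookup:
+  */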
if (idx < 0 && + fsck_err(c, "no replicas entry\n" + " while marking %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + percpu_up_read(&c->mark_lock); + ret = bch2_mark_replicas(c, r); + percpu_down_read(&c->mark_lock); + + if (ret) + goto err; + idx = bch2_replicas_entry_idx(c, r); + } + if (idx < 0) { + ret = -1; + goto err; + } + + preempt_disable(); + fs_usage = fs_usage_ptr(c, journal_seq, gc); + fs_usage_data_type_to_base(fs_usage, r->data_type, sectors); + fs_usage->replicas[idx] += sectors; + preempt_enable(); +err: +fsck_err: + percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); + return ret; +} + +static inline int update_cached_sectors(struct bch_fs *c, + struct bkey_s_c k, + unsigned dev, s64 sectors, + unsigned journal_seq, bool gc) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + return update_replicas(c, k, &r.e, sectors, journal_seq, gc); +} + +static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more, + gfp_t gfp) +{ + struct replicas_delta_list *d = trans->fs_usage_deltas; + unsigned new_size = d ? (d->size + more) * 2 : 128; + unsigned alloc_size = sizeof(*d) + new_size; + + WARN_ON_ONCE(alloc_size > REPLICAS_DELTA_LIST_MAX); + + if (!d || d->used + more > d->size) { + d = krealloc(d, alloc_size, gfp|__GFP_ZERO); + + if (unlikely(!d)) { + if (alloc_size > REPLICAS_DELTA_LIST_MAX) + return -ENOMEM; + + d = mempool_alloc(&trans->c->replicas_delta_pool, gfp); + if (!d) + return -ENOMEM; + + memset(d, 0, REPLICAS_DELTA_LIST_MAX); + + if (trans->fs_usage_deltas) + memcpy(d, trans->fs_usage_deltas, + trans->fs_usage_deltas->size + sizeof(*d)); + + new_size = REPLICAS_DELTA_LIST_MAX - sizeof(*d); + kfree(trans->fs_usage_deltas); + } + + d->size = new_size; + trans->fs_usage_deltas = d; + } + + return 0; +} + +static int replicas_deltas_realloc(struct btree_trans *trans, unsigned more) +{ + return allocate_dropping_locks_errcode(trans, + __replicas_deltas_realloc(trans, more, _gfp)); +} + +static inline int update_replicas_list(struct btree_trans *trans, + struct bch_replicas_entry *r, + s64 sectors) +{ + struct replicas_delta_list *d; + struct replicas_delta *n; + unsigned b; + int ret; + + if (!sectors) + return 0; + + b = replicas_entry_bytes(r) + 8; + ret = replicas_deltas_realloc(trans, b); + if (ret) + return ret; + + d = trans->fs_usage_deltas; + n = (void *) d->d + d->used; + n->delta = sectors; + memcpy((void *) n + offsetof(struct replicas_delta, r), + r, replicas_entry_bytes(r)); + bch2_replicas_entry_sort(&n->r); + d->used += b; + return 0; +} + +static inline int update_cached_sectors_list(struct btree_trans *trans, + unsigned dev, s64 sectors) +{ + struct bch_replicas_padded r; + + bch2_replicas_entry_cached(&r.e, dev); + + return update_replicas_list(trans, &r.e, sectors); +} + +int bch2_mark_alloc(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + u64 bucket_journal_seq; + struct bch_fs *c = trans->c; + struct bch_alloc_v4 old_a_convert, new_a_convert; + const struct bch_alloc_v4 *old_a, *new_a; + struct bch_dev *ca; + int ret = 0; + + /* + * alloc btree is read in by bch2_alloc_read, not gc: + */ + if ((flags & BTREE_TRIGGER_GC) && + !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) + return 0; + + if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans, + "alloc key for invalid device or bucket")) + return -EIO; + + ca = 
bch_dev_bkey_exists(c, new.k->p.inode); + + old_a = bch2_alloc_to_v4(old, &old_a_convert); + new_a = bch2_alloc_to_v4(new, &new_a_convert); + + bucket_journal_seq = new_a->journal_seq; + + if ((flags & BTREE_TRIGGER_INSERT) && + data_type_is_empty(old_a->data_type) != + data_type_is_empty(new_a->data_type) && + new.k->type == KEY_TYPE_alloc_v4) { + struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v; + + EBUG_ON(!journal_seq); + + /* + * If the btree updates referring to a bucket weren't flushed + * before the bucket became empty again, then the we don't have + * to wait on a journal flush before we can reuse the bucket: + */ + v->journal_seq = bucket_journal_seq = + data_type_is_empty(new_a->data_type) && + (journal_seq == v->journal_seq || + bch2_journal_noflush_seq(&c->journal, v->journal_seq)) + ? 0 : journal_seq; + } + + if (!data_type_is_empty(old_a->data_type) && + data_type_is_empty(new_a->data_type) && + bucket_journal_seq) { + ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal, + c->journal.flushed_seq_ondisk, + new.k->p.inode, new.k->p.offset, + bucket_journal_seq); + if (ret) { + bch2_fs_fatal_error(c, + "error setting bucket_needs_journal_commit: %i", ret); + return ret; + } + } + + percpu_down_read(&c->mark_lock); + if (!gc && new_a->gen != old_a->gen) + *bucket_gen(ca, new.k->p.offset) = new_a->gen; + + bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc); + + if (gc) { + struct bucket *g = gc_bucket(ca, new.k->p.offset); + + bucket_lock(g); + + g->gen_valid = 1; + g->gen = new_a->gen; + g->data_type = new_a->data_type; + g->stripe = new_a->stripe; + g->stripe_redundancy = new_a->stripe_redundancy; + g->dirty_sectors = new_a->dirty_sectors; + g->cached_sectors = new_a->cached_sectors; + + bucket_unlock(g); + } + percpu_up_read(&c->mark_lock); + + /* + * need to know if we're getting called from the invalidate path or + * not: + */ + + if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) && + old_a->cached_sectors) { + ret = update_cached_sectors(c, new, ca->dev_idx, + -((s64) old_a->cached_sectors), + journal_seq, gc); + if (ret) { + bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", + __func__); + return ret; + } + } + + if (new_a->data_type == BCH_DATA_free && + (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk)) + closure_wake_up(&c->freelist_wait); + + if (new_a->data_type == BCH_DATA_need_discard && + (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk)) + bch2_do_discards(c); + + if (old_a->data_type != BCH_DATA_cached && + new_a->data_type == BCH_DATA_cached && + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))) + bch2_do_invalidates(c); + + if (new_a->data_type == BCH_DATA_need_gc_gens) + bch2_do_gc_gens(c); + + return 0; +} + +int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca, + size_t b, enum bch_data_type data_type, + unsigned sectors, struct gc_pos pos, + unsigned flags) +{ + struct bucket old, new, *g; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + BUG_ON(data_type != BCH_DATA_sb && + data_type != BCH_DATA_journal); + + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return 0; + + percpu_down_read(&c->mark_lock); + g = gc_bucket(ca, b); + + bucket_lock(g); + old = *g; + + if (bch2_fs_inconsistent_on(g->data_type && + g->data_type != data_type, c, + "different types of data in same bucket: %s, %s", + bch2_data_types[g->data_type], + bch2_data_types[data_type])) { + 
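+ /*
+  * The bucket already holds a different type of data; refuse to mark
+  * it rather than corrupting the usage accounting:
+  */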
ret = -EIO; + goto err; + } + + if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size", + ca->dev_idx, b, g->gen, + bch2_data_types[g->data_type ?: data_type], + g->dirty_sectors, sectors)) { + ret = -EIO; + goto err; + } + + + g->data_type = data_type; + g->dirty_sectors += sectors; + new = *g; +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, 0, true); + percpu_up_read(&c->mark_lock); + return ret; +} + +static int check_bucket_ref(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 b_gen, u8 bucket_data_type, + u32 dirty_sectors, u32 cached_sectors) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + size_t bucket_nr = PTR_BUCKET_NR(ca, ptr); + u16 bucket_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (bucket_data_type == BCH_DATA_cached) + bucket_data_type = BCH_DATA_user; + + if ((bucket_data_type == BCH_DATA_stripe && ptr_data_type == BCH_DATA_user) || + (bucket_data_type == BCH_DATA_user && ptr_data_type == BCH_DATA_stripe)) + bucket_data_type = ptr_data_type = BCH_DATA_stripe; + + if (gen_after(ptr->gen, b_gen)) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } + + if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } + + if (b_gen != ptr->gen && !ptr->cached) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + *bucket_gen(ca, bucket_nr), + bch2_data_types[bucket_data_type ?: ptr_data_type], + ptr->gen, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } + + if (b_gen != ptr->gen) { + ret = 1; + goto out; + } + + if (!data_type_is_empty(bucket_data_type) && + ptr_data_type && + bucket_data_type != ptr_data_type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type], + bch2_data_types[ptr_data_type], + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } + + if ((unsigned) (bucket_sectors + sectors) > U32_MAX) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U16_MAX\n" + "while marking %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_types[bucket_data_type ?: ptr_data_type], + bucket_sectors, sectors, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EIO; + goto err; + } +out: + printbuf_exit(&buf); + return ret; +err: + bch2_dump_trans_updates(trans); + goto 
out; +} + +static int mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c k, + unsigned ptr_idx, + unsigned flags) +{ + struct bch_fs *c = trans->c; + u64 journal_seq = trans->journal_res.seq; + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned nr_data = s->nr_blocks - s->nr_redundant; + bool parity = ptr_idx >= nr_data; + enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe; + s64 sectors = parity ? le16_to_cpu(s->sectors) : 0; + const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + struct bucket old, new, *g; + struct printbuf buf = PRINTBUF; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + /* * XXX doesn't handle deletion */ + + percpu_down_read(&c->mark_lock); + buf.atomic++; + g = PTR_GC_BUCKET(ca, ptr); + + if (g->dirty_sectors || + (g->stripe && g->stripe != k.k->p.offset)) { + bch2_fs_inconsistent(c, + "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s", + ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + ret = -EINVAL; + goto err; + } + + bucket_lock(g); + old = *g; + + ret = check_bucket_ref(trans, k, ptr, sectors, data_type, + g->gen, g->data_type, + g->dirty_sectors, g->cached_sectors); + if (ret) + goto err; + + g->data_type = data_type; + g->dirty_sectors += sectors; + + g->stripe = k.k->p.offset; + g->stripe_redundancy = s->nr_redundant; + new = *g; +err: + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + percpu_up_read(&c->mark_lock); + printbuf_exit(&buf); + return ret; +} + +static int __mark_pointer(struct btree_trans *trans, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + s64 sectors, enum bch_data_type ptr_data_type, + u8 bucket_gen, u8 *bucket_data_type, + u32 *dirty_sectors, u32 *cached_sectors) +{ + u32 *dst_sectors = !ptr->cached + ? dirty_sectors + : cached_sectors; + int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type, + bucket_gen, *bucket_data_type, + *dirty_sectors, *cached_sectors); + + if (ret) + return ret; + + *dst_sectors += sectors; + *bucket_data_type = *dirty_sectors || *cached_sectors + ? 
ptr_data_type : 0; + return 0; +} + +static int bch2_mark_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, + struct extent_ptr_decoded p, + s64 sectors, + unsigned flags) +{ + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev); + struct bucket old, new, *g; + enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p); + u8 bucket_data_type; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + percpu_down_read(&c->mark_lock); + g = PTR_GC_BUCKET(ca, &p.ptr); + bucket_lock(g); + old = *g; + + bucket_data_type = g->data_type; + ret = __mark_pointer(trans, k, &p.ptr, sectors, + data_type, g->gen, + &bucket_data_type, + &g->dirty_sectors, + &g->cached_sectors); + if (!ret) + g->data_type = bucket_data_type; + + new = *g; + bucket_unlock(g); + if (!ret) + bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true); + percpu_up_read(&c->mark_lock); + + return ret; +} + +static int bch2_mark_stripe_ptr(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_stripe_ptr p, + enum bch_data_type data_type, + s64 sectors, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_replicas_padded r; + struct gc_stripe *m; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL); + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + (u64) p.idx); + return -BCH_ERR_ENOMEM_mark_stripe_ptr; + } + + mutex_lock(&c->ec_stripes_heap_lock); + + if (!m || !m->alive) { + mutex_unlock(&c->ec_stripes_heap_lock); + bch_err_ratelimited(c, "pointer to nonexistent stripe %llu", + (u64) p.idx); + bch2_inconsistent_error(c); + return -EIO; + } + + m->block_sectors[p.block] += sectors; + + r = m->r; + mutex_unlock(&c->ec_stripes_heap_lock); + + r.e.data_type = data_type; + update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true); + + return 0; +} + +int bch2_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? old : new; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; + enum bch_data_type data_type = bkey_is_btree_ptr(k.k) + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) + ? 
btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; + int ret; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = ptr_disk_sectors(sectors, p); + + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; + + ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags); + if (ret < 0) + return ret; + + stale = ret > 0; + + if (p.ptr.cached) { + if (!stale) { + ret = update_cached_sectors(c, k, p.ptr.dev, + disk_sectors, journal_seq, true); + if (ret) { + bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors", + __func__); + return ret; + } + } + } else if (!p.has_ec) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { + ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type, + disk_sectors, flags); + if (ret) + return ret; + + /* + * There may be other dirty pointers in this extent, but + * if so they're not required for mounting if we have an + * erasure coded pointer in this extent: + */ + r.e.nr_required = 0; + } + } + + if (r.e.nr_devs) { + ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true); + if (ret) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf); + printbuf_exit(&buf); + return ret; + } + } + + return 0; +} + +int bch2_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + bool gc = flags & BTREE_TRIGGER_GC; + u64 journal_seq = trans->journal_res.seq; + struct bch_fs *c = trans->c; + u64 idx = new.k->p.offset; + const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe + ? bkey_s_c_to_stripe(old).v : NULL; + const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe + ? 
bkey_s_c_to_stripe(new).v : NULL; + unsigned i; + int ret; + + BUG_ON(gc && old_s); + + if (!gc) { + struct stripe *m = genradix_ptr(&c->stripes, idx); + + if (!m) { + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + + bch2_bkey_val_to_text(&buf1, c, old); + bch2_bkey_val_to_text(&buf2, c, new); + bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" + "old %s\n" + "new %s", idx, buf1.buf, buf2.buf); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + bch2_inconsistent_error(c); + return -1; + } + + if (!new_s) { + bch2_stripes_heap_del(c, m, idx); + + memset(m, 0, sizeof(*m)); + } else { + m->sectors = le16_to_cpu(new_s->sectors); + m->algorithm = new_s->algorithm; + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + m->blocks_nonempty = 0; + + for (i = 0; i < new_s->nr_blocks; i++) + m->blocks_nonempty += !!stripe_blockcount_get(new_s, i); + + if (!old_s) + bch2_stripes_heap_insert(c, m, idx); + else + bch2_stripes_heap_update(c, m, idx); + } + } else { + struct gc_stripe *m = + genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); + + if (!m) { + bch_err(c, "error allocating memory for gc_stripes, idx %llu", + idx); + return -BCH_ERR_ENOMEM_mark_stripe; + } + /* + * This will be wrong when we bring back runtime gc: we should + * be unmarking the old key and then marking the new key + */ + m->alive = true; + m->sectors = le16_to_cpu(new_s->sectors); + m->nr_blocks = new_s->nr_blocks; + m->nr_redundant = new_s->nr_redundant; + + for (i = 0; i < new_s->nr_blocks; i++) + m->ptrs[i] = new_s->ptrs[i]; + + bch2_bkey_to_replicas(&m->r.e, new); + + /* + * gc recalculates this field from stripe ptr + * references: + */ + memset(m->block_sectors, 0, sizeof(m->block_sectors)); + + for (i = 0; i < new_s->nr_blocks; i++) { + ret = mark_stripe_bucket(trans, new, i, flags); + if (ret) + return ret; + } + + ret = update_replicas(c, new, &m->r.e, + ((s64) m->sectors * m->nr_redundant), + journal_seq, gc); + if (ret) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, new); + bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf); + printbuf_exit(&buf); + return ret; + } + } + + return 0; +} + +int bch2_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *fs_usage; + u64 journal_seq = trans->journal_res.seq; + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v; + + BUG_ON(!journal_seq); + BUG_ON(new.k->type != KEY_TYPE_inode_v3); + + v->bi_journal_seq = cpu_to_le64(journal_seq); + } + + if (flags & BTREE_TRIGGER_GC) { + percpu_down_read(&c->mark_lock); + preempt_disable(); + + fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC); + fs_usage->nr_inodes += bkey_is_inode(new.k); + fs_usage->nr_inodes -= bkey_is_inode(old.k); + + preempt_enable(); + percpu_up_read(&c->mark_lock); + } + return 0; +} + +int bch2_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old : new; + struct bch_fs_usage *fs_usage; + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + + fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC); + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(fs_usage->persistent_reserved)); + + fs_usage->reserved += sectors; + fs_usage->persistent_reserved[replicas - 1] += sectors; + + preempt_enable(); + percpu_up_read(&c->mark_lock); + + return 0; +} + +static s64 __bch2_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 start, u64 end, + u64 *idx, unsigned flags, size_t r_idx) +{ + struct bch_fs *c = trans->c; + struct reflink_gc *r; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1; + u64 next_idx = end; + s64 ret = 0; + struct printbuf buf = PRINTBUF; + + if (r_idx >= c->reflink_gc_nr) + goto not_found; + + r = genradix_ptr(&c->reflink_gc_table, r_idx); + next_idx = min(next_idx, r->offset - r->size); + if (*idx < next_idx) + goto not_found; + + BUG_ON((s64) r->refcount + add < 0); + + r->refcount += add; + *idx = r->offset; + return 0; +not_found: + if (fsck_err(c, "pointer to missing indirect extent\n" + " %s\n" + " missing range %llu-%llu", + (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), + *idx, next_idx)) { + struct bkey_i_error *new; + + new = bch2_trans_kmalloc(trans, sizeof(*new)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + bkey_init(&new->k); + new->k.type = KEY_TYPE_error; + new->k.p = bkey_start_pos(p.k); + new->k.p.offset += *idx - start; + bch2_key_resize(&new->k, next_idx - *idx); + ret = __bch2_btree_insert(trans, BTREE_ID_extents, &new->k_i, + BTREE_TRIGGER_NORUN); + } + + *idx = next_idx; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE ? 
old : new; + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + struct reflink_gc *ref; + size_t l, r, m; + u64 idx = le64_to_cpu(p.v->idx), start = idx; + u64 end = le64_to_cpu(p.v->idx) + p.k->size; + int ret = 0; + + BUG_ON(!(flags & BTREE_TRIGGER_GC)); + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) { + idx -= le32_to_cpu(p.v->front_pad); + end += le32_to_cpu(p.v->back_pad); + } + + l = 0; + r = c->reflink_gc_nr; + while (l < r) { + m = l + (r - l) / 2; + + ref = genradix_ptr(&c->reflink_gc_table, m); + if (ref->offset <= idx) + l = m + 1; + else + r = m; + } + + while (idx < end && !ret) + ret = __bch2_mark_reflink_p(trans, p, start, end, + &idx, flags, l++); + + return ret; +} + +void bch2_trans_fs_usage_revert(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + struct bch_fs_usage *dst; + struct replicas_delta *d, *top = (void *) deltas->d + deltas->used; + s64 added = 0; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + /* revert changes: */ + for (d = deltas->d; d != top; d = replicas_delta_next(d)) { + switch (d->r.data_type) { + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + added += d->delta; + } + BUG_ON(__update_replicas(c, dst, &d->r, -d->delta)); + } + + dst->nr_inodes -= deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + added -= deltas->persistent_reserved[i]; + dst->reserved -= deltas->persistent_reserved[i]; + dst->persistent_reserved[i] -= deltas->persistent_reserved[i]; + } + + if (added > 0) { + trans->disk_res->sectors += added; + this_cpu_add(*c->online_reserved, added); + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); +} + +int bch2_trans_fs_usage_apply(struct btree_trans *trans, + struct replicas_delta_list *deltas) +{ + struct bch_fs *c = trans->c; + static int warned_disk_usage = 0; + bool warn = false; + unsigned disk_res_sectors = trans->disk_res ? 
trans->disk_res->sectors : 0; + struct replicas_delta *d = deltas->d, *d2; + struct replicas_delta *top = (void *) deltas->d + deltas->used; + struct bch_fs_usage *dst; + s64 added = 0, should_not_have_added; + unsigned i; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + dst = fs_usage_ptr(c, trans->journal_res.seq, false); + + for (d = deltas->d; d != top; d = replicas_delta_next(d)) { + switch (d->r.data_type) { + case BCH_DATA_btree: + case BCH_DATA_user: + case BCH_DATA_parity: + added += d->delta; + } + + if (__update_replicas(c, dst, &d->r, d->delta)) + goto need_mark; + } + + dst->nr_inodes += deltas->nr_inodes; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + added += deltas->persistent_reserved[i]; + dst->reserved += deltas->persistent_reserved[i]; + dst->persistent_reserved[i] += deltas->persistent_reserved[i]; + } + + /* + * Not allowed to reduce sectors_available except by getting a + * reservation: + */ + should_not_have_added = added - (s64) disk_res_sectors; + if (unlikely(should_not_have_added > 0)) { + u64 old, new, v = atomic64_read(&c->sectors_available); + + do { + old = v; + new = max_t(s64, 0, old - should_not_have_added); + } while ((v = atomic64_cmpxchg(&c->sectors_available, + old, new)) != old); + + added -= should_not_have_added; + warn = true; + } + + if (added > 0) { + trans->disk_res->sectors -= added; + this_cpu_sub(*c->online_reserved, added); + } + + preempt_enable(); + percpu_up_read(&c->mark_lock); + + if (unlikely(warn) && !xchg(&warned_disk_usage, 1)) + bch2_trans_inconsistent(trans, + "disk usage increased %lli more than %u sectors reserved)", + should_not_have_added, disk_res_sectors); + return 0; +need_mark: + /* revert changes: */ + for (d2 = deltas->d; d2 != d; d2 = replicas_delta_next(d2)) + BUG_ON(__update_replicas(c, dst, &d2->r, -d2->delta)); + + preempt_enable(); + percpu_up_read(&c->mark_lock); + return -1; +} + +/* trans_mark: */ + +static inline int bch2_trans_mark_pointer(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c k, struct extent_ptr_decoded p, + unsigned flags) +{ + bool insert = !(flags & BTREE_TRIGGER_OVERWRITE); + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + struct bpos bucket; + struct bch_backpointer bp; + s64 sectors; + int ret; + + bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp); + sectors = bp.bucket_len; + if (!insert) + sectors = -sectors; + + a = bch2_trans_start_alloc_update(trans, &iter, bucket); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type, + a->v.gen, &a->v.data_type, + &a->v.dirty_sectors, &a->v.cached_sectors) ?: + bch2_trans_update(trans, &iter, &a->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + if (!p.ptr.cached) { + ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert); + if (ret) + return ret; + } + + return 0; +} + +static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans, + struct extent_ptr_decoded p, + s64 sectors, enum bch_data_type data_type) +{ + struct btree_iter iter; + struct bkey_i_stripe *s; + struct bch_replicas_padded r; + int ret = 0; + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_stripes, POS(0, p.ec.idx), + BTREE_ITER_WITH_UPDATES, stripe); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans, + "pointer to nonexistent stripe %llu", + (u64) p.ec.idx); + goto err; + } + + if (!bch2_ptr_matches_stripe(&s->v, p)) { + 
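+ /*
+  * The extent's pointer doesn't line up with any block of this
+  * stripe; flag the inconsistency instead of adjusting block counts:
+  */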
bch2_trans_inconsistent(trans, + "stripe pointer doesn't match stripe %llu", + (u64) p.ec.idx); + ret = -EIO; + goto err; + } + + stripe_blockcount_set(&s->v, p.ec.block, + stripe_blockcount_get(&s->v, p.ec.block) + + sectors); + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i)); + r.e.data_type = data_type; + ret = update_replicas_list(trans, &r.e, sectors); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_trans_mark_extent(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_replicas_padded r; + enum bch_data_type data_type = bkey_is_btree_ptr(k.k) + ? BCH_DATA_btree + : BCH_DATA_user; + s64 sectors = bkey_is_btree_ptr(k.k) + ? btree_sectors(c) + : k.k->size; + s64 dirty_sectors = 0; + bool stale; + int ret = 0; + + r.e.data_type = data_type; + r.e.nr_devs = 0; + r.e.nr_required = 1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + s64 disk_sectors = ptr_disk_sectors(sectors, p); + + if (flags & BTREE_TRIGGER_OVERWRITE) + disk_sectors = -disk_sectors; + + ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags); + if (ret < 0) + return ret; + + stale = ret > 0; + + if (p.ptr.cached) { + if (!stale) { + ret = update_cached_sectors_list(trans, p.ptr.dev, + disk_sectors); + if (ret) + return ret; + } + } else if (!p.has_ec) { + dirty_sectors += disk_sectors; + r.e.devs[r.e.nr_devs++] = p.ptr.dev; + } else { + ret = bch2_trans_mark_stripe_ptr(trans, p, + disk_sectors, data_type); + if (ret) + return ret; + + r.e.nr_required = 0; + } + } + + if (r.e.nr_devs) + ret = update_replicas_list(trans, &r.e, dirty_sectors); + + return ret; +} + +static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans, + struct bkey_s_c_stripe s, + unsigned idx, bool deleting) +{ + struct bch_fs *c = trans->c; + const struct bch_extent_ptr *ptr = &s.v->ptrs[idx]; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant + ? BCH_DATA_parity : 0; + s64 sectors = data_type ? 
le16_to_cpu(s.v->sectors) : 0; + int ret = 0; + + if (deleting) + sectors = -sectors; + + a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr)); + if (IS_ERR(a)) + return PTR_ERR(a); + + ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type, + a->v.gen, a->v.data_type, + a->v.dirty_sectors, a->v.cached_sectors); + if (ret) + goto err; + + if (!deleting) { + if (bch2_trans_inconsistent_on(a->v.stripe || + a->v.stripe_redundancy, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, + a->v.stripe, s.k->p.offset)) { + ret = -EIO; + goto err; + } + + if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans, + "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + a->v.dirty_sectors, + s.k->p.offset)) { + ret = -EIO; + goto err; + } + + a->v.stripe = s.k->p.offset; + a->v.stripe_redundancy = s.v->nr_redundant; + a->v.data_type = BCH_DATA_stripe; + } else { + if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset || + a->v.stripe_redundancy != s.v->nr_redundant, trans, + "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)", + iter.pos.inode, iter.pos.offset, a->v.gen, + s.k->p.offset, a->v.stripe)) { + ret = -EIO; + goto err; + } + + a->v.stripe = 0; + a->v.stripe_redundancy = 0; + a->v.data_type = alloc_data_type(a->v, BCH_DATA_user); + } + + a->v.dirty_sectors += sectors; + if (data_type) + a->v.data_type = !deleting ? data_type : 0; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_trans_mark_stripe(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + const struct bch_stripe *old_s = NULL; + struct bch_stripe *new_s = NULL; + struct bch_replicas_padded r; + unsigned i, nr_blocks; + int ret = 0; + + if (old.k->type == KEY_TYPE_stripe) + old_s = bkey_s_c_to_stripe(old).v; + if (new->k.type == KEY_TYPE_stripe) + new_s = &bkey_i_to_stripe(new)->v; + + /* + * If the pointers aren't changing, we don't need to do anything: + */ + if (new_s && old_s && + new_s->nr_blocks == old_s->nr_blocks && + new_s->nr_redundant == old_s->nr_redundant && + !memcmp(old_s->ptrs, new_s->ptrs, + new_s->nr_blocks * sizeof(struct bch_extent_ptr))) + return 0; + + BUG_ON(new_s && old_s && + (new_s->nr_blocks != old_s->nr_blocks || + new_s->nr_redundant != old_s->nr_redundant)); + + nr_blocks = new_s ? 
new_s->nr_blocks : old_s->nr_blocks; + + if (new_s) { + s64 sectors = le16_to_cpu(new_s->sectors); + + bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new)); + ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant); + if (ret) + return ret; + } + + if (old_s) { + s64 sectors = -((s64) le16_to_cpu(old_s->sectors)); + + bch2_bkey_to_replicas(&r.e, old); + ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant); + if (ret) + return ret; + } + + for (i = 0; i < nr_blocks; i++) { + if (new_s && old_s && + !memcmp(&new_s->ptrs[i], + &old_s->ptrs[i], + sizeof(new_s->ptrs[i]))) + continue; + + if (new_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_i_to_s_c_stripe(new), i, false); + if (ret) + break; + } + + if (old_s) { + ret = bch2_trans_mark_stripe_bucket(trans, + bkey_s_c_to_stripe(old), i, true); + if (ret) + break; + } + } + + return ret; +} + +int bch2_trans_mark_inode(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k); + + if (nr) { + int ret = replicas_deltas_realloc(trans, 0); + struct replicas_delta_list *d = trans->fs_usage_deltas; + + if (ret) + return ret; + + d->nr_inodes += nr; + } + + return 0; +} + +int bch2_trans_mark_reservation(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); + unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas; + s64 sectors = (s64) k.k->size; + struct replicas_delta_list *d; + int ret; + + if (flags & BTREE_TRIGGER_OVERWRITE) + sectors = -sectors; + sectors *= replicas; + + ret = replicas_deltas_realloc(trans, 0); + if (ret) + return ret; + + d = trans->fs_usage_deltas; + replicas = clamp_t(unsigned, replicas, 1, + ARRAY_SIZE(d->persistent_reserved)); + + d->persistent_reserved[replicas - 1] += sectors; + return 0; +} + +static int __bch2_trans_mark_reflink_p(struct btree_trans *trans, + struct bkey_s_c_reflink_p p, + u64 *idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i *k; + __le64 *refcount; + int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 
1 : -1; + struct printbuf buf = PRINTBUF; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, + BTREE_ID_reflink, POS(0, *idx), + BTREE_ITER_WITH_UPDATES); + ret = PTR_ERR_OR_ZERO(k); + if (ret) + goto err; + + refcount = bkey_refcount(k); + if (!refcount) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "nonexistent indirect extent at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) { + bch2_bkey_val_to_text(&buf, c, p.s_c); + bch2_trans_inconsistent(trans, + "indirect extent refcount underflow at %llu while marking\n %s", + *idx, buf.buf); + ret = -EIO; + goto err; + } + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + u64 pad; + + pad = max_t(s64, le32_to_cpu(v->front_pad), + le64_to_cpu(v->idx) - bkey_start_offset(&k->k)); + BUG_ON(pad > U32_MAX); + v->front_pad = cpu_to_le32(pad); + + pad = max_t(s64, le32_to_cpu(v->back_pad), + k->k.p.offset - p.k->size - le64_to_cpu(v->idx)); + BUG_ON(pad > U32_MAX); + v->back_pad = cpu_to_le32(pad); + } + + le64_add_cpu(refcount, add); + + bch2_btree_iter_set_pos_to_extent_start(&iter); + ret = bch2_trans_update(trans, &iter, k, 0); + if (ret) + goto err; + + *idx = k->k.p.offset; +err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +} + +int bch2_trans_mark_reflink_p(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, + struct bkey_i *new, + unsigned flags) +{ + struct bkey_s_c k = flags & BTREE_TRIGGER_OVERWRITE + ? old + : bkey_i_to_s_c(new); + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx, end_idx; + int ret = 0; + + if (flags & BTREE_TRIGGER_INSERT) { + struct bch_reflink_p *v = (struct bch_reflink_p *) p.v; + + v->front_pad = v->back_pad = 0; + } + + idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad); + end_idx = le64_to_cpu(p.v->idx) + p.k->size + + le32_to_cpu(p.v->back_pad); + + while (idx < end_idx && !ret) + ret = __bch2_trans_mark_reflink_p(trans, p, &idx, flags); + + return ret; +} + +static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_alloc_v4 *a; + int ret = 0; + + /* + * Backup superblock might be past the end of our normal usable space: + */ + if (b >= ca->mi.nbuckets) + return 0; + + a = bch2_trans_start_alloc_update(trans, &iter, POS(ca->dev_idx, b)); + if (IS_ERR(a)) + return PTR_ERR(a); + + if (a->v.data_type && type && a->v.data_type != type) { + bch2_fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, + "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_types[a->v.data_type], + bch2_data_types[type], + bch2_data_types[type]); + ret = -EIO; + goto out; + } + + a->v.data_type = type; + a->v.dirty_sectors = sectors; + + ret = bch2_trans_update(trans, &iter, &a->k_i, 0); + if (ret) + goto out; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, + struct bch_dev *ca, size_t b, + enum bch_data_type type, + unsigned sectors) +{ + return commit_do(trans, NULL, NULL, 0, + __bch2_trans_mark_metadata_bucket(trans, ca, b, type, sectors)); +} + +static int bch2_trans_mark_metadata_sectors(struct btree_trans *trans, + struct bch_dev *ca, + u64 
start, u64 end, + enum bch_data_type type, + u64 *bucket, unsigned *bucket_sectors) +{ + do { + u64 b = sector_to_bucket(ca, start); + unsigned sectors = + min_t(u64, bucket_to_sector(ca, b + 1), end) - start; + + if (b != *bucket && *bucket_sectors) { + int ret = bch2_trans_mark_metadata_bucket(trans, ca, *bucket, + type, *bucket_sectors); + if (ret) + return ret; + + *bucket_sectors = 0; + } + + *bucket = b; + *bucket_sectors += sectors; + start += sectors; + } while (start < end); + + return 0; +} + +static int __bch2_trans_mark_dev_sb(struct btree_trans *trans, + struct bch_dev *ca) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 bucket = 0; + unsigned i, bucket_sectors = 0; + int ret; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset == BCH_SB_SECTOR) { + ret = bch2_trans_mark_metadata_sectors(trans, ca, + 0, BCH_SB_SECTOR, + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + ret = bch2_trans_mark_metadata_sectors(trans, ca, offset, + offset + (1 << layout->sb_max_size_bits), + BCH_DATA_sb, &bucket, &bucket_sectors); + if (ret) + return ret; + } + + if (bucket_sectors) { + ret = bch2_trans_mark_metadata_bucket(trans, ca, + bucket, BCH_DATA_sb, bucket_sectors); + if (ret) + return ret; + } + + for (i = 0; i < ca->journal.nr; i++) { + ret = bch2_trans_mark_metadata_bucket(trans, ca, + ca->journal.buckets[i], + BCH_DATA_journal, ca->mi.bucket_size); + if (ret) + return ret; + } + + return 0; +} + +int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca) +{ + int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(&trans, ca)); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* Disk reservations: */ + +#define SECTORS_CACHE 1024 + +int __bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + u64 sectors, int flags) +{ + struct bch_fs_pcpu *pcpu; + u64 old, v, get; + s64 sectors_available; + int ret; + + percpu_down_read(&c->mark_lock); + preempt_disable(); + pcpu = this_cpu_ptr(c->pcpu); + + if (sectors <= pcpu->sectors_available) + goto out; + + v = atomic64_read(&c->sectors_available); + do { + old = v; + get = min((u64) sectors + SECTORS_CACHE, old); + + if (get < sectors) { + preempt_enable(); + goto recalculate; + } + } while ((v = atomic64_cmpxchg(&c->sectors_available, + old, old - get)) != old); + + pcpu->sectors_available += get; + +out: + pcpu->sectors_available -= sectors; + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + + preempt_enable(); + percpu_up_read(&c->mark_lock); + return 0; + +recalculate: + mutex_lock(&c->sectors_available_lock); + + percpu_u64_set(&c->pcpu->sectors_available, 0); + sectors_available = avail_factor(__bch2_fs_usage_read_short(c).free); + + if (sectors <= sectors_available || + (flags & BCH_DISK_RESERVATION_NOFAIL)) { + atomic64_set(&c->sectors_available, + max_t(s64, 0, sectors_available - sectors)); + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + ret = 0; + } else { + atomic64_set(&c->sectors_available, sectors_available); + ret = -BCH_ERR_ENOSPC_disk_reservation; + } + + mutex_unlock(&c->sectors_available_lock); + percpu_up_read(&c->mark_lock); + + return ret; +} + +/* Startup/shutdown: */ + +static void bucket_gens_free_rcu(struct rcu_head *rcu) +{ + struct bucket_gens *buckets = + container_of(rcu, struct bucket_gens, rcu); + + kvpfree(buckets, sizeof(*buckets) + buckets->nbuckets); +} + +int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 
nbuckets) +{ + struct bucket_gens *bucket_gens = NULL, *old_bucket_gens = NULL; + unsigned long *buckets_nouse = NULL; + bool resize = ca->bucket_gens != NULL; + int ret; + + if (!(bucket_gens = kvpmalloc(sizeof(struct bucket_gens) + nbuckets, + GFP_KERNEL|__GFP_ZERO))) { + ret = -BCH_ERR_ENOMEM_bucket_gens; + goto err; + } + + if ((c->opts.buckets_nouse && + !(buckets_nouse = kvpmalloc(BITS_TO_LONGS(nbuckets) * + sizeof(unsigned long), + GFP_KERNEL|__GFP_ZERO)))) { + ret = -BCH_ERR_ENOMEM_buckets_nouse; + goto err; + } + + bucket_gens->first_bucket = ca->mi.first_bucket; + bucket_gens->nbuckets = nbuckets; + + bch2_copygc_stop(c); + + if (resize) { + down_write(&c->gc_lock); + down_write(&ca->bucket_lock); + percpu_down_write(&c->mark_lock); + } + + old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); + + if (resize) { + size_t n = min(bucket_gens->nbuckets, old_bucket_gens->nbuckets); + + memcpy(bucket_gens->b, + old_bucket_gens->b, + n); + if (buckets_nouse) + memcpy(buckets_nouse, + ca->buckets_nouse, + BITS_TO_LONGS(n) * sizeof(unsigned long)); + } + + rcu_assign_pointer(ca->bucket_gens, bucket_gens); + bucket_gens = old_bucket_gens; + + swap(ca->buckets_nouse, buckets_nouse); + + nbuckets = ca->mi.nbuckets; + + if (resize) { + percpu_up_write(&c->mark_lock); + up_write(&ca->bucket_lock); + up_write(&c->gc_lock); + } + + ret = 0; +err: + kvpfree(buckets_nouse, + BITS_TO_LONGS(nbuckets) * sizeof(unsigned long)); + if (bucket_gens) + call_rcu(&bucket_gens->rcu, bucket_gens_free_rcu); + + return ret; +} + +void bch2_dev_buckets_free(struct bch_dev *ca) +{ + unsigned i; + + kvpfree(ca->buckets_nouse, + BITS_TO_LONGS(ca->mi.nbuckets) * sizeof(unsigned long)); + kvpfree(rcu_dereference_protected(ca->bucket_gens, 1), + sizeof(struct bucket_gens) + ca->mi.nbuckets); + + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) + free_percpu(ca->usage[i]); + kfree(ca->usage_base); +} + +int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned i; + + ca->usage_base = kzalloc(sizeof(struct bch_dev_usage), GFP_KERNEL); + if (!ca->usage_base) + return -BCH_ERR_ENOMEM_usage_init; + + for (i = 0; i < ARRAY_SIZE(ca->usage); i++) { + ca->usage[i] = alloc_percpu(struct bch_dev_usage); + if (!ca->usage[i]) + return -BCH_ERR_ENOMEM_usage_init; + } + + return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); +} diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h new file mode 100644 index 000000000..f9d7dda07 --- /dev/null +++ b/fs/bcachefs/buckets.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Code for manipulating bucket marks for garbage collection. + * + * Copyright 2014 Datera, Inc. 
+ */ + +#ifndef _BUCKETS_H +#define _BUCKETS_H + +#include "buckets_types.h" +#include "extents.h" +#include "super.h" + +#define for_each_bucket(_b, _buckets) \ + for (_b = (_buckets)->b + (_buckets)->first_bucket; \ + _b < (_buckets)->b + (_buckets)->nbuckets; _b++) + +static inline void bucket_unlock(struct bucket *b) +{ + smp_store_release(&b->lock, 0); +} + +static inline void bucket_lock(struct bucket *b) +{ + while (xchg(&b->lock, 1)) + cpu_relax(); +} + +static inline struct bucket_array *gc_bucket_array(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->buckets_gc, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); +} + +static inline struct bucket *gc_bucket(struct bch_dev *ca, size_t b) +{ + struct bucket_array *buckets = gc_bucket_array(ca); + + BUG_ON(b < buckets->first_bucket || b >= buckets->nbuckets); + return buckets->b + b; +} + +static inline struct bucket_gens *bucket_gens(struct bch_dev *ca) +{ + return rcu_dereference_check(ca->bucket_gens, + !ca->fs || + percpu_rwsem_is_held(&ca->fs->mark_lock) || + lockdep_is_held(&ca->fs->gc_lock) || + lockdep_is_held(&ca->bucket_lock)); +} + +static inline u8 *bucket_gen(struct bch_dev *ca, size_t b) +{ + struct bucket_gens *gens = bucket_gens(ca); + + BUG_ON(b < gens->first_bucket || b >= gens->nbuckets); + return gens->b + b; +} + +static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return sector_to_bucket(ca, ptr->offset); +} + +static inline struct bpos PTR_BUCKET_POS(const struct bch_fs *c, + const struct bch_extent_ptr *ptr) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, PTR_BUCKET_NR(ca, ptr)); +} + +static inline struct bpos PTR_BUCKET_POS_OFFSET(const struct bch_fs *c, + const struct bch_extent_ptr *ptr, + u32 *bucket_offset) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + + return POS(ptr->dev, sector_to_bucket_and_offset(ca, ptr->offset, bucket_offset)); +} + +static inline struct bucket *PTR_GC_BUCKET(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + return gc_bucket(ca, PTR_BUCKET_NR(ca, ptr)); +} + +static inline enum bch_data_type ptr_data_type(const struct bkey *k, + const struct bch_extent_ptr *ptr) +{ + if (bkey_is_btree_ptr(k)) + return BCH_DATA_btree; + + return ptr->cached ? BCH_DATA_cached : BCH_DATA_user; +} + +static inline s64 ptr_disk_sectors(s64 sectors, struct extent_ptr_decoded p) +{ + EBUG_ON(sectors < 0); + + return crc_is_compressed(p.crc) + ? DIV_ROUND_UP_ULL(sectors * p.crc.compressed_size, + p.crc.uncompressed_size) + : sectors; +} + +static inline int gen_cmp(u8 a, u8 b) +{ + return (s8) (a - b); +} + +static inline int gen_after(u8 a, u8 b) +{ + int r = gen_cmp(a, b); + + return r > 0 ? r : 0; +} + +/** + * ptr_stale() - check if a pointer points into a bucket that has been + * invalidated. 
+ */ +static inline u8 ptr_stale(struct bch_dev *ca, + const struct bch_extent_ptr *ptr) +{ + u8 ret; + + rcu_read_lock(); + ret = gen_after(*bucket_gen(ca, PTR_BUCKET_NR(ca, ptr)), ptr->gen); + rcu_read_unlock(); + + return ret; +} + +/* Device usage: */ + +void bch2_dev_usage_read_fast(struct bch_dev *, struct bch_dev_usage *); +static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) +{ + struct bch_dev_usage ret; + + bch2_dev_usage_read_fast(ca, &ret); + return ret; +} + +void bch2_dev_usage_init(struct bch_dev *); + +static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) +{ + s64 reserved = 0; + + switch (watermark) { + case BCH_WATERMARK_NR: + unreachable(); + case BCH_WATERMARK_stripe: + reserved += ca->mi.nbuckets >> 6; + fallthrough; + case BCH_WATERMARK_normal: + reserved += ca->mi.nbuckets >> 6; + fallthrough; + case BCH_WATERMARK_copygc: + reserved += ca->nr_btree_reserve; + fallthrough; + case BCH_WATERMARK_btree: + reserved += ca->nr_btree_reserve; + fallthrough; + case BCH_WATERMARK_btree_copygc: + case BCH_WATERMARK_reclaim: + break; + } + + return reserved; +} + +static inline u64 dev_buckets_free(struct bch_dev *ca, + struct bch_dev_usage usage, + enum bch_watermark watermark) +{ + return max_t(s64, 0, + usage.d[BCH_DATA_free].buckets - + ca->nr_open_buckets - + bch2_dev_buckets_reserved(ca, watermark)); +} + +static inline u64 __dev_buckets_available(struct bch_dev *ca, + struct bch_dev_usage usage, + enum bch_watermark watermark) +{ + return max_t(s64, 0, + usage.d[BCH_DATA_free].buckets + + usage.d[BCH_DATA_cached].buckets + + usage.d[BCH_DATA_need_gc_gens].buckets + + usage.d[BCH_DATA_need_discard].buckets + - ca->nr_open_buckets + - bch2_dev_buckets_reserved(ca, watermark)); +} + +static inline u64 dev_buckets_available(struct bch_dev *ca, + enum bch_watermark watermark) +{ + return __dev_buckets_available(ca, bch2_dev_usage_read(ca), watermark); +} + +/* Filesystem usage: */ + +static inline unsigned __fs_usage_u64s(unsigned nr_replicas) +{ + return sizeof(struct bch_fs_usage) / sizeof(u64) + nr_replicas; +} + +static inline unsigned fs_usage_u64s(struct bch_fs *c) +{ + return __fs_usage_u64s(READ_ONCE(c->replicas.nr)); +} + +static inline unsigned __fs_usage_online_u64s(unsigned nr_replicas) +{ + return sizeof(struct bch_fs_usage_online) / sizeof(u64) + nr_replicas; +} + +static inline unsigned fs_usage_online_u64s(struct bch_fs *c) +{ + return __fs_usage_online_u64s(READ_ONCE(c->replicas.nr)); +} + +static inline unsigned dev_usage_u64s(void) +{ + return sizeof(struct bch_dev_usage) / sizeof(u64); +} + +u64 bch2_fs_usage_read_one(struct bch_fs *, u64 *); + +struct bch_fs_usage_online *bch2_fs_usage_read(struct bch_fs *); + +void bch2_fs_usage_acc_to_base(struct bch_fs *, unsigned); + +void bch2_fs_usage_to_text(struct printbuf *, + struct bch_fs *, struct bch_fs_usage_online *); + +u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *); + +struct bch_fs_usage_short +bch2_fs_usage_read_short(struct bch_fs *); + +/* key/bucket marking: */ + +void bch2_fs_usage_initialize(struct bch_fs *); + +int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *, + size_t, enum bch_data_type, unsigned, + struct gc_pos, unsigned); + +int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_stripe(struct btree_trans *, 
enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); +int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + +int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); +int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned); + +void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *); +int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *); + +int bch2_trans_mark_metadata_bucket(struct btree_trans *, struct bch_dev *, + size_t, enum bch_data_type, unsigned); +int bch2_trans_mark_dev_sb(struct bch_fs *, struct bch_dev *); + +/* disk reservations: */ + +static inline void bch2_disk_reservation_put(struct bch_fs *c, + struct disk_reservation *res) +{ + if (res->sectors) { + this_cpu_sub(*c->online_reserved, res->sectors); + res->sectors = 0; + } +} + +#define BCH_DISK_RESERVATION_NOFAIL (1 << 0) + +int __bch2_disk_reservation_add(struct bch_fs *, + struct disk_reservation *, + u64, int); + +static inline int bch2_disk_reservation_add(struct bch_fs *c, struct disk_reservation *res, + u64 sectors, int flags) +{ +#ifdef __KERNEL__ + u64 old, new; + + do { + old = this_cpu_read(c->pcpu->sectors_available); + if (sectors > old) + return __bch2_disk_reservation_add(c, res, sectors, flags); + + new = old - sectors; + } while (this_cpu_cmpxchg(c->pcpu->sectors_available, old, new) != old); + + this_cpu_add(*c->online_reserved, sectors); + res->sectors += sectors; + return 0; +#else + return __bch2_disk_reservation_add(c, res, sectors, flags); +#endif +} + +static inline struct disk_reservation +bch2_disk_reservation_init(struct bch_fs *c, unsigned nr_replicas) +{ + return (struct disk_reservation) { + .sectors = 0, +#if 0 + /* not used yet: */ + .gen = c->capacity_gen, +#endif + .nr_replicas = nr_replicas, + }; +} + +static inline int bch2_disk_reservation_get(struct bch_fs *c, + struct disk_reservation *res, + u64 sectors, unsigned nr_replicas, + int flags) +{ + *res = bch2_disk_reservation_init(c, nr_replicas); + + return bch2_disk_reservation_add(c, res, sectors * nr_replicas, flags); +} + +#define RESERVE_FACTOR 6 + +static inline u64 avail_factor(u64 r) +{ + return div_u64(r << RESERVE_FACTOR, (1 << RESERVE_FACTOR) + 1); +} + +int bch2_dev_buckets_resize(struct bch_fs *, struct bch_dev *, u64); +void bch2_dev_buckets_free(struct bch_dev *); +int bch2_dev_buckets_alloc(struct bch_fs *, struct bch_dev *); + +#endif /* _BUCKETS_H */ diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h new file mode 100644 index 000000000..2a9dab900 --- /dev/null +++ b/fs/bcachefs/buckets_types.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_TYPES_H +#define _BUCKETS_TYPES_H + +#include "bcachefs_format.h" +#include 
"util.h" + +#define BUCKET_JOURNAL_SEQ_BITS 16 + +struct bucket { + u8 lock; + u8 gen_valid:1; + u8 data_type:7; + u8 gen; + u8 stripe_redundancy; + u32 stripe; + u32 dirty_sectors; + u32 cached_sectors; +}; + +struct bucket_array { + struct rcu_head rcu; + u16 first_bucket; + size_t nbuckets; + struct bucket b[]; +}; + +struct bucket_gens { + struct rcu_head rcu; + u16 first_bucket; + size_t nbuckets; + u8 b[]; +}; + +struct bch_dev_usage { + u64 buckets_ec; + + struct { + u64 buckets; + u64 sectors; /* _compressed_ sectors: */ + /* + * XXX + * Why do we have this? Isn't it just buckets * bucket_size - + * sectors? + */ + u64 fragmented; + } d[BCH_DATA_NR]; +}; + +struct bch_fs_usage { + /* all fields are in units of 512 byte sectors: */ + u64 hidden; + u64 btree; + u64 data; + u64 cached; + u64 reserved; + u64 nr_inodes; + + /* XXX: add stats for compression ratio */ +#if 0 + u64 uncompressed; + u64 compressed; +#endif + + /* broken out: */ + + u64 persistent_reserved[BCH_REPLICAS_MAX]; + u64 replicas[]; +}; + +struct bch_fs_usage_online { + u64 online_reserved; + struct bch_fs_usage u; +}; + +struct bch_fs_usage_short { + u64 capacity; + u64 used; + u64 free; + u64 nr_inodes; +}; + +/* + * A reservation for space on disk: + */ +struct disk_reservation { + u64 sectors; + u32 gen; + unsigned nr_replicas; +}; + +#endif /* _BUCKETS_TYPES_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c new file mode 100644 index 000000000..81ab685cd --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "buckets_waiting_for_journal.h" +#include +#include + +static inline struct bucket_hashed * +bucket_hash(struct buckets_waiting_for_journal_table *t, + unsigned hash_seed_idx, u64 dev_bucket) +{ + return t->d + hash_64(dev_bucket ^ t->hash_seeds[hash_seed_idx], t->bits); +} + +static void bucket_table_init(struct buckets_waiting_for_journal_table *t, size_t bits) +{ + unsigned i; + + t->bits = bits; + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) + get_random_bytes(&t->hash_seeds[i], sizeof(t->hash_seeds[i])); + memset(t->d, 0, sizeof(t->d[0]) << t->bits); +} + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket) +{ + struct buckets_waiting_for_journal_table *t; + u64 dev_bucket = (u64) dev << 56 | bucket; + bool ret = false; + unsigned i; + + mutex_lock(&b->lock); + t = b->t; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + struct bucket_hashed *h = bucket_hash(t, i, dev_bucket); + + if (h->dev_bucket == dev_bucket) { + ret = h->journal_seq > flushed_seq; + break; + } + } + + mutex_unlock(&b->lock); + + return ret; +} + +static bool bucket_table_insert(struct buckets_waiting_for_journal_table *t, + struct bucket_hashed *new, + u64 flushed_seq) +{ + struct bucket_hashed *last_evicted = NULL; + unsigned tries, i; + + for (tries = 0; tries < 10; tries++) { + struct bucket_hashed *old, *victim = NULL; + + for (i = 0; i < ARRAY_SIZE(t->hash_seeds); i++) { + old = bucket_hash(t, i, new->dev_bucket); + + if (old->dev_bucket == new->dev_bucket || + old->journal_seq <= flushed_seq) { + *old = *new; + return true; + } + + if (last_evicted != old) + victim = old; + } + + /* hashed to same slot 3 times: */ + if (!victim) + break; + + /* Failed to find an empty slot: */ + swap(*new, *victim); + last_evicted = victim; + } + + return false; +} + +int bch2_set_bucket_needs_journal_commit(struct 
buckets_waiting_for_journal *b, + u64 flushed_seq, + unsigned dev, u64 bucket, + u64 journal_seq) +{ + struct buckets_waiting_for_journal_table *t, *n; + struct bucket_hashed tmp, new = { + .dev_bucket = (u64) dev << 56 | bucket, + .journal_seq = journal_seq, + }; + size_t i, size, new_bits, nr_elements = 1, nr_rehashes = 0; + int ret = 0; + + mutex_lock(&b->lock); + + if (likely(bucket_table_insert(b->t, &new, flushed_seq))) + goto out; + + t = b->t; + size = 1UL << t->bits; + for (i = 0; i < size; i++) + nr_elements += t->d[i].journal_seq > flushed_seq; + + new_bits = t->bits + (nr_elements * 3 > size); + + n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); + if (!n) { + ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; + goto out; + } + +retry_rehash: + nr_rehashes++; + bucket_table_init(n, new_bits); + + tmp = new; + BUG_ON(!bucket_table_insert(n, &tmp, flushed_seq)); + + for (i = 0; i < 1UL << t->bits; i++) { + if (t->d[i].journal_seq <= flushed_seq) + continue; + + tmp = t->d[i]; + if (!bucket_table_insert(n, &tmp, flushed_seq)) + goto retry_rehash; + } + + b->t = n; + kvfree(t); + + pr_debug("took %zu rehashes, table at %zu/%zu elements", + nr_rehashes, nr_elements, 1UL << b->t->bits); +out: + mutex_unlock(&b->lock); + + return ret; +} + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + kvfree(b->t); +} + +#define INITIAL_TABLE_BITS 3 + +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *c) +{ + struct buckets_waiting_for_journal *b = &c->buckets_waiting_for_journal; + + mutex_init(&b->lock); + + b->t = kvmalloc(sizeof(*b->t) + + (sizeof(b->t->d[0]) << INITIAL_TABLE_BITS), GFP_KERNEL); + if (!b->t) + return -BCH_ERR_ENOMEM_buckets_waiting_for_journal_init; + + bucket_table_init(b->t, INITIAL_TABLE_BITS); + return 0; +} diff --git a/fs/bcachefs/buckets_waiting_for_journal.h b/fs/bcachefs/buckets_waiting_for_journal.h new file mode 100644 index 000000000..d2ae19cbe --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_H +#define _BUCKETS_WAITING_FOR_JOURNAL_H + +#include "buckets_waiting_for_journal_types.h" + +bool bch2_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64); +int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *, + u64, unsigned, u64, u64); + +void bch2_fs_buckets_waiting_for_journal_exit(struct bch_fs *); +int bch2_fs_buckets_waiting_for_journal_init(struct bch_fs *); + +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_H */ diff --git a/fs/bcachefs/buckets_waiting_for_journal_types.h b/fs/bcachefs/buckets_waiting_for_journal_types.h new file mode 100644 index 000000000..e593db061 --- /dev/null +++ b/fs/bcachefs/buckets_waiting_for_journal_types.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H +#define _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H + +#include + +struct bucket_hashed { + u64 dev_bucket; + u64 journal_seq; +}; + +struct buckets_waiting_for_journal_table { + unsigned bits; + u64 hash_seeds[3]; + struct bucket_hashed d[]; +}; + +struct buckets_waiting_for_journal { + struct mutex lock; + struct buckets_waiting_for_journal_table *t; +}; + +#endif /* _BUCKETS_WAITING_FOR_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c new file mode 100644 index 000000000..fb603df09 --- /dev/null +++ b/fs/bcachefs/chardev.c 
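(Aside, not part of the patch: the table filled by bch2_set_bucket_needs_journal_commit() above is a small cuckoo-style hash table. Each dev:bucket key has three candidate slots chosen by three independently seeded hashes; a slot whose journal_seq has already been flushed counts as free, a live entry may be displaced and re-homed a bounded number of times, and if that fails the caller grows and rehashes the table. The standalone C sketch below reproduces just the insert-with-eviction loop; the type names, hash function and constants are invented for illustration and simplified relative to the kernel code.)

/*
 * Illustrative sketch of cuckoo-style insertion with stale-slot reuse,
 * in the spirit of bucket_table_insert().  NR_HASHES, MAX_KICKS and the
 * multiplicative hash are invented for this example.
 */
#include <stdbool.h>
#include <stdint.h>

#define NR_HASHES	3
#define MAX_KICKS	10

struct entry {
	uint64_t key;
	uint64_t journal_seq;
};

struct table {
	unsigned bits;			/* table has 1 << bits slots */
	uint64_t seeds[NR_HASHES];
	struct entry *slots;
};

static struct entry *slot(struct table *t, unsigned h, uint64_t key)
{
	/* stand-in for the kernel's hash_64() */
	uint64_t x = (key ^ t->seeds[h]) * 0x9e3779b97f4a7c15ULL;

	return &t->slots[x >> (64 - t->bits)];
}

/* Returns false if the table is too full: caller must grow and rehash. */
static bool table_insert(struct table *t, struct entry new, uint64_t flushed_seq)
{
	struct entry *last_evicted = NULL;

	for (unsigned kick = 0; kick < MAX_KICKS; kick++) {
		struct entry *victim = NULL;

		for (unsigned h = 0; h < NR_HASHES; h++) {
			struct entry *e = slot(t, h, new.key);

			/* same key, or entry already flushed: slot is reusable */
			if (e->key == new.key || e->journal_seq <= flushed_seq) {
				*e = new;
				return true;
			}
			/* remember a live candidate we did not just place */
			if (e != last_evicted)
				victim = e;
		}

		/* every candidate hashed to the slot we just filled */
		if (!victim)
			return false;

		/* displace the victim; try to re-home it on the next pass */
		struct entry tmp = *victim;
		*victim = new;
		new = tmp;
		last_evicted = victim;
	}

	return false;
}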
@@ -0,0 +1,769 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_CHARDEV + +#include "bcachefs.h" +#include "bcachefs_ioctl.h" +#include "buckets.h" +#include "chardev.h" +#include "journal.h" +#include "move.h" +#include "replicas.h" +#include "super.h" +#include "super-io.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* returns with ref on ca->ref */ +static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev, + unsigned flags) +{ + struct bch_dev *ca; + + if (flags & BCH_BY_INDEX) { + if (dev >= c->sb.nr_devices) + return ERR_PTR(-EINVAL); + + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + if (!ca) + return ERR_PTR(-EINVAL); + } else { + char *path; + + path = strndup_user((const char __user *) + (unsigned long) dev, PATH_MAX); + if (IS_ERR(path)) + return ERR_CAST(path); + + ca = bch2_dev_lookup(c, path); + kfree(path); + } + + return ca; +} + +#if 0 +static long bch2_ioctl_assemble(struct bch_ioctl_assemble __user *user_arg) +{ + struct bch_ioctl_assemble arg; + struct bch_fs *c; + u64 *user_devs = NULL; + char **devs = NULL; + unsigned i; + int ret = -EFAULT; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + user_devs = kmalloc_array(arg.nr_devs, sizeof(u64), GFP_KERNEL); + if (!user_devs) + return -ENOMEM; + + devs = kcalloc(arg.nr_devs, sizeof(char *), GFP_KERNEL); + + if (copy_from_user(user_devs, user_arg->devs, + sizeof(u64) * arg.nr_devs)) + goto err; + + for (i = 0; i < arg.nr_devs; i++) { + devs[i] = strndup_user((const char __user *)(unsigned long) + user_devs[i], + PATH_MAX); + if (!devs[i]) { + ret = -ENOMEM; + goto err; + } + } + + c = bch2_fs_open(devs, arg.nr_devs, bch2_opts_empty()); + ret = PTR_ERR_OR_ZERO(c); + if (!ret) + closure_put(&c->cl); +err: + if (devs) + for (i = 0; i < arg.nr_devs; i++) + kfree(devs[i]); + kfree(devs); + return ret; +} + +static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg) +{ + struct bch_ioctl_incremental arg; + const char *err; + char *path; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + if (!path) + return -ENOMEM; + + err = bch2_fs_open_incremental(path); + kfree(path); + + if (err) { + pr_err("Could not register bcachefs devices: %s", err); + return -EINVAL; + } + + return 0; +} +#endif + +static long bch2_global_ioctl(unsigned cmd, void __user *arg) +{ + switch (cmd) { +#if 0 + case BCH_IOCTL_ASSEMBLE: + return bch2_ioctl_assemble(arg); + case BCH_IOCTL_INCREMENTAL: + return bch2_ioctl_incremental(arg); +#endif + default: + return -ENOTTY; + } +} + +static long bch2_ioctl_query_uuid(struct bch_fs *c, + struct bch_ioctl_query_uuid __user *user_arg) +{ + return copy_to_user(&user_arg->uuid, + &c->sb.user_uuid, + sizeof(c->sb.user_uuid)); +} + +#if 0 +static long bch2_ioctl_start(struct bch_fs *c, struct bch_ioctl_start arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (arg.flags || arg.pad) + return -EINVAL; + + return bch2_fs_start(c); +} + +static long bch2_ioctl_stop(struct bch_fs *c) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + bch2_fs_stop(c); + return 0; +} +#endif + +static long bch2_ioctl_disk_add(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + char *path; + int ret; + + if 
(!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + if (!path) + return -ENOMEM; + + ret = bch2_dev_add(c, path); + kfree(path); + + return ret; +} + +static long bch2_ioctl_disk_remove(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + struct bch_dev *ca; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + return bch2_dev_remove(c, ca, arg.flags); +} + +static long bch2_ioctl_disk_online(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + char *path; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (arg.flags || arg.pad) + return -EINVAL; + + path = strndup_user((const char __user *)(unsigned long) arg.dev, PATH_MAX); + if (!path) + return -ENOMEM; + + ret = bch2_dev_online(c, path); + kfree(path); + return ret; +} + +static long bch2_ioctl_disk_offline(struct bch_fs *c, struct bch_ioctl_disk arg) +{ + struct bch_dev *ca; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_offline(c, ca, arg.flags); + percpu_ref_put(&ca->ref); + return ret; +} + +static long bch2_ioctl_disk_set_state(struct bch_fs *c, + struct bch_ioctl_disk_set_state arg) +{ + struct bch_dev *ca; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~(BCH_FORCE_IF_DATA_LOST| + BCH_FORCE_IF_METADATA_LOST| + BCH_FORCE_IF_DEGRADED| + BCH_BY_INDEX)) || + arg.pad[0] || arg.pad[1] || arg.pad[2] || + arg.new_state >= BCH_MEMBER_STATE_NR) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_set_state(c, ca, arg.new_state, arg.flags); + if (ret) + bch_err(c, "Error setting device state: %s", bch2_err_str(ret)); + + percpu_ref_put(&ca->ref); + return ret; +} + +struct bch_data_ctx { + struct bch_fs *c; + struct bch_ioctl_data arg; + struct bch_move_stats stats; + + int ret; + + struct task_struct *thread; +}; + +static int bch2_data_thread(void *arg) +{ + struct bch_data_ctx *ctx = arg; + + ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); + + ctx->stats.data_type = U8_MAX; + return 0; +} + +static int bch2_data_job_release(struct inode *inode, struct file *file) +{ + struct bch_data_ctx *ctx = file->private_data; + + kthread_stop(ctx->thread); + put_task_struct(ctx->thread); + kfree(ctx); + return 0; +} + +static ssize_t bch2_data_job_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct bch_data_ctx *ctx = file->private_data; + struct bch_fs *c = ctx->c; + struct bch_ioctl_data_event e = { + .type = BCH_DATA_EVENT_PROGRESS, + .p.data_type = ctx->stats.data_type, + .p.btree_id = ctx->stats.btree_id, + .p.pos = ctx->stats.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + .p.sectors_total = bch2_fs_usage_read_short(c).used, + }; + + if (len < sizeof(e)) + return -EINVAL; + + return copy_to_user(buf, &e, sizeof(e)) ?: sizeof(e); +} + +static const struct file_operations bcachefs_data_ops = { + .release = bch2_data_job_release, + .read = 
bch2_data_job_read, + .llseek = no_llseek, +}; + +static long bch2_ioctl_data(struct bch_fs *c, + struct bch_ioctl_data arg) +{ + struct bch_data_ctx *ctx = NULL; + struct file *file = NULL; + unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK; + int ret, fd = -1; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (arg.op >= BCH_DATA_OP_NR || arg.flags) + return -EINVAL; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->c = c; + ctx->arg = arg; + + ctx->thread = kthread_create(bch2_data_thread, ctx, + "bch-data/%s", c->name); + if (IS_ERR(ctx->thread)) { + ret = PTR_ERR(ctx->thread); + goto err; + } + + ret = get_unused_fd_flags(flags); + if (ret < 0) + goto err; + fd = ret; + + file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags); + if (IS_ERR(file)) { + ret = PTR_ERR(file); + goto err; + } + + fd_install(fd, file); + + get_task_struct(ctx->thread); + wake_up_process(ctx->thread); + + return fd; +err: + if (fd >= 0) + put_unused_fd(fd); + if (!IS_ERR_OR_NULL(ctx->thread)) + kthread_stop(ctx->thread); + kfree(ctx); + return ret; +} + +static long bch2_ioctl_fs_usage(struct bch_fs *c, + struct bch_ioctl_fs_usage __user *user_arg) +{ + struct bch_ioctl_fs_usage *arg = NULL; + struct bch_replicas_usage *dst_e, *dst_end; + struct bch_fs_usage_online *src; + u32 replica_entries_bytes; + unsigned i; + int ret = 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + + if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes)) + return -EFAULT; + + arg = kzalloc(sizeof(*arg) + replica_entries_bytes, GFP_KERNEL); + if (!arg) + return -ENOMEM; + + src = bch2_fs_usage_read(c); + if (!src) { + ret = -ENOMEM; + goto err; + } + + arg->capacity = c->capacity; + arg->used = bch2_fs_sectors_used(c, src); + arg->online_reserved = src->online_reserved; + + for (i = 0; i < BCH_REPLICAS_MAX; i++) + arg->persistent_reserved[i] = src->u.persistent_reserved[i]; + + dst_e = arg->replicas; + dst_end = (void *) arg->replicas + replica_entries_bytes; + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *src_e = + cpu_replicas_entry(&c->replicas, i); + + /* check that we have enough space for one replicas entry */ + if (dst_e + 1 > dst_end) { + ret = -ERANGE; + break; + } + + dst_e->sectors = src->u.replicas[i]; + dst_e->r = *src_e; + + /* recheck after setting nr_devs: */ + if (replicas_usage_next(dst_e) > dst_end) { + ret = -ERANGE; + break; + } + + memcpy(dst_e->r.devs, src_e->devs, src_e->nr_devs); + + dst_e = replicas_usage_next(dst_e); + } + + arg->replica_entries_bytes = (void *) dst_e - (void *) arg->replicas; + + percpu_up_read(&c->mark_lock); + kfree(src); + + if (!ret) + ret = copy_to_user(user_arg, arg, + sizeof(*arg) + arg->replica_entries_bytes); +err: + kfree(arg); + return ret; +} + +static long bch2_ioctl_dev_usage(struct bch_fs *c, + struct bch_ioctl_dev_usage __user *user_arg) +{ + struct bch_ioctl_dev_usage arg; + struct bch_dev_usage src; + struct bch_dev *ca; + unsigned i; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + + if (copy_from_user(&arg, user_arg, sizeof(arg))) + return -EFAULT; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad[0] || + arg.pad[1] || + arg.pad[2]) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + src = bch2_dev_usage_read(ca); + + arg.state = ca->mi.state; + arg.bucket_size = ca->mi.bucket_size; + arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket; + arg.buckets_ec = src.buckets_ec; + + for 
(i = 0; i < BCH_DATA_NR; i++) { + arg.d[i].buckets = src.d[i].buckets; + arg.d[i].sectors = src.d[i].sectors; + arg.d[i].fragmented = src.d[i].fragmented; + } + + percpu_ref_put(&ca->ref); + + return copy_to_user(user_arg, &arg, sizeof(arg)); +} + +static long bch2_ioctl_read_super(struct bch_fs *c, + struct bch_ioctl_read_super arg) +{ + struct bch_dev *ca = NULL; + struct bch_sb *sb; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~(BCH_BY_INDEX|BCH_READ_DEV)) || + arg.pad) + return -EINVAL; + + mutex_lock(&c->sb_lock); + + if (arg.flags & BCH_READ_DEV) { + ca = bch2_device_lookup(c, arg.dev, arg.flags); + + if (IS_ERR(ca)) { + ret = PTR_ERR(ca); + goto err; + } + + sb = ca->disk_sb.sb; + } else { + sb = c->disk_sb.sb; + } + + if (vstruct_bytes(sb) > arg.size) { + ret = -ERANGE; + goto err; + } + + ret = copy_to_user((void __user *)(unsigned long)arg.sb, + sb, vstruct_bytes(sb)); +err: + if (!IS_ERR_OR_NULL(ca)) + percpu_ref_put(&ca->ref); + mutex_unlock(&c->sb_lock); + return ret; +} + +static long bch2_ioctl_disk_get_idx(struct bch_fs *c, + struct bch_ioctl_disk_get_idx arg) +{ + dev_t dev = huge_decode_dev(arg.dev); + struct bch_dev *ca; + unsigned i; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) + return -EINVAL; + + for_each_online_member(ca, c, i) + if (ca->dev == dev) { + percpu_ref_put(&ca->io_ref); + return i; + } + + return -BCH_ERR_ENOENT_dev_idx_not_found; +} + +static long bch2_ioctl_disk_resize(struct bch_fs *c, + struct bch_ioctl_disk_resize arg) +{ + struct bch_dev *ca; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_dev_resize(c, ca, arg.nbuckets); + + percpu_ref_put(&ca->ref); + return ret; +} + +static long bch2_ioctl_disk_resize_journal(struct bch_fs *c, + struct bch_ioctl_disk_resize_journal arg) +{ + struct bch_dev *ca; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((arg.flags & ~BCH_BY_INDEX) || + arg.pad) + return -EINVAL; + + ca = bch2_device_lookup(c, arg.dev, arg.flags); + if (IS_ERR(ca)) + return PTR_ERR(ca); + + ret = bch2_set_nr_journal_buckets(c, ca, arg.nbuckets); + + percpu_ref_put(&ca->ref); + return ret; +} + +#define BCH_IOCTL(_name, _argtype) \ +do { \ + _argtype i; \ + \ + if (copy_from_user(&i, arg, sizeof(i))) \ + return -EFAULT; \ + ret = bch2_ioctl_##_name(c, i); \ + goto out; \ +} while (0) + +long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) +{ + long ret; + + switch (cmd) { + case BCH_IOCTL_QUERY_UUID: + return bch2_ioctl_query_uuid(c, arg); + case BCH_IOCTL_FS_USAGE: + return bch2_ioctl_fs_usage(c, arg); + case BCH_IOCTL_DEV_USAGE: + return bch2_ioctl_dev_usage(c, arg); +#if 0 + case BCH_IOCTL_START: + BCH_IOCTL(start, struct bch_ioctl_start); + case BCH_IOCTL_STOP: + return bch2_ioctl_stop(c); +#endif + case BCH_IOCTL_READ_SUPER: + BCH_IOCTL(read_super, struct bch_ioctl_read_super); + case BCH_IOCTL_DISK_GET_IDX: + BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx); + } + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EINVAL; + + switch (cmd) { + case BCH_IOCTL_DISK_ADD: + BCH_IOCTL(disk_add, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_REMOVE: + BCH_IOCTL(disk_remove, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_ONLINE: + BCH_IOCTL(disk_online, struct bch_ioctl_disk); + case BCH_IOCTL_DISK_OFFLINE: + BCH_IOCTL(disk_offline, struct bch_ioctl_disk); 
+ case BCH_IOCTL_DISK_SET_STATE: + BCH_IOCTL(disk_set_state, struct bch_ioctl_disk_set_state); + case BCH_IOCTL_DATA: + BCH_IOCTL(data, struct bch_ioctl_data); + case BCH_IOCTL_DISK_RESIZE: + BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize); + case BCH_IOCTL_DISK_RESIZE_JOURNAL: + BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal); + + default: + return -ENOTTY; + } +out: + if (ret < 0) + ret = bch2_err_class(ret); + return ret; +} + +static DEFINE_IDR(bch_chardev_minor); + +static long bch2_chardev_ioctl(struct file *filp, unsigned cmd, unsigned long v) +{ + unsigned minor = iminor(file_inode(filp)); + struct bch_fs *c = minor < U8_MAX ? idr_find(&bch_chardev_minor, minor) : NULL; + void __user *arg = (void __user *) v; + + return c + ? bch2_fs_ioctl(c, cmd, arg) + : bch2_global_ioctl(cmd, arg); +} + +static const struct file_operations bch_chardev_fops = { + .owner = THIS_MODULE, + .unlocked_ioctl = bch2_chardev_ioctl, + .open = nonseekable_open, +}; + +static int bch_chardev_major; +static struct class *bch_chardev_class; +static struct device *bch_chardev; + +void bch2_fs_chardev_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->chardev)) + device_unregister(c->chardev); + if (c->minor >= 0) + idr_remove(&bch_chardev_minor, c->minor); +} + +int bch2_fs_chardev_init(struct bch_fs *c) +{ + c->minor = idr_alloc(&bch_chardev_minor, c, 0, 0, GFP_KERNEL); + if (c->minor < 0) + return c->minor; + + c->chardev = device_create(bch_chardev_class, NULL, + MKDEV(bch_chardev_major, c->minor), c, + "bcachefs%u-ctl", c->minor); + if (IS_ERR(c->chardev)) + return PTR_ERR(c->chardev); + + return 0; +} + +void bch2_chardev_exit(void) +{ + if (!IS_ERR_OR_NULL(bch_chardev_class)) + device_destroy(bch_chardev_class, + MKDEV(bch_chardev_major, U8_MAX)); + if (!IS_ERR_OR_NULL(bch_chardev_class)) + class_destroy(bch_chardev_class); + if (bch_chardev_major > 0) + unregister_chrdev(bch_chardev_major, "bcachefs"); +} + +int __init bch2_chardev_init(void) +{ + bch_chardev_major = register_chrdev(0, "bcachefs-ctl", &bch_chardev_fops); + if (bch_chardev_major < 0) + return bch_chardev_major; + + bch_chardev_class = class_create("bcachefs"); + if (IS_ERR(bch_chardev_class)) + return PTR_ERR(bch_chardev_class); + + bch_chardev = device_create(bch_chardev_class, NULL, + MKDEV(bch_chardev_major, U8_MAX), + NULL, "bcachefs-ctl"); + if (IS_ERR(bch_chardev)) + return PTR_ERR(bch_chardev); + + return 0; +} + +#endif /* NO_BCACHEFS_CHARDEV */ diff --git a/fs/bcachefs/chardev.h b/fs/bcachefs/chardev.h new file mode 100644 index 000000000..3a4890d39 --- /dev/null +++ b/fs/bcachefs/chardev.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHARDEV_H +#define _BCACHEFS_CHARDEV_H + +#ifndef NO_BCACHEFS_FS + +long bch2_fs_ioctl(struct bch_fs *, unsigned, void __user *); + +void bch2_fs_chardev_exit(struct bch_fs *); +int bch2_fs_chardev_init(struct bch_fs *); + +void bch2_chardev_exit(void); +int __init bch2_chardev_init(void); + +#else + +static inline long bch2_fs_ioctl(struct bch_fs *c, + unsigned cmd, void __user * arg) +{ + return -ENOSYS; +} + +static inline void bch2_fs_chardev_exit(struct bch_fs *c) {} +static inline int bch2_fs_chardev_init(struct bch_fs *c) { return 0; } + +static inline void bch2_chardev_exit(void) {} +static inline int __init bch2_chardev_init(void) { return 0; } + +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_CHARDEV_H */ diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c new file mode 100644 index 000000000..a08997a5b --- /dev/null 
+++ b/fs/bcachefs/checksum.c @@ -0,0 +1,709 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" +#include "errcode.h" +#include "super.h" +#include "super-io.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * bch2_checksum state is an abstraction of the checksum state calculated over different pages. + * it features page merging without having the checksum algorithm lose its state. + * for native checksum aglorithms (like crc), a default seed value will do. + * for hash-like algorithms, a state needs to be stored + */ + +struct bch2_checksum_state { + union { + u64 seed; + struct xxh64_state h64state; + }; + unsigned int type; +}; + +static void bch2_checksum_init(struct bch2_checksum_state *state) +{ + switch (state->type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: + state->seed = 0; + break; + case BCH_CSUM_crc32c_nonzero: + state->seed = U32_MAX; + break; + case BCH_CSUM_crc64_nonzero: + state->seed = U64_MAX; + break; + case BCH_CSUM_xxhash: + xxh64_reset(&state->h64state, 0); + break; + default: + BUG(); + } +} + +static u64 bch2_checksum_final(const struct bch2_checksum_state *state) +{ + switch (state->type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: + return state->seed; + case BCH_CSUM_crc32c_nonzero: + return state->seed ^ U32_MAX; + case BCH_CSUM_crc64_nonzero: + return state->seed ^ U64_MAX; + case BCH_CSUM_xxhash: + return xxh64_digest(&state->h64state); + default: + BUG(); + } +} + +static void bch2_checksum_update(struct bch2_checksum_state *state, const void *data, size_t len) +{ + switch (state->type) { + case BCH_CSUM_none: + return; + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc32c: + state->seed = crc32c(state->seed, data, len); + break; + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc64: + state->seed = crc64_be(state->seed, data, len); + break; + case BCH_CSUM_xxhash: + xxh64_update(&state->h64state, data, len); + break; + default: + BUG(); + } +} + +static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + struct scatterlist *sg, size_t len) +{ + SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + int ret; + + skcipher_request_set_sync_tfm(req, tfm); + skcipher_request_set_crypt(req, sg, sg, len, nonce.d); + + ret = crypto_skcipher_encrypt(req); + if (ret) + pr_err("got error %i from crypto_skcipher_encrypt()", ret); + + return ret; +} + +static inline int do_encrypt(struct crypto_sync_skcipher *tfm, + struct nonce nonce, + void *buf, size_t len) +{ + if (!is_vmalloc_addr(buf)) { + struct scatterlist sg; + + sg_init_table(&sg, 1); + sg_set_page(&sg, + is_vmalloc_addr(buf) + ? 
vmalloc_to_page(buf) + : virt_to_page(buf), + len, offset_in_page(buf)); + return do_encrypt_sg(tfm, nonce, &sg, len); + } else { + unsigned pages = buf_pages(buf, len); + struct scatterlist *sg; + size_t orig_len = len; + int ret, i; + + sg = kmalloc_array(pages, sizeof(*sg), GFP_KERNEL); + if (!sg) + return -BCH_ERR_ENOMEM_do_encrypt; + + sg_init_table(sg, pages); + + for (i = 0; i < pages; i++) { + unsigned offset = offset_in_page(buf); + unsigned pg_len = min(len, PAGE_SIZE - offset); + + sg_set_page(sg + i, vmalloc_to_page(buf), pg_len, offset); + buf += pg_len; + len -= pg_len; + } + + ret = do_encrypt_sg(tfm, nonce, sg, orig_len); + kfree(sg); + return ret; + } +} + +int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, + void *buf, size_t len) +{ + struct crypto_sync_skcipher *chacha20 = + crypto_alloc_sync_skcipher("chacha20", 0, 0); + int ret; + + if (!chacha20) { + pr_err("error requesting chacha20 module: %li", PTR_ERR(chacha20)); + return PTR_ERR(chacha20); + } + + ret = crypto_skcipher_setkey(&chacha20->base, + (void *) key, sizeof(*key)); + if (ret) { + pr_err("crypto_skcipher_setkey() error: %i", ret); + goto err; + } + + ret = do_encrypt(chacha20, nonce, buf, len); +err: + crypto_free_sync_skcipher(chacha20); + return ret; +} + +static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, + struct nonce nonce) +{ + u8 key[POLY1305_KEY_SIZE]; + int ret; + + nonce.d[3] ^= BCH_NONCE_POLY; + + memset(key, 0, sizeof(key)); + ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); + if (ret) + return ret; + + desc->tfm = c->poly1305; + crypto_shash_init(desc); + crypto_shash_update(desc, key, sizeof(key)); + return 0; +} + +struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, + struct nonce nonce, const void *data, size_t len) +{ + switch (type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { + struct bch2_checksum_state state; + + state.type = type; + + bch2_checksum_init(&state); + bch2_checksum_update(&state, data, len); + + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; + } + + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + + crypto_shash_update(desc, data, len); + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} + +int bch2_encrypt(struct bch_fs *c, unsigned type, + struct nonce nonce, void *data, size_t len) +{ + if (!bch2_csum_type_is_encryption(type)) + return 0; + + return do_encrypt(c->chacha20, nonce, data, len); +} + +static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio, + struct bvec_iter *iter) +{ + struct bio_vec bv; + + switch (type) { + case BCH_CSUM_none: + return (struct bch_csum) { 0 }; + case BCH_CSUM_crc32c_nonzero: + case BCH_CSUM_crc64_nonzero: + case BCH_CSUM_crc32c: + case BCH_CSUM_xxhash: + case BCH_CSUM_crc64: { + struct bch2_checksum_state state; + + state.type = type; + bch2_checksum_init(&state); + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { + void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + bch2_checksum_update(&state, p, bv.bv_len); + kunmap_atomic(p); + } +#else + __bio_for_each_bvec(bv, bio, *iter, *iter) + bch2_checksum_update(&state, 
page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif + return (struct bch_csum) { .lo = cpu_to_le64(bch2_checksum_final(&state)) }; + } + + case BCH_CSUM_chacha20_poly1305_80: + case BCH_CSUM_chacha20_poly1305_128: { + SHASH_DESC_ON_STACK(desc, c->poly1305); + u8 digest[POLY1305_DIGEST_SIZE]; + struct bch_csum ret = { 0 }; + + gen_poly_key(c, desc, nonce); + +#ifdef CONFIG_HIGHMEM + __bio_for_each_segment(bv, bio, *iter, *iter) { + void *p = kmap_atomic(bv.bv_page) + bv.bv_offset; + + crypto_shash_update(desc, p, bv.bv_len); + kunmap_atomic(p); + } +#else + __bio_for_each_bvec(bv, bio, *iter, *iter) + crypto_shash_update(desc, + page_address(bv.bv_page) + bv.bv_offset, + bv.bv_len); +#endif + crypto_shash_final(desc, digest); + + memcpy(&ret, digest, bch_crc_bytes[type]); + return ret; + } + default: + BUG(); + } +} + +struct bch_csum bch2_checksum_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bvec_iter iter = bio->bi_iter; + + return __bch2_checksum_bio(c, type, nonce, bio, &iter); +} + +int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + struct bio_vec bv; + struct bvec_iter iter; + struct scatterlist sgl[16], *sg = sgl; + size_t bytes = 0; + int ret = 0; + + if (!bch2_csum_type_is_encryption(type)) + return 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + + bio_for_each_segment(bv, bio, iter) { + if (sg == sgl + ARRAY_SIZE(sgl)) { + sg_mark_end(sg - 1); + + ret = do_encrypt_sg(c->chacha20, nonce, sgl, bytes); + if (ret) + return ret; + + nonce = nonce_add(nonce, bytes); + bytes = 0; + + sg_init_table(sgl, ARRAY_SIZE(sgl)); + sg = sgl; + } + + sg_set_page(sg++, bv.bv_page, bv.bv_len, bv.bv_offset); + bytes += bv.bv_len; + } + + sg_mark_end(sg - 1); + return do_encrypt_sg(c->chacha20, nonce, sgl, bytes); +} + +struct bch_csum bch2_checksum_merge(unsigned type, struct bch_csum a, + struct bch_csum b, size_t b_len) +{ + struct bch2_checksum_state state; + + state.type = type; + bch2_checksum_init(&state); + state.seed = a.lo; + + BUG_ON(!bch2_checksum_mergeable(type)); + + while (b_len) { + unsigned b = min_t(unsigned, b_len, PAGE_SIZE); + + bch2_checksum_update(&state, + page_address(ZERO_PAGE(0)), b); + b_len -= b; + } + a.lo = bch2_checksum_final(&state); + a.lo ^= b.lo; + a.hi ^= b.hi; + return a; +} + +int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, + struct bversion version, + struct bch_extent_crc_unpacked crc_old, + struct bch_extent_crc_unpacked *crc_a, + struct bch_extent_crc_unpacked *crc_b, + unsigned len_a, unsigned len_b, + unsigned new_csum_type) +{ + struct bvec_iter iter = bio->bi_iter; + struct nonce nonce = extent_nonce(version, crc_old); + struct bch_csum merged = { 0 }; + struct crc_split { + struct bch_extent_crc_unpacked *crc; + unsigned len; + unsigned csum_type; + struct bch_csum csum; + } splits[3] = { + { crc_a, len_a, new_csum_type }, + { crc_b, len_b, new_csum_type }, + { NULL, bio_sectors(bio) - len_a - len_b, new_csum_type }, + }, *i; + bool mergeable = crc_old.csum_type == new_csum_type && + bch2_checksum_mergeable(new_csum_type); + unsigned crc_nonce = crc_old.nonce; + + BUG_ON(len_a + len_b > bio_sectors(bio)); + BUG_ON(crc_old.uncompressed_size != bio_sectors(bio)); + BUG_ON(crc_is_compressed(crc_old)); + BUG_ON(bch2_csum_type_is_encryption(crc_old.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)); + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + iter.bi_size = i->len << 9; + if (mergeable || i->crc) + i->csum = 
__bch2_checksum_bio(c, i->csum_type, + nonce, bio, &iter); + else + bio_advance_iter(bio, &iter, i->len << 9); + nonce = nonce_add(nonce, i->len << 9); + } + + if (mergeable) + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) + merged = bch2_checksum_merge(new_csum_type, merged, + i->csum, i->len << 9); + else + merged = bch2_checksum_bio(c, crc_old.csum_type, + extent_nonce(version, crc_old), bio); + + if (bch2_crc_cmp(merged, crc_old.csum)) { + bch_err(c, "checksum error in bch2_rechecksum_bio() (memory corruption or bug?)\n" + "expected %0llx:%0llx got %0llx:%0llx (old type %s new type %s)", + crc_old.csum.hi, + crc_old.csum.lo, + merged.hi, + merged.lo, + bch2_csum_types[crc_old.csum_type], + bch2_csum_types[new_csum_type]); + return -EIO; + } + + for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { + if (i->crc) + *i->crc = (struct bch_extent_crc_unpacked) { + .csum_type = i->csum_type, + .compression_type = crc_old.compression_type, + .compressed_size = i->len, + .uncompressed_size = i->len, + .offset = 0, + .live_size = i->len, + .nonce = crc_nonce, + .csum = i->csum, + }; + + if (bch2_csum_type_is_encryption(new_csum_type)) + crc_nonce += i->len; + } + + return 0; +} + +#ifdef __KERNEL__ +static int __bch2_request_key(char *key_description, struct bch_key *key) +{ + struct key *keyring_key; + const struct user_key_payload *ukp; + int ret; + + keyring_key = request_key(&key_type_user, key_description, NULL); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + down_read(&keyring_key->sem); + ukp = dereference_key_locked(keyring_key); + if (ukp->datalen == sizeof(*key)) { + memcpy(key, ukp->data, ukp->datalen); + ret = 0; + } else { + ret = -EINVAL; + } + up_read(&keyring_key->sem); + key_put(keyring_key); + + return ret; +} +#else +#include + +static int __bch2_request_key(char *key_description, struct bch_key *key) +{ + key_serial_t key_id; + + key_id = request_key("user", key_description, NULL, + KEY_SPEC_USER_KEYRING); + if (key_id < 0) + return -errno; + + if (keyctl_read(key_id, (void *) key, sizeof(*key)) != sizeof(*key)) + return -1; + + return 0; +} +#endif + +int bch2_request_key(struct bch_sb *sb, struct bch_key *key) +{ + struct printbuf key_description = PRINTBUF; + int ret; + + prt_printf(&key_description, "bcachefs:"); + pr_uuid(&key_description, sb->user_uuid.b); + + ret = __bch2_request_key(key_description.buf, key); + printbuf_exit(&key_description); + return ret; +} + +int bch2_decrypt_sb_key(struct bch_fs *c, + struct bch_sb_field_crypt *crypt, + struct bch_key *key) +{ + struct bch_encrypted_key sb_key = crypt->key; + struct bch_key user_key; + int ret = 0; + + /* is key encrypted? 
*/ + if (!bch2_key_is_encrypted(&sb_key)) + goto out; + + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); + goto err; + } + + /* decrypt real key: */ + ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), + &sb_key, sizeof(sb_key)); + if (ret) + goto err; + + if (bch2_key_is_encrypted(&sb_key)) { + bch_err(c, "incorrect encryption key"); + ret = -EINVAL; + goto err; + } +out: + *key = sb_key.key; +err: + memzero_explicit(&sb_key, sizeof(sb_key)); + memzero_explicit(&user_key, sizeof(user_key)); + return ret; +} + +static int bch2_alloc_ciphers(struct bch_fs *c) +{ + int ret; + + if (!c->chacha20) + c->chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); + ret = PTR_ERR_OR_ZERO(c->chacha20); + + if (ret) { + bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); + return ret; + } + + if (!c->poly1305) + c->poly1305 = crypto_alloc_shash("poly1305", 0, 0); + ret = PTR_ERR_OR_ZERO(c->poly1305); + + if (ret) { + bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); + return ret; + } + + return 0; +} + +int bch2_disable_encryption(struct bch_fs *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + crypt = bch2_sb_get_crypt(c->disk_sb.sb); + if (!crypt) + goto out; + + /* is key encrypted? */ + ret = 0; + if (bch2_key_is_encrypted(&crypt->key)) + goto out; + + ret = bch2_decrypt_sb_key(c, crypt, &key); + if (ret) + goto out; + + crypt->key.magic = BCH_KEY_MAGIC; + crypt->key.key = key; + + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 0); + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_enable_encryption(struct bch_fs *c, bool keyed) +{ + struct bch_encrypted_key key; + struct bch_key user_key; + struct bch_sb_field_crypt *crypt; + int ret = -EINVAL; + + mutex_lock(&c->sb_lock); + + /* Do we already have an encryption key? 
*/ + if (bch2_sb_get_crypt(c->disk_sb.sb)) + goto err; + + ret = bch2_alloc_ciphers(c); + if (ret) + goto err; + + key.magic = BCH_KEY_MAGIC; + get_random_bytes(&key.key, sizeof(key.key)); + + if (keyed) { + ret = bch2_request_key(c->disk_sb.sb, &user_key); + if (ret) { + bch_err(c, "error requesting encryption key: %s", bch2_err_str(ret)); + goto err; + } + + ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), + &key, sizeof(key)); + if (ret) + goto err; + } + + ret = crypto_skcipher_setkey(&c->chacha20->base, + (void *) &key.key, sizeof(key.key)); + if (ret) + goto err; + + crypt = bch2_sb_resize_crypt(&c->disk_sb, sizeof(*crypt) / sizeof(u64)); + if (!crypt) { + ret = -BCH_ERR_ENOSPC_sb_crypt; + goto err; + } + + crypt->key = key; + + /* write superblock */ + SET_BCH_SB_ENCRYPTION_TYPE(c->disk_sb.sb, 1); + bch2_write_super(c); +err: + mutex_unlock(&c->sb_lock); + memzero_explicit(&user_key, sizeof(user_key)); + memzero_explicit(&key, sizeof(key)); + return ret; +} + +void bch2_fs_encryption_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->poly1305)) + crypto_free_shash(c->poly1305); + if (!IS_ERR_OR_NULL(c->chacha20)) + crypto_free_sync_skcipher(c->chacha20); + if (!IS_ERR_OR_NULL(c->sha256)) + crypto_free_shash(c->sha256); +} + +int bch2_fs_encryption_init(struct bch_fs *c) +{ + struct bch_sb_field_crypt *crypt; + struct bch_key key; + int ret = 0; + + c->sha256 = crypto_alloc_shash("sha256", 0, 0); + ret = PTR_ERR_OR_ZERO(c->sha256); + if (ret) { + bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); + goto out; + } + + crypt = bch2_sb_get_crypt(c->disk_sb.sb); + if (!crypt) + goto out; + + ret = bch2_alloc_ciphers(c); + if (ret) + goto out; + + ret = bch2_decrypt_sb_key(c, crypt, &key); + if (ret) + goto out; + + ret = crypto_skcipher_setkey(&c->chacha20->base, + (void *) &key.key, sizeof(key.key)); + if (ret) + goto out; +out: + memzero_explicit(&key, sizeof(key)); + return ret; +} diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h new file mode 100644 index 000000000..1ad1d5f03 --- /dev/null +++ b/fs/bcachefs/checksum.h @@ -0,0 +1,209 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CHECKSUM_H +#define _BCACHEFS_CHECKSUM_H + +#include "bcachefs.h" +#include "extents_types.h" +#include "super-io.h" + +#include +#include + +static inline bool bch2_checksum_mergeable(unsigned type) +{ + + switch (type) { + case BCH_CSUM_none: + case BCH_CSUM_crc32c: + case BCH_CSUM_crc64: + return true; + default: + return false; + } +} + +struct bch_csum bch2_checksum_merge(unsigned, struct bch_csum, + struct bch_csum, size_t); + +#define BCH_NONCE_EXTENT cpu_to_le32(1 << 28) +#define BCH_NONCE_BTREE cpu_to_le32(2 << 28) +#define BCH_NONCE_JOURNAL cpu_to_le32(3 << 28) +#define BCH_NONCE_PRIO cpu_to_le32(4 << 28) +#define BCH_NONCE_POLY cpu_to_le32(1 << 31) + +struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce, + const void *, size_t); + +/* + * This is used for various on disk data structures - bch_sb, prio_set, bset, + * jset: The checksum is _always_ the first field of these structs + */ +#define csum_vstruct(_c, _type, _nonce, _i) \ +({ \ + const void *start = ((const void *) (_i)) + sizeof((_i)->csum); \ + const void *end = vstruct_end(_i); \ + \ + bch2_checksum(_c, _type, _nonce, start, end - start); \ +}) + +int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); +int bch2_request_key(struct bch_sb *, struct bch_key *); + +int bch2_encrypt(struct bch_fs *, unsigned, struct nonce, + void *data, size_t); + 
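(Aside, not part of the patch: bch2_checksum_mergeable() and bch2_checksum_merge() above rely on the fact that a CRC computed with a zero seed and no final inversion is linear, so the checksums of two adjacent regions can be combined as crc(A || B) == crc(A || zeros(len(B))) ^ crc(B); that is, the left checksum is extended over len(B) zero bytes and the right checksum is xored in. The toy program below demonstrates that identity with a bitwise CRC-32C; it is a simplified stand-in for the kernel's crc32c() and is only meant to illustrate the property.)

/*
 * Toy demonstration of the merge identity.  crc32c_update() here is a
 * bitwise CRC-32C with zero seed and no final xor; it is not the kernel's
 * crc32c() and is only meant to show the algebra.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_update(uint32_t crc, const void *data, size_t len)
{
	const uint8_t *p = data;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	const char a[] = "left half ", b[] = "right half";
	uint8_t zeros[sizeof(b) - 1] = { 0 };
	char whole[sizeof(a) - 1 + sizeof(b) - 1];

	memcpy(whole, a, sizeof(a) - 1);
	memcpy(whole + sizeof(a) - 1, b, sizeof(b) - 1);

	uint32_t crc_a = crc32c_update(0, a, sizeof(a) - 1);
	uint32_t crc_b = crc32c_update(0, b, sizeof(b) - 1);

	/* extend crc_a over len(b) zero bytes, then xor in crc_b */
	uint32_t merged = crc32c_update(crc_a, zeros, sizeof(zeros)) ^ crc_b;

	assert(merged == crc32c_update(0, whole, sizeof(whole)));
	return 0;
}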
+struct bch_csum bch2_checksum_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); + +int bch2_rechecksum_bio(struct bch_fs *, struct bio *, struct bversion, + struct bch_extent_crc_unpacked, + struct bch_extent_crc_unpacked *, + struct bch_extent_crc_unpacked *, + unsigned, unsigned, unsigned); + +int __bch2_encrypt_bio(struct bch_fs *, unsigned, + struct nonce, struct bio *); + +static inline int bch2_encrypt_bio(struct bch_fs *c, unsigned type, + struct nonce nonce, struct bio *bio) +{ + return bch2_csum_type_is_encryption(type) + ? __bch2_encrypt_bio(c, type, nonce, bio) + : 0; +} + +int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, + struct bch_key *); + +int bch2_disable_encryption(struct bch_fs *); +int bch2_enable_encryption(struct bch_fs *, bool); + +void bch2_fs_encryption_exit(struct bch_fs *); +int bch2_fs_encryption_init(struct bch_fs *); + +static inline enum bch_csum_type bch2_csum_opt_to_type(enum bch_csum_opts type, + bool data) +{ + switch (type) { + case BCH_CSUM_OPT_none: + return BCH_CSUM_none; + case BCH_CSUM_OPT_crc32c: + return data ? BCH_CSUM_crc32c : BCH_CSUM_crc32c_nonzero; + case BCH_CSUM_OPT_crc64: + return data ? BCH_CSUM_crc64 : BCH_CSUM_crc64_nonzero; + case BCH_CSUM_OPT_xxhash: + return BCH_CSUM_xxhash; + default: + BUG(); + } +} + +static inline enum bch_csum_type bch2_data_checksum_type(struct bch_fs *c, + struct bch_io_opts opts) +{ + if (opts.nocow) + return 0; + + if (c->sb.encryption_type) + return c->opts.wide_macs + ? BCH_CSUM_chacha20_poly1305_128 + : BCH_CSUM_chacha20_poly1305_80; + + return bch2_csum_opt_to_type(opts.data_checksum, true); +} + +static inline enum bch_csum_type bch2_meta_checksum_type(struct bch_fs *c) +{ + if (c->sb.encryption_type) + return BCH_CSUM_chacha20_poly1305_128; + + return bch2_csum_opt_to_type(c->opts.metadata_checksum, false); +} + +static inline bool bch2_checksum_type_valid(const struct bch_fs *c, + unsigned type) +{ + if (type >= BCH_CSUM_NR) + return false; + + if (bch2_csum_type_is_encryption(type) && !c->chacha20) + return false; + + return true; +} + +/* returns true if not equal */ +static inline bool bch2_crc_cmp(struct bch_csum l, struct bch_csum r) +{ + /* + * XXX: need some way of preventing the compiler from optimizing this + * into a form that isn't constant time.. + */ + return ((l.lo ^ r.lo) | (l.hi ^ r.hi)) != 0; +} + +/* for skipping ahead and encrypting/decrypting at an offset: */ +static inline struct nonce nonce_add(struct nonce nonce, unsigned offset) +{ + EBUG_ON(offset & (CHACHA_BLOCK_SIZE - 1)); + + le32_add_cpu(&nonce.d[0], offset / CHACHA_BLOCK_SIZE); + return nonce; +} + +static inline struct nonce null_nonce(void) +{ + struct nonce ret; + + memset(&ret, 0, sizeof(ret)); + return ret; +} + +static inline struct nonce extent_nonce(struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + unsigned compression_type = crc_is_compressed(crc) + ? crc.compression_type + : 0; + unsigned size = compression_type ? 
crc.uncompressed_size : 0; + struct nonce nonce = (struct nonce) {{ + [0] = cpu_to_le32(size << 22), + [1] = cpu_to_le32(version.lo), + [2] = cpu_to_le32(version.lo >> 32), + [3] = cpu_to_le32(version.hi| + (compression_type << 24))^BCH_NONCE_EXTENT, + }}; + + return nonce_add(nonce, crc.nonce << 9); +} + +static inline bool bch2_key_is_encrypted(struct bch_encrypted_key *key) +{ + return le64_to_cpu(key->magic) != BCH_KEY_MAGIC; +} + +static inline struct nonce __bch2_sb_key_nonce(struct bch_sb *sb) +{ + __le64 magic = __bch2_sb_magic(sb); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + +static inline struct nonce bch2_sb_key_nonce(struct bch_fs *c) +{ + __le64 magic = bch2_sb_magic(c); + + return (struct nonce) {{ + [0] = 0, + [1] = 0, + [2] = ((__le32 *) &magic)[0], + [3] = ((__le32 *) &magic)[1], + }}; +} + +#endif /* _BCACHEFS_CHECKSUM_H */ diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c new file mode 100644 index 000000000..f41889093 --- /dev/null +++ b/fs/bcachefs/clock.c @@ -0,0 +1,193 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "clock.h" + +#include +#include +#include + +static inline long io_timer_cmp(io_timer_heap *h, + struct io_timer *l, + struct io_timer *r) +{ + return l->expire - r->expire; +} + +void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) +{ + size_t i; + + spin_lock(&clock->timer_lock); + + if (time_after_eq((unsigned long) atomic64_read(&clock->now), + timer->expire)) { + spin_unlock(&clock->timer_lock); + timer->fn(timer); + return; + } + + for (i = 0; i < clock->timers.used; i++) + if (clock->timers.data[i] == timer) + goto out; + + BUG_ON(!heap_add(&clock->timers, timer, io_timer_cmp, NULL)); +out: + spin_unlock(&clock->timer_lock); +} + +void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) +{ + size_t i; + + spin_lock(&clock->timer_lock); + + for (i = 0; i < clock->timers.used; i++) + if (clock->timers.data[i] == timer) { + heap_del(&clock->timers, i, io_timer_cmp, NULL); + break; + } + + spin_unlock(&clock->timer_lock); +} + +struct io_clock_wait { + struct io_timer io_timer; + struct timer_list cpu_timer; + struct task_struct *task; + int expired; +}; + +static void io_clock_wait_fn(struct io_timer *timer) +{ + struct io_clock_wait *wait = container_of(timer, + struct io_clock_wait, io_timer); + + wait->expired = 1; + wake_up_process(wait->task); +} + +static void io_clock_cpu_timeout(struct timer_list *timer) +{ + struct io_clock_wait *wait = container_of(timer, + struct io_clock_wait, cpu_timer); + + wait->expired = 1; + wake_up_process(wait->task); +} + +void bch2_io_clock_schedule_timeout(struct io_clock *clock, unsigned long until) +{ + struct io_clock_wait wait; + + /* XXX: calculate sleep time rigorously */ + wait.io_timer.expire = until; + wait.io_timer.fn = io_clock_wait_fn; + wait.task = current; + wait.expired = 0; + bch2_io_timer_add(clock, &wait.io_timer); + + schedule(); + + bch2_io_timer_del(clock, &wait.io_timer); +} + +void bch2_kthread_io_clock_wait(struct io_clock *clock, + unsigned long io_until, + unsigned long cpu_timeout) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct io_clock_wait wait; + + wait.io_timer.expire = io_until; + wait.io_timer.fn = io_clock_wait_fn; + wait.task = current; + wait.expired = 0; + bch2_io_timer_add(clock, &wait.io_timer); + + timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); + + if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) + 
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); + + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + if (kthread && kthread_should_stop()) + break; + + if (wait.expired) + break; + + schedule(); + try_to_freeze(); + } + + __set_current_state(TASK_RUNNING); + del_timer_sync(&wait.cpu_timer); + destroy_timer_on_stack(&wait.cpu_timer); + bch2_io_timer_del(clock, &wait.io_timer); +} + +static struct io_timer *get_expired_timer(struct io_clock *clock, + unsigned long now) +{ + struct io_timer *ret = NULL; + + spin_lock(&clock->timer_lock); + + if (clock->timers.used && + time_after_eq(now, clock->timers.data[0]->expire)) + heap_pop(&clock->timers, ret, io_timer_cmp, NULL); + + spin_unlock(&clock->timer_lock); + + return ret; +} + +void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) +{ + struct io_timer *timer; + unsigned long now = atomic64_add_return(sectors, &clock->now); + + while ((timer = get_expired_timer(clock, now))) + timer->fn(timer); +} + +void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) +{ + unsigned long now; + unsigned i; + + out->atomic++; + spin_lock(&clock->timer_lock); + now = atomic64_read(&clock->now); + + for (i = 0; i < clock->timers.used; i++) + prt_printf(out, "%ps:\t%li\n", + clock->timers.data[i]->fn, + clock->timers.data[i]->expire - now); + spin_unlock(&clock->timer_lock); + --out->atomic; +} + +void bch2_io_clock_exit(struct io_clock *clock) +{ + free_heap(&clock->timers); + free_percpu(clock->pcpu_buf); +} + +int bch2_io_clock_init(struct io_clock *clock) +{ + atomic64_set(&clock->now, 0); + spin_lock_init(&clock->timer_lock); + + clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); + + clock->pcpu_buf = alloc_percpu(*clock->pcpu_buf); + if (!clock->pcpu_buf) + return -BCH_ERR_ENOMEM_io_clock_init; + + if (!init_heap(&clock->timers, NR_IO_TIMERS, GFP_KERNEL)) + return -BCH_ERR_ENOMEM_io_clock_init; + + return 0; +} diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h new file mode 100644 index 000000000..70a0f7436 --- /dev/null +++ b/fs/bcachefs/clock.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CLOCK_H +#define _BCACHEFS_CLOCK_H + +void bch2_io_timer_add(struct io_clock *, struct io_timer *); +void bch2_io_timer_del(struct io_clock *, struct io_timer *); +void bch2_kthread_io_clock_wait(struct io_clock *, unsigned long, + unsigned long); + +void __bch2_increment_clock(struct io_clock *, unsigned); + +static inline void bch2_increment_clock(struct bch_fs *c, unsigned sectors, + int rw) +{ + struct io_clock *clock = &c->io_clock[rw]; + + if (unlikely(this_cpu_add_return(*clock->pcpu_buf, sectors) >= + IO_CLOCK_PCPU_SECTORS)) + __bch2_increment_clock(clock, this_cpu_xchg(*clock->pcpu_buf, 0)); +} + +void bch2_io_clock_schedule_timeout(struct io_clock *, unsigned long); + +#define bch2_kthread_wait_event_ioclock_timeout(condition, clock, timeout)\ +({ \ + long __ret = timeout; \ + might_sleep(); \ + if (!___wait_cond_timeout(condition)) \ + __ret = __wait_event_timeout(wq, condition, timeout); \ + __ret; \ +}) + +void bch2_io_timers_to_text(struct printbuf *, struct io_clock *); + +void bch2_io_clock_exit(struct io_clock *); +int bch2_io_clock_init(struct io_clock *); + +#endif /* _BCACHEFS_CLOCK_H */ diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h new file mode 100644 index 000000000..5fae0012d --- /dev/null +++ b/fs/bcachefs/clock_types.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_CLOCK_TYPES_H +#define 
_BCACHEFS_CLOCK_TYPES_H + +#include "util.h" + +#define NR_IO_TIMERS (BCH_SB_MEMBERS_MAX * 3) + +/* + * Clocks/timers in units of sectors of IO: + * + * Note - they use percpu batching, so they're only approximate. + */ + +struct io_timer; +typedef void (*io_timer_fn)(struct io_timer *); + +struct io_timer { + io_timer_fn fn; + unsigned long expire; +}; + +/* Amount to buffer up on a percpu counter */ +#define IO_CLOCK_PCPU_SECTORS 128 + +typedef HEAP(struct io_timer *) io_timer_heap; + +struct io_clock { + atomic64_t now; + u16 __percpu *pcpu_buf; + unsigned max_slop; + + spinlock_t timer_lock; + io_timer_heap timers; +}; + +#endif /* _BCACHEFS_CLOCK_TYPES_H */ diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c new file mode 100644 index 000000000..560214c15 --- /dev/null +++ b/fs/bcachefs/compress.c @@ -0,0 +1,712 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "checksum.h" +#include "compress.h" +#include "extents.h" +#include "io.h" +#include "super-io.h" + +#include +#include +#include + +/* Bounce buffer: */ +struct bbuf { + void *b; + enum { + BB_NONE, + BB_VMAP, + BB_KMALLOC, + BB_MEMPOOL, + } type; + int rw; +}; + +static struct bbuf __bounce_alloc(struct bch_fs *c, unsigned size, int rw) +{ + void *b; + + BUG_ON(size > c->opts.encoded_extent_max); + + b = kmalloc(size, GFP_NOFS|__GFP_NOWARN); + if (b) + return (struct bbuf) { .b = b, .type = BB_KMALLOC, .rw = rw }; + + b = mempool_alloc(&c->compression_bounce[rw], GFP_NOFS); + if (b) + return (struct bbuf) { .b = b, .type = BB_MEMPOOL, .rw = rw }; + + BUG(); +} + +static bool bio_phys_contig(struct bio *bio, struct bvec_iter start) +{ + struct bio_vec bv; + struct bvec_iter iter; + void *expected_start = NULL; + + __bio_for_each_bvec(bv, bio, iter, start) { + if (expected_start && + expected_start != page_address(bv.bv_page) + bv.bv_offset) + return false; + + expected_start = page_address(bv.bv_page) + + bv.bv_offset + bv.bv_len; + } + + return true; +} + +static struct bbuf __bio_map_or_bounce(struct bch_fs *c, struct bio *bio, + struct bvec_iter start, int rw) +{ + struct bbuf ret; + struct bio_vec bv; + struct bvec_iter iter; + unsigned nr_pages = 0; + struct page *stack_pages[16]; + struct page **pages = NULL; + void *data; + + BUG_ON(start.bi_size > c->opts.encoded_extent_max); + + if (!PageHighMem(bio_iter_page(bio, start)) && + bio_phys_contig(bio, start)) + return (struct bbuf) { + .b = page_address(bio_iter_page(bio, start)) + + bio_iter_offset(bio, start), + .type = BB_NONE, .rw = rw + }; + + /* check if we can map the pages contiguously: */ + __bio_for_each_segment(bv, bio, iter, start) { + if (iter.bi_size != start.bi_size && + bv.bv_offset) + goto bounce; + + if (bv.bv_len < iter.bi_size && + bv.bv_offset + bv.bv_len < PAGE_SIZE) + goto bounce; + + nr_pages++; + } + + BUG_ON(DIV_ROUND_UP(start.bi_size, PAGE_SIZE) > nr_pages); + + pages = nr_pages > ARRAY_SIZE(stack_pages) + ? 
kmalloc_array(nr_pages, sizeof(struct page *), GFP_NOFS) + : stack_pages; + if (!pages) + goto bounce; + + nr_pages = 0; + __bio_for_each_segment(bv, bio, iter, start) + pages[nr_pages++] = bv.bv_page; + + data = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (pages != stack_pages) + kfree(pages); + + if (data) + return (struct bbuf) { + .b = data + bio_iter_offset(bio, start), + .type = BB_VMAP, .rw = rw + }; +bounce: + ret = __bounce_alloc(c, start.bi_size, rw); + + if (rw == READ) + memcpy_from_bio(ret.b, bio, start); + + return ret; +} + +static struct bbuf bio_map_or_bounce(struct bch_fs *c, struct bio *bio, int rw) +{ + return __bio_map_or_bounce(c, bio, bio->bi_iter, rw); +} + +static void bio_unmap_or_unbounce(struct bch_fs *c, struct bbuf buf) +{ + switch (buf.type) { + case BB_NONE: + break; + case BB_VMAP: + vunmap((void *) ((unsigned long) buf.b & PAGE_MASK)); + break; + case BB_KMALLOC: + kfree(buf.b); + break; + case BB_MEMPOOL: + mempool_free(buf.b, &c->compression_bounce[buf.rw]); + break; + } +} + +static inline void zlib_set_workspace(z_stream *strm, void *workspace) +{ +#ifdef __KERNEL__ + strm->workspace = workspace; +#endif +} + +static int __bio_uncompress(struct bch_fs *c, struct bio *src, + void *dst_data, struct bch_extent_crc_unpacked crc) +{ + struct bbuf src_data = { NULL }; + size_t src_len = src->bi_iter.bi_size; + size_t dst_len = crc.uncompressed_size << 9; + void *workspace; + int ret; + + src_data = bio_map_or_bounce(c, src, READ); + + switch (crc.compression_type) { + case BCH_COMPRESSION_TYPE_lz4_old: + case BCH_COMPRESSION_TYPE_lz4: + ret = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret != dst_len) + goto err; + break; + case BCH_COMPRESSION_TYPE_gzip: { + z_stream strm = { + .next_in = src_data.b, + .avail_in = src_len, + .next_out = dst_data, + .avail_out = dst_len, + }; + + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + + zlib_set_workspace(&strm, workspace); + zlib_inflateInit2(&strm, -MAX_WBITS); + ret = zlib_inflate(&strm, Z_FINISH); + + mempool_free(workspace, &c->decompress_workspace); + + if (ret != Z_STREAM_END) + goto err; + break; + } + case BCH_COMPRESSION_TYPE_zstd: { + ZSTD_DCtx *ctx; + size_t real_src_len = le32_to_cpup(src_data.b); + + if (real_src_len > src_len - 4) + goto err; + + workspace = mempool_alloc(&c->decompress_workspace, GFP_NOFS); + ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); + + ret = zstd_decompress_dctx(ctx, + dst_data, dst_len, + src_data.b + 4, real_src_len); + + mempool_free(workspace, &c->decompress_workspace); + + if (ret != dst_len) + goto err; + break; + } + default: + BUG(); + } + ret = 0; +out: + bio_unmap_or_unbounce(c, src_data); + return ret; +err: + ret = -EIO; + goto out; +} + +int bch2_bio_uncompress_inplace(struct bch_fs *c, struct bio *bio, + struct bch_extent_crc_unpacked *crc) +{ + struct bbuf data = { NULL }; + size_t dst_len = crc->uncompressed_size << 9; + + /* bio must own its pages: */ + BUG_ON(!bio->bi_vcnt); + BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); + + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || + crc->compressed_size << 9 > c->opts.encoded_extent_max) { + bch_err(c, "error rewriting existing data: extent too big"); + return -EIO; + } + + data = __bounce_alloc(c, dst_len, WRITE); + + if (__bio_uncompress(c, bio, data.b, *crc)) { + bch_err(c, "error rewriting existing data: decompression error"); + bio_unmap_or_unbounce(c, data); + return -EIO; + } + + /* + * XXX: 
don't have a good way to assert that the bio was allocated with + * enough space, we depend on bch2_move_extent doing the right thing + */ + bio->bi_iter.bi_size = crc->live_size << 9; + + memcpy_to_bio(bio, bio->bi_iter, data.b + (crc->offset << 9)); + + crc->csum_type = 0; + crc->compression_type = 0; + crc->compressed_size = crc->live_size; + crc->uncompressed_size = crc->live_size; + crc->offset = 0; + crc->csum = (struct bch_csum) { 0, 0 }; + + bio_unmap_or_unbounce(c, data); + return 0; +} + +int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, + struct bio *dst, struct bvec_iter dst_iter, + struct bch_extent_crc_unpacked crc) +{ + struct bbuf dst_data = { NULL }; + size_t dst_len = crc.uncompressed_size << 9; + int ret; + + if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || + crc.compressed_size << 9 > c->opts.encoded_extent_max) + return -EIO; + + dst_data = dst_len == dst_iter.bi_size + ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) + : __bounce_alloc(c, dst_len, WRITE); + + ret = __bio_uncompress(c, src, dst_data.b, crc); + if (ret) + goto err; + + if (dst_data.type != BB_NONE && + dst_data.type != BB_VMAP) + memcpy_to_bio(dst, dst_iter, dst_data.b + (crc.offset << 9)); +err: + bio_unmap_or_unbounce(c, dst_data); + return ret; +} + +static int attempt_compress(struct bch_fs *c, + void *workspace, + void *dst, size_t dst_len, + void *src, size_t src_len, + struct bch_compression_opt compression) +{ + enum bch_compression_type compression_type = + __bch2_compression_opt_to_type[compression.type]; + + switch (compression_type) { + case BCH_COMPRESSION_TYPE_lz4: + if (compression.level < LZ4HC_MIN_CLEVEL) { + int len = src_len; + int ret = LZ4_compress_destSize( + src, dst, + &len, dst_len, + workspace); + if (len < src_len) + return -len; + + return ret; + } else { + int ret = LZ4_compress_HC( + src, dst, + src_len, dst_len, + compression.level, + workspace); + + return ret ?: -1; + } + case BCH_COMPRESSION_TYPE_gzip: { + z_stream strm = { + .next_in = src, + .avail_in = src_len, + .next_out = dst, + .avail_out = dst_len, + }; + + zlib_set_workspace(&strm, workspace); + zlib_deflateInit2(&strm, + compression.level + ? clamp_t(unsigned, compression.level, + Z_BEST_SPEED, Z_BEST_COMPRESSION) + : Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, + Z_DEFAULT_STRATEGY); + + if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) + return 0; + + if (zlib_deflateEnd(&strm) != Z_OK) + return 0; + + return strm.total_out; + } + case BCH_COMPRESSION_TYPE_zstd: { + /* + * rescale: + * zstd max compression level is 22, our max level is 15 + */ + unsigned level = min((compression.level * 3) / 2, zstd_max_clevel()); + ZSTD_parameters params = zstd_get_params(level, c->opts.encoded_extent_max); + ZSTD_CCtx *ctx = zstd_init_cctx(workspace, + zstd_cctx_workspace_bound(¶ms.cParams)); + + /* + * ZSTD requires that when we decompress we pass in the exact + * compressed size - rounding it up to the nearest sector + * doesn't work, so we use the first 4 bytes of the buffer for + * that. + * + * Additionally, the ZSTD code seems to have a bug where it will + * write just past the end of the buffer - so subtract a fudge + * factor (7 bytes) from the dst buffer size to account for + * that. 
+ */ + size_t len = zstd_compress_cctx(ctx, + dst + 4, dst_len - 4 - 7, + src, src_len, + &c->zstd_params); + if (zstd_is_error(len)) + return 0; + + *((__le32 *) dst) = cpu_to_le32(len); + return len + 4; + } + default: + BUG(); + } +} + +static unsigned __bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + struct bch_compression_opt compression) +{ + struct bbuf src_data = { NULL }, dst_data = { NULL }; + void *workspace; + enum bch_compression_type compression_type = + __bch2_compression_opt_to_type[compression.type]; + unsigned pad; + int ret = 0; + + BUG_ON(compression_type >= BCH_COMPRESSION_TYPE_NR); + BUG_ON(!mempool_initialized(&c->compress_workspace[compression_type])); + + /* If it's only one block, don't bother trying to compress: */ + if (src->bi_iter.bi_size <= c->opts.block_size) + return BCH_COMPRESSION_TYPE_incompressible; + + dst_data = bio_map_or_bounce(c, dst, WRITE); + src_data = bio_map_or_bounce(c, src, READ); + + workspace = mempool_alloc(&c->compress_workspace[compression_type], GFP_NOFS); + + *src_len = src->bi_iter.bi_size; + *dst_len = dst->bi_iter.bi_size; + + /* + * XXX: this algorithm sucks when the compression code doesn't tell us + * how much would fit, like LZ4 does: + */ + while (1) { + if (*src_len <= block_bytes(c)) { + ret = -1; + break; + } + + ret = attempt_compress(c, workspace, + dst_data.b, *dst_len, + src_data.b, *src_len, + compression); + if (ret > 0) { + *dst_len = ret; + ret = 0; + break; + } + + /* Didn't fit: should we retry with a smaller amount? */ + if (*src_len <= *dst_len) { + ret = -1; + break; + } + + /* + * If ret is negative, it's a hint as to how much data would fit + */ + BUG_ON(-ret >= *src_len); + + if (ret < 0) + *src_len = -ret; + else + *src_len -= (*src_len - *dst_len) / 2; + *src_len = round_down(*src_len, block_bytes(c)); + } + + mempool_free(workspace, &c->compress_workspace[compression_type]); + + if (ret) + goto err; + + /* Didn't get smaller: */ + if (round_up(*dst_len, block_bytes(c)) >= *src_len) + goto err; + + pad = round_up(*dst_len, block_bytes(c)) - *dst_len; + + memset(dst_data.b + *dst_len, 0, pad); + *dst_len += pad; + + if (dst_data.type != BB_NONE && + dst_data.type != BB_VMAP) + memcpy_to_bio(dst, dst->bi_iter, dst_data.b); + + BUG_ON(!*dst_len || *dst_len > dst->bi_iter.bi_size); + BUG_ON(!*src_len || *src_len > src->bi_iter.bi_size); + BUG_ON(*dst_len & (block_bytes(c) - 1)); + BUG_ON(*src_len & (block_bytes(c) - 1)); + ret = compression_type; +out: + bio_unmap_or_unbounce(c, src_data); + bio_unmap_or_unbounce(c, dst_data); + return ret; +err: + ret = BCH_COMPRESSION_TYPE_incompressible; + goto out; +} + +unsigned bch2_bio_compress(struct bch_fs *c, + struct bio *dst, size_t *dst_len, + struct bio *src, size_t *src_len, + unsigned compression_opt) +{ + unsigned orig_dst = dst->bi_iter.bi_size; + unsigned orig_src = src->bi_iter.bi_size; + unsigned compression_type; + + /* Don't consume more than BCH_ENCODED_EXTENT_MAX from @src: */ + src->bi_iter.bi_size = min_t(unsigned, src->bi_iter.bi_size, + c->opts.encoded_extent_max); + /* Don't generate a bigger output than input: */ + dst->bi_iter.bi_size = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + + compression_type = + __bio_compress(c, dst, dst_len, src, src_len, + bch2_compression_decode(compression_opt)); + + dst->bi_iter.bi_size = orig_dst; + src->bi_iter.bi_size = orig_src; + return compression_type; +} + +static int __bch2_fs_compress_init(struct bch_fs *, u64); + +#define BCH_FEATURE_none 0 + 
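The zstd branch above prepends the exact compressed length as a 4-byte little-endian header, because decompression needs the precise compressed size rather than a sector-rounded one. Here is a minimal userspace sketch of that framing; toy_compress() merely stands in for zstd_compress_cctx(), the helper names are invented, and error handling is elided.

/*
 * Standalone userspace sketch (not bcachefs code) of the 4-byte
 * length-prefix framing: payload length stored little-endian in the
 * first 4 bytes of dst, compressed data written after it.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t cpu_to_le32_(uint32_t v)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	return __builtin_bswap32(v);
#else
	return v;
#endif
}

/* stand-in "compressor": just copies, returns the compressed length */
static size_t toy_compress(void *dst, size_t dst_len,
			   const void *src, size_t src_len)
{
	size_t len = src_len < dst_len ? src_len : dst_len;

	memcpy(dst, src, len);
	return len;
}

static size_t frame_compress(void *dst, size_t dst_len,
			     const void *src, size_t src_len)
{
	/* reserve 4 bytes for the length header, as in the code above */
	size_t len = toy_compress((char *) dst + 4, dst_len - 4, src, src_len);
	uint32_t hdr = cpu_to_le32_(len);

	memcpy(dst, &hdr, sizeof(hdr));
	return len + 4;		/* total bytes used in dst */
}

int main(void)
{
	char src[] = "hello, framed compression";
	char dst[64];
	size_t total = frame_compress(dst, sizeof(dst), src, sizeof(src));
	uint32_t stored;

	memcpy(&stored, dst, sizeof(stored));
	/* cpu_to_le32_() is its own inverse, so it doubles as le32_to_cpu */
	printf("stored payload len %u, total %zu\n",
	       (unsigned) cpu_to_le32_(stored), total);
	return 0;
}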
+static const unsigned bch2_compression_opt_to_feature[] = { +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_FEATURE_##t, + BCH_COMPRESSION_OPTS() +#undef x +}; + +#undef BCH_FEATURE_none + +static int __bch2_check_set_has_compressed_data(struct bch_fs *c, u64 f) +{ + int ret = 0; + + if ((c->sb.features & f) == f) + return 0; + + mutex_lock(&c->sb_lock); + + if ((c->sb.features & f) == f) { + mutex_unlock(&c->sb_lock); + return 0; + } + + ret = __bch2_fs_compress_init(c, c->sb.features|f); + if (ret) { + mutex_unlock(&c->sb_lock); + return ret; + } + + c->disk_sb.sb->features[0] |= cpu_to_le64(f); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +int bch2_check_set_has_compressed_data(struct bch_fs *c, + unsigned compression_opt) +{ + unsigned compression_type = bch2_compression_decode(compression_opt).type; + + BUG_ON(compression_type >= ARRAY_SIZE(bch2_compression_opt_to_feature)); + + return compression_type + ? __bch2_check_set_has_compressed_data(c, + 1ULL << bch2_compression_opt_to_feature[compression_type]) + : 0; +} + +void bch2_fs_compress_exit(struct bch_fs *c) +{ + unsigned i; + + mempool_exit(&c->decompress_workspace); + for (i = 0; i < ARRAY_SIZE(c->compress_workspace); i++) + mempool_exit(&c->compress_workspace[i]); + mempool_exit(&c->compression_bounce[WRITE]); + mempool_exit(&c->compression_bounce[READ]); +} + +static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) +{ + size_t decompress_workspace_size = 0; + bool decompress_workspace_needed; + ZSTD_parameters params = zstd_get_params(zstd_max_clevel(), + c->opts.encoded_extent_max); + struct { + unsigned feature; + enum bch_compression_type type; + size_t compress_workspace; + size_t decompress_workspace; + } compression_types[] = { + { BCH_FEATURE_lz4, BCH_COMPRESSION_TYPE_lz4, + max_t(size_t, LZ4_MEM_COMPRESS, LZ4HC_MEM_COMPRESS) }, + { BCH_FEATURE_gzip, BCH_COMPRESSION_TYPE_gzip, + zlib_deflate_workspacesize(MAX_WBITS, DEF_MEM_LEVEL), + zlib_inflate_workspacesize(), }, + { BCH_FEATURE_zstd, BCH_COMPRESSION_TYPE_zstd, + zstd_cctx_workspace_bound(¶ms.cParams), + zstd_dctx_workspace_bound() }, + }, *i; + bool have_compressed = false; + + c->zstd_params = params; + + for (i = compression_types; + i < compression_types + ARRAY_SIZE(compression_types); + i++) + have_compressed |= (features & (1 << i->feature)) != 0; + + if (!have_compressed) + return 0; + + if (!mempool_initialized(&c->compression_bounce[READ]) && + mempool_init_kvpmalloc_pool(&c->compression_bounce[READ], + 1, c->opts.encoded_extent_max)) + return -BCH_ERR_ENOMEM_compression_bounce_read_init; + + if (!mempool_initialized(&c->compression_bounce[WRITE]) && + mempool_init_kvpmalloc_pool(&c->compression_bounce[WRITE], + 1, c->opts.encoded_extent_max)) + return -BCH_ERR_ENOMEM_compression_bounce_write_init; + + for (i = compression_types; + i < compression_types + ARRAY_SIZE(compression_types); + i++) { + decompress_workspace_size = + max(decompress_workspace_size, i->decompress_workspace); + + if (!(features & (1 << i->feature))) + continue; + + if (i->decompress_workspace) + decompress_workspace_needed = true; + + if (mempool_initialized(&c->compress_workspace[i->type])) + continue; + + if (mempool_init_kvpmalloc_pool( + &c->compress_workspace[i->type], + 1, i->compress_workspace)) + return -BCH_ERR_ENOMEM_compression_workspace_init; + } + + if (!mempool_initialized(&c->decompress_workspace) && + mempool_init_kvpmalloc_pool(&c->decompress_workspace, + 1, decompress_workspace_size)) + return 
-BCH_ERR_ENOMEM_decompression_workspace_init; + + return 0; +} + +static u64 compression_opt_to_feature(unsigned v) +{ + unsigned type = bch2_compression_decode(v).type; + return 1ULL << bch2_compression_opt_to_feature[type]; +} + +int bch2_fs_compress_init(struct bch_fs *c) +{ + u64 f = c->sb.features; + + f |= compression_opt_to_feature(c->opts.compression); + f |= compression_opt_to_feature(c->opts.background_compression); + + return __bch2_fs_compress_init(c, f); +} + +int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, + struct printbuf *err) +{ + char *val = kstrdup(_val, GFP_KERNEL); + char *p = val, *type_str, *level_str; + struct bch_compression_opt opt = { 0 }; + int ret; + + if (!val) + return -ENOMEM; + + type_str = strsep(&p, ":"); + level_str = p; + + ret = match_string(bch2_compression_opts, -1, type_str); + if (ret < 0 && err) + prt_str(err, "invalid compression type"); + if (ret < 0) + goto err; + + opt.type = ret; + + if (level_str) { + unsigned level; + + ret = kstrtouint(level_str, 10, &level); + if (!ret && !opt.type && level) + ret = -EINVAL; + if (!ret && level > 15) + ret = -EINVAL; + if (ret < 0 && err) + prt_str(err, "invalid compression level"); + if (ret < 0) + goto err; + + opt.level = level; + } + + *res = bch2_compression_encode(opt); +err: + kfree(val); + return ret; +} + +void bch2_opt_compression_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + struct bch_compression_opt opt = bch2_compression_decode(v); + + prt_str(out, bch2_compression_opts[opt.type]); + if (opt.level) + prt_printf(out, ":%u", opt.level); +} diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h new file mode 100644 index 000000000..052ea3032 --- /dev/null +++ b/fs/bcachefs/compress.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COMPRESS_H +#define _BCACHEFS_COMPRESS_H + +#include "extents_types.h" + +struct bch_compression_opt { + u8 type:4, + level:4; +}; + +static inline struct bch_compression_opt bch2_compression_decode(unsigned v) +{ + return (struct bch_compression_opt) { + .type = v & 15, + .level = v >> 4, + }; +} + +static inline unsigned bch2_compression_encode(struct bch_compression_opt opt) +{ + return opt.type|(opt.level << 4); +} + +static const unsigned __bch2_compression_opt_to_type[] = { +#define x(t, n) [BCH_COMPRESSION_OPT_##t] = BCH_COMPRESSION_TYPE_##t, + BCH_COMPRESSION_OPTS() +#undef x +}; + +static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v) +{ + return __bch2_compression_opt_to_type[bch2_compression_decode(v).type]; +} + +int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *, + struct bch_extent_crc_unpacked *); +int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *, + struct bvec_iter, struct bch_extent_crc_unpacked); +unsigned bch2_bio_compress(struct bch_fs *, struct bio *, size_t *, + struct bio *, size_t *, unsigned); + +int bch2_check_set_has_compressed_data(struct bch_fs *, unsigned); +void bch2_fs_compress_exit(struct bch_fs *); +int bch2_fs_compress_init(struct bch_fs *); + +int bch2_opt_compression_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); +void bch2_opt_compression_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + +#define bch2_opt_compression (struct bch_opt_fn) { \ + .parse = bch2_opt_compression_parse, \ + .to_text = bch2_opt_compression_to_text, \ +} + +#endif /* _BCACHEFS_COMPRESS_H */ diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/counters.c new file mode 
100644 index 000000000..442a9b806 --- /dev/null +++ b/fs/bcachefs/counters.c @@ -0,0 +1,107 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "super-io.h" +#include "counters.h" + +/* BCH_SB_FIELD_counters */ + +static const char * const bch2_counter_names[] = { +#define x(t, n, ...) (#t), + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; + +static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) +{ + if (!ctrs) + return 0; + + return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; +}; + +static int bch2_sb_counters_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + return 0; +}; + +static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_counters *ctrs = field_to_type(f, counters); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + for (i = 0; i < nr; i++) { + if (i < BCH_COUNTER_NR) + prt_printf(out, "%s ", bch2_counter_names[i]); + else + prt_printf(out, "(unknown)"); + + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(ctrs->d[i])); + prt_newline(out); + }; +}; + +int bch2_sb_counters_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + u64 val = 0; + + for (i = 0; i < BCH_COUNTER_NR; i++) + c->counters_on_mount[i] = 0; + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { + val = le64_to_cpu(ctrs->d[i]); + percpu_u64_set(&c->counters[i], val); + c->counters_on_mount[i] = val; + } + return 0; +}; + +int bch2_sb_counters_from_cpu(struct bch_fs *c) +{ + struct bch_sb_field_counters *ctrs = bch2_sb_get_counters(c->disk_sb.sb); + struct bch_sb_field_counters *ret; + unsigned int i; + unsigned int nr = bch2_sb_counter_nr_entries(ctrs); + + if (nr < BCH_COUNTER_NR) { + ret = bch2_sb_resize_counters(&c->disk_sb, + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); + + if (ret) { + ctrs = ret; + nr = bch2_sb_counter_nr_entries(ctrs); + } + } + + + for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) + ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + return 0; +} + +void bch2_fs_counters_exit(struct bch_fs *c) +{ + free_percpu(c->counters); +} + +int bch2_fs_counters_init(struct bch_fs *c) +{ + c->counters = __alloc_percpu(sizeof(u64) * BCH_COUNTER_NR, sizeof(u64)); + if (!c->counters) + return -BCH_ERR_ENOMEM_fs_counters_init; + + return bch2_sb_counters_to_cpu(c); +} + +const struct bch_sb_field_ops bch_sb_field_ops_counters = { + .validate = bch2_sb_counters_validate, + .to_text = bch2_sb_counters_to_text, +}; diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/counters.h new file mode 100644 index 000000000..4778aa19b --- /dev/null +++ b/fs/bcachefs/counters.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_COUNTERS_H +#define _BCACHEFS_COUNTERS_H + +#include "bcachefs.h" +#include "super-io.h" + + +int bch2_sb_counters_to_cpu(struct bch_fs *); +int bch2_sb_counters_from_cpu(struct bch_fs *); + +void bch2_fs_counters_exit(struct bch_fs *); +int bch2_fs_counters_init(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_counters; + +#endif // _BCACHEFS_COUNTERS_H diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h new file mode 100644 index 000000000..d4485fa01 --- /dev/null +++ b/fs/bcachefs/darray.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DARRAY_H +#define _BCACHEFS_DARRAY_H 
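The counters code above keeps one slot per CPU and only sums the slots when the superblock field is rebuilt or the values are read back. A standalone sketch of that aggregation pattern follows; the flat 2-D array and the toy_* names are invented stand-ins for the kernel's percpu allocation and percpu_u64_get()/percpu_u64_set().

/*
 * Standalone userspace sketch (not bcachefs code): events accumulate
 * per CPU without sharing a counter word; totals are computed lazily.
 */
#include <stdint.h>
#include <stdio.h>

#define NCPU		4
#define TOY_COUNTER_NR	2

static uint64_t counters[NCPU][TOY_COUNTER_NR];

/* analogue of this_cpu_add(c->counters[id], v) */
static void toy_counter_add(unsigned cpu, unsigned id, uint64_t v)
{
	counters[cpu][id] += v;
}

/* analogue of percpu_u64_get(): sum the per-cpu slots */
static uint64_t toy_counter_read(unsigned id)
{
	uint64_t sum = 0;

	for (unsigned cpu = 0; cpu < NCPU; cpu++)
		sum += counters[cpu][id];
	return sum;
}

int main(void)
{
	toy_counter_add(0, 0, 10);
	toy_counter_add(3, 0, 5);
	toy_counter_add(1, 1, 7);

	/* the totals a bch2_sb_counters_from_cpu()-style writeback would persist */
	printf("counter 0 = %llu, counter 1 = %llu\n",
	       (unsigned long long) toy_counter_read(0),
	       (unsigned long long) toy_counter_read(1));
	return 0;
}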
+ +/* + * Dynamic arrays: + * + * Inspired by CCAN's darray + */ + +#include "util.h" +#include + +#define DARRAY(type) \ +struct { \ + size_t nr, size; \ + type *data; \ +} + +typedef DARRAY(void) darray_void; + +static inline int __darray_make_room(darray_void *d, size_t t_size, size_t more, gfp_t gfp) +{ + if (d->nr + more > d->size) { + size_t new_size = roundup_pow_of_two(d->nr + more); + void *data = krealloc_array(d->data, new_size, t_size, gfp); + + if (!data) + return -ENOMEM; + + d->data = data; + d->size = new_size; + } + + return 0; +} + +#define darray_make_room_gfp(_d, _more, _gfp) \ + __darray_make_room((darray_void *) (_d), sizeof((_d)->data[0]), (_more), _gfp) + +#define darray_make_room(_d, _more) \ + darray_make_room_gfp(_d, _more, GFP_KERNEL) + +#define darray_top(_d) ((_d).data[(_d).nr]) + +#define darray_push_gfp(_d, _item, _gfp) \ +({ \ + int _ret = darray_make_room_gfp((_d), 1, _gfp); \ + \ + if (!_ret) \ + (_d)->data[(_d)->nr++] = (_item); \ + _ret; \ +}) + +#define darray_push(_d, _item) darray_push_gfp(_d, _item, GFP_KERNEL) + +#define darray_pop(_d) ((_d)->data[--(_d)->nr]) + +#define darray_first(_d) ((_d).data[0]) +#define darray_last(_d) ((_d).data[(_d).nr - 1]) + +#define darray_insert_item(_d, _pos, _item) \ +({ \ + size_t pos = (_pos); \ + int _ret = darray_make_room((_d), 1); \ + \ + if (!_ret) \ + array_insert_item((_d)->data, (_d)->nr, pos, (_item)); \ + _ret; \ +}) + +#define darray_for_each(_d, _i) \ + for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++) + +#define darray_init(_d) \ +do { \ + (_d)->data = NULL; \ + (_d)->nr = (_d)->size = 0; \ +} while (0) + +#define darray_exit(_d) \ +do { \ + kfree((_d)->data); \ + darray_init(_d); \ +} while (0) + +#endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c new file mode 100644 index 000000000..cfc624463 --- /dev/null +++ b/fs/bcachefs/data_update.c @@ -0,0 +1,562 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "data_update.h" +#include "ec.h" +#include "error.h" +#include "extents.h" +#include "io.h" +#include "keylist.h" +#include "move.h" +#include "nocow_locking.h" +#include "subvolume.h" +#include "trace.h" + +static void trace_move_extent_finish2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_finish_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_finish(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void trace_move_extent_fail2(struct data_update *m, + struct bkey_s_c new, + struct bkey_s_c wrote, + struct bkey_i *insert, + const char *msg) +{ + struct bch_fs *c = m->op.c; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); + const union bch_extent_entry *entry; + struct bch_extent_ptr *ptr; + struct extent_ptr_decoded p; + struct printbuf buf = PRINTBUF; + unsigned i, rewrites_found = 0; + + if (!trace_move_extent_fail_enabled()) + return; + + prt_str(&buf, msg); + + if (insert) { + i = 0; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry) { + struct bkey_s new_s; + new_s.k = (void *) new.k; + new_s.v = (void *) new.v; + + if (((1U << i) & m->data_opts.rewrite_ptrs) && + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) + rewrites_found |= 1U << i; + i++; + } + } + + prt_printf(&buf, "\nrewrite ptrs: %u%u%u%u", + (m->data_opts.rewrite_ptrs & (1 << 0)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 1)) != 0, + 
(m->data_opts.rewrite_ptrs & (1 << 2)) != 0, + (m->data_opts.rewrite_ptrs & (1 << 3)) != 0); + + prt_printf(&buf, "\nrewrites found: %u%u%u%u", + (rewrites_found & (1 << 0)) != 0, + (rewrites_found & (1 << 1)) != 0, + (rewrites_found & (1 << 2)) != 0, + (rewrites_found & (1 << 3)) != 0); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, new); + + prt_str(&buf, "\nwrote: "); + bch2_bkey_val_to_text(&buf, c, wrote); + + if (insert) { + prt_str(&buf, "\ninsert: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + } + + trace_move_extent_fail(c, buf.buf); + printbuf_exit(&buf); +} + +static int __bch2_data_update_index_update(struct btree_trans *trans, + struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_iter iter; + struct data_update *m = + container_of(op, struct data_update, op); + struct keylist *keys = &op->insert_keys; + struct bkey_buf _new, _insert; + int ret = 0; + + bch2_bkey_buf_init(&_new); + bch2_bkey_buf_init(&_insert); + bch2_bkey_buf_realloc(&_insert, c, U8_MAX); + + bch2_trans_iter_init(trans, &iter, m->btree_id, + bkey_start_pos(&bch2_keylist_front(keys)->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + while (1) { + struct bkey_s_c k; + struct bkey_s_c old = bkey_i_to_s_c(m->k.k); + struct bkey_i *insert = NULL; + struct bkey_i_extent *new; + const union bch_extent_entry *entry_c; + union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_extent_ptr *ptr; + const struct bch_extent_ptr *ptr_c; + struct bpos next_pos; + bool should_check_enospc; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + unsigned rewrites_found = 0, durability, i; + + bch2_trans_begin(trans); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + new = bkey_i_to_extent(bch2_keylist_front(keys)); + + if (!bch2_extents_match(k, old)) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), + NULL, "no match:"); + goto nowork; + } + + bkey_reassemble(_insert.k, k); + insert = _insert.k; + + bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); + new = bkey_i_to_extent(_new.k); + bch2_cut_front(iter.pos, &new->k_i); + + bch2_cut_front(iter.pos, insert); + bch2_cut_back(new->k.p, insert); + bch2_cut_back(insert->k.p, &new->k_i); + + /* + * @old: extent that we read from + * @insert: key that we're going to update, initialized from + * extent currently in btree - same as @old unless we raced with + * other updates + * @new: extent with new pointers that we'll be adding to @insert + * + * Fist, drop rewrite_ptrs from @new: + */ + i = 0; + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs_c(old), p, entry_c) { + if (((1U << i) & m->data_opts.rewrite_ptrs) && + (ptr = bch2_extent_has_ptr(old, p, bkey_i_to_s(insert))) && + !ptr->cached) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); + /* + * See comment below: + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), ptr); + */ + rewrites_found |= 1U << i; + } + i++; + } + + if (m->data_opts.rewrite_ptrs && + !rewrites_found && + bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); + goto nowork; + } + + /* + * A replica that we just wrote might conflict with a replica + * that we want to keep, due to racing with another move: + */ +restart_drop_conflicting_replicas: + extent_for_each_ptr(extent_i_to_s(new), ptr) + if ((ptr_c = bch2_bkey_has_device_c(bkey_i_to_s_c(insert), ptr->dev)) && + 
!ptr_c->cached) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(&new->k_i), ptr); + goto restart_drop_conflicting_replicas; + } + + if (!bkey_val_u64s(&new->k)) { + trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); + goto nowork; + } + + /* Now, drop pointers that conflict with what we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + if ((ptr = bch2_bkey_has_device(bkey_i_to_s(insert), p.ptr.dev))) + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), ptr); + + durability = bch2_bkey_durability(c, bkey_i_to_s_c(insert)) + + bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); + + /* Now, drop excess replicas: */ +restart_drop_extra_replicas: + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { + unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= ptr_durability; + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(insert), &entry->ptr); + /* + * Currently, we're dropping unneeded replicas + * instead of marking them as cached, since + * cached data in stripe buckets prevents them + * from being reused: + bch2_extent_ptr_set_cached(bkey_i_to_s(insert), &entry->ptr); + */ + goto restart_drop_extra_replicas; + } + } + + /* Finally, add the pointers we just wrote: */ + extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) + bch2_extent_ptr_decoded_append(insert, &p); + + bch2_bkey_narrow_crcs(insert, (struct bch_extent_crc_unpacked) { 0 }); + bch2_extent_normalize(c, bkey_i_to_s(insert)); + + ret = bch2_sum_sector_overwrites(trans, &iter, insert, + &should_check_enospc, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + goto err; + + if (disk_sectors_delta > (s64) op->res.sectors) { + ret = bch2_disk_reservation_add(c, &op->res, + disk_sectors_delta - op->res.sectors, + !should_check_enospc + ? 
BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + goto out; + } + + next_pos = insert->k.p; + + ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, bkey_start_pos(&insert->k)) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, + k.k->p, insert->k.p); + if (ret) + goto err; + + ret = bch2_trans_update(trans, &iter, insert, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, &op->res, + NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL| + m->data_opts.btree_insert_flags); + if (!ret) { + bch2_btree_iter_set_pos(&iter, next_pos); + + this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); + trace_move_extent_finish2(c, bkey_i_to_s_c(&new->k_i)); + } +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + if (ret) + break; +next: + while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { + bch2_keylist_pop_front(keys); + if (bch2_keylist_empty(keys)) + goto out; + } + continue; +nowork: + if (m->ctxt && m->ctxt->stats) { + BUG_ON(k.k->p.offset <= iter.pos.offset); + atomic64_inc(&m->ctxt->stats->keys_raced); + atomic64_add(k.k->p.offset - iter.pos.offset, + &m->ctxt->stats->sectors_raced); + } + + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]); + + bch2_btree_iter_advance(&iter); + goto next; + } +out: + bch2_trans_iter_exit(trans, &iter); + bch2_bkey_buf_exit(&_insert, c); + bch2_bkey_buf_exit(&_new, c); + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + return ret; +} + +int bch2_data_update_index_update(struct bch_write_op *op) +{ + return bch2_trans_run(op->c, __bch2_data_update_index_update(&trans, op)); +} + +void bch2_data_update_read_done(struct data_update *m, + struct bch_extent_crc_unpacked crc) +{ + /* write bio must own pages: */ + BUG_ON(!m->op.wbio.bio.bi_vcnt); + + m->op.crc = crc; + m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + + closure_call(&m->op.cl, bch2_write, NULL, NULL); +} + +void bch2_data_update_exit(struct data_update *update) +{ + struct bch_fs *c = update->op.c; + struct bkey_ptrs_c ptrs = + bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k)); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) { + if (c->opts.nocow_enabled) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), 0); + percpu_ref_put(&bch_dev_bkey_exists(c, ptr->dev)->ref); + } + + bch2_bkey_buf_exit(&update->k, c); + bch2_disk_reservation_put(c, &update->op.res); + bch2_bio_free_pages_pool(c, &update->op.wbio.bio); +} + +void bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) +{ + struct bch_fs *c = update->op.c; + struct bio *bio = &update->op.wbio.bio; + struct bkey_i_extent *e; + struct write_point *wp; + struct bch_extent_ptr *ptr; + struct closure cl; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + closure_init_stack(&cl); + bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); + + while (bio_sectors(bio)) { + unsigned sectors = bio_sectors(bio); + + bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, + BTREE_ITER_SLOTS); + ret = lockrestart_do(trans, ({ + k = bch2_btree_iter_peek_slot(&iter); + bkey_err(k); + })); + bch2_trans_iter_exit(trans, &iter); + + if (ret || !bch2_extents_match(k, bkey_i_to_s_c(update->k.k))) + break; + + e = bkey_extent_init(update->op.insert_keys.top); + e->k.p = update->op.pos; + + ret = bch2_alloc_sectors_start_trans(trans, + update->op.target, + false, + update->op.write_point, + &update->op.devs_have, + update->op.nr_replicas, + 
update->op.nr_replicas, + update->op.watermark, + 0, &cl, &wp); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) { + bch2_trans_unlock(trans); + closure_sync(&cl); + continue; + } + + if (ret) + return; + + sectors = min(sectors, wp->sectors_free); + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &update->op.open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + bio_advance(bio, sectors << 9); + update->op.pos.offset += sectors; + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + bch2_keylist_push(&update->op.insert_keys); + + ret = __bch2_data_update_index_update(trans, &update->op); + + bch2_open_buckets_put(c, &update->op.open_buckets); + + if (ret) + break; + } + + if ((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } +} + +int bch2_data_update_init(struct btree_trans *trans, + struct moving_context *ctxt, + struct data_update *m, + struct write_point_specifier wp, + struct bch_io_opts io_opts, + struct data_update_opts data_opts, + enum btree_id btree_id, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + const struct bch_extent_ptr *ptr; + unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned ptrs_locked = 0; + int ret; + + bch2_bkey_buf_init(&m->k); + bch2_bkey_buf_reassemble(&m->k, c, k); + m->btree_id = btree_id; + m->data_opts = data_opts; + + bch2_write_op_init(&m->op, c, io_opts); + m->op.pos = bkey_start_pos(k.k); + m->op.version = k.k->version; + m->op.target = data_opts.target; + m->op.write_point = wp; + m->op.nr_replicas = 0; + m->op.flags |= BCH_WRITE_PAGES_STABLE| + BCH_WRITE_PAGES_OWNED| + BCH_WRITE_DATA_ENCODED| + BCH_WRITE_MOVE| + m->data_opts.write_flags; + m->op.compression_opt = io_opts.background_compression ?: io_opts.compression; + m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; + + bkey_for_each_ptr(ptrs, ptr) + percpu_ref_get(&bch_dev_bkey_exists(c, ptr->dev)->ref); + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + bool locked; + + if (((1U << i) & m->data_opts.rewrite_ptrs)) { + BUG_ON(p.ptr.cached); + + if (crc_is_compressed(p.crc)) + reserve_sectors += k.k->size; + + m->op.nr_replicas += bch2_extent_ptr_desired_durability(c, &p); + } else if (!p.ptr.cached) { + bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); + } + + /* + * op->csum_type is normally initialized from the fs/file's + * current options - but if an extent is encrypted, we require + * that it stays encrypted: + */ + if (bch2_csum_type_is_encryption(p.crc.csum_type)) { + m->op.nonce = p.crc.nonce + p.crc.offset; + m->op.csum_type = p.crc.csum_type; + } + + if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + m->op.incompressible = true; + + if (c->opts.nocow_enabled) { + if (ctxt) { + move_ctxt_wait_event(ctxt, trans, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) || + !atomic_read(&ctxt->read_sectors)); + + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + } else { + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto err; + } + } + ptrs_locked |= (1U << i); + } + + i++; + } + + if (reserve_sectors) { + ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, 
+ m->data_opts.extra_replicas + ? 0 + : BCH_DISK_RESERVATION_NOFAIL); + if (ret) + goto err; + } + + m->op.nr_replicas += m->data_opts.extra_replicas; + m->op.nr_replicas_required = m->op.nr_replicas; + + BUG_ON(!m->op.nr_replicas); + + /* Special handling required: */ + if (bkey_extent_is_unwritten(k)) + return -BCH_ERR_unwritten_extent_update; + return 0; +err: + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if ((1U << i) & ptrs_locked) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, &p.ptr), 0); + percpu_ref_put(&bch_dev_bkey_exists(c, p.ptr.dev)->ref); + i++; + } + + bch2_bkey_buf_exit(&m->k, c); + bch2_bio_free_pages_pool(c, &m->op.wbio.bio); + return ret; +} + +void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned i = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if ((opts->rewrite_ptrs & (1U << i)) && ptr->cached) { + opts->kill_ptrs |= 1U << i; + opts->rewrite_ptrs ^= 1U << i; + } + + i++; + } +} diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h new file mode 100644 index 000000000..49e9055cb --- /dev/null +++ b/fs/bcachefs/data_update.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BCACHEFS_DATA_UPDATE_H +#define _BCACHEFS_DATA_UPDATE_H + +#include "bkey_buf.h" +#include "io_types.h" + +struct moving_context; + +struct data_update_opts { + unsigned rewrite_ptrs; + unsigned kill_ptrs; + u16 target; + u8 extra_replicas; + unsigned btree_insert_flags; + unsigned write_flags; +}; + +struct data_update { + /* extent being updated: */ + enum btree_id btree_id; + struct bkey_buf k; + struct data_update_opts data_opts; + struct moving_context *ctxt; + struct bch_write_op op; +}; + +int bch2_data_update_index_update(struct bch_write_op *); + +void bch2_data_update_read_done(struct data_update *, + struct bch_extent_crc_unpacked); + +void bch2_data_update_exit(struct data_update *); +void bch2_update_unwritten_extent(struct btree_trans *, struct data_update *); +int bch2_data_update_init(struct btree_trans *, struct moving_context *, + struct data_update *, + struct write_point_specifier, + struct bch_io_opts, struct data_update_opts, + enum btree_id, struct bkey_s_c); +void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); + +#endif /* _BCACHEFS_DATA_UPDATE_H */ diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c new file mode 100644 index 000000000..ae47e1854 --- /dev/null +++ b/fs/bcachefs/debug.c @@ -0,0 +1,957 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Assorted bcachefs debug code + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "btree_update.h" +#include "buckets.h" +#include "debug.h" +#include "error.h" +#include "extents.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" +#include "super.h" + +#include +#include +#include +#include +#include + +static struct dentry *bch_debug; + +static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, + struct extent_ptr_decoded pick) +{ + struct btree *v = c->verify_data; + struct btree_node *n_ondisk = c->verify_ondisk; + struct btree_node *n_sorted = c->verify_data->data; + struct bset *sorted, *inmemory = &b->data->keys; + struct bch_dev *ca = bch_dev_bkey_exists(c, pick.ptr.dev); + struct bio *bio; + bool failed = false, saw_error = false; + + if (!bch2_dev_get_ioref(ca, READ)) + return false; + + bio = bio_alloc_bioset(ca->disk_sb.bdev, + buf_pages(n_sorted, btree_bytes(c)), + REQ_OP_READ|REQ_META, + GFP_NOFS, + &c->btree_bio); + bio->bi_iter.bi_sector = pick.ptr.offset; + bch2_bio_map(bio, n_sorted, btree_bytes(c)); + + submit_bio_wait(bio); + + bio_put(bio); + percpu_ref_put(&ca->io_ref); + + memcpy(n_ondisk, n_sorted, btree_bytes(c)); + + v->written = 0; + if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) + return false; + + n_sorted = c->verify_data->data; + sorted = &n_sorted->keys; + + if (inmemory->u64s != sorted->u64s || + memcmp(inmemory->start, + sorted->start, + vstruct_end(inmemory) - (void *) inmemory->start)) { + unsigned offset = 0, sectors; + struct bset *i; + unsigned j; + + console_lock(); + + printk(KERN_ERR "*** in memory:\n"); + bch2_dump_bset(c, b, inmemory, 0); + + printk(KERN_ERR "*** read back in:\n"); + bch2_dump_bset(c, v, sorted, 0); + + while (offset < v->written) { + if (!offset) { + i = &n_ondisk->keys; + sectors = vstruct_blocks(n_ondisk, c->block_bits) << + c->block_bits; + } else { + struct btree_node_entry *bne = + (void *) n_ondisk + (offset << 9); + i = &bne->keys; + + sectors = vstruct_blocks(bne, c->block_bits) << + c->block_bits; + } + + printk(KERN_ERR "*** on disk block %u:\n", offset); + bch2_dump_bset(c, b, i, offset); + + offset += sectors; + } + + for (j = 0; j < le16_to_cpu(inmemory->u64s); j++) + if (inmemory->_data[j] != sorted->_data[j]) + break; + + console_unlock(); + bch_err(c, "verify failed at key %u", j); + + failed = true; + } + + if (v->written != b->written) { + bch_err(c, "written wrong: expected %u, got %u", + b->written, v->written); + failed = true; + } + + return failed; +} + +void __bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + struct bkey_ptrs_c ptrs; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + struct btree *v; + struct bset *inmemory = &b->data->keys; + struct bkey_packed *k; + bool failed = false; + + if (c->opts.nochanges) + return; + + bch2_btree_node_io_lock(b); + mutex_lock(&c->verify_lock); + + if (!c->verify_ondisk) { + c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + if (!c->verify_ondisk) + goto out; + } + + if (!c->verify_data) { + c->verify_data = __bch2_btree_node_mem_alloc(c); + if (!c->verify_data) + goto out; + + list_del_init(&c->verify_data->list); + } + + BUG_ON(b->nsets != 1); + + for (k = inmemory->start; k != vstruct_last(inmemory); k = bkey_p_next(k)) + if (k->type == KEY_TYPE_btree_ptr_v2) { + struct bch_btree_ptr_v2 *v = (void *) bkeyp_val(&b->format, k); + v->mem_ptr = 0; + } + + v = c->verify_data; + 
bkey_copy(&v->key, &b->key); + v->c.level = b->c.level; + v->c.btree_id = b->c.btree_id; + bch2_btree_keys_init(v); + + ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&b->key)); + bkey_for_each_ptr_decode(&b->key.k, ptrs, p, entry) + failed |= bch2_btree_verify_replica(c, b, p); + + if (failed) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + bch2_fs_fatal_error(c, "btree node verify failed for : %s\n", buf.buf); + printbuf_exit(&buf); + } +out: + mutex_unlock(&c->verify_lock); + bch2_btree_node_io_unlock(b); +} + +void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, + const struct btree *b) +{ + struct btree_node *n_ondisk = NULL; + struct extent_ptr_decoded pick; + struct bch_dev *ca; + struct bio *bio = NULL; + unsigned offset = 0; + int ret; + + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { + prt_printf(out, "error getting device to read from: invalid device\n"); + return; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + if (!bch2_dev_get_ioref(ca, READ)) { + prt_printf(out, "error getting device to read from: not online\n"); + return; + } + + n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL); + if (!n_ondisk) { + prt_printf(out, "memory allocation failure\n"); + goto out; + } + + bio = bio_alloc_bioset(ca->disk_sb.bdev, + buf_pages(n_ondisk, btree_bytes(c)), + REQ_OP_READ|REQ_META, + GFP_NOFS, + &c->btree_bio); + bio->bi_iter.bi_sector = pick.ptr.offset; + bch2_bio_map(bio, n_ondisk, btree_bytes(c)); + + ret = submit_bio_wait(bio); + if (ret) { + prt_printf(out, "IO error reading btree node: %s\n", bch2_err_str(ret)); + goto out; + } + + while (offset < btree_sectors(c)) { + struct bset *i; + struct nonce nonce; + struct bch_csum csum; + struct bkey_packed *k; + unsigned sectors; + + if (!offset) { + i = &n_ondisk->keys; + + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { + prt_printf(out, "unknown checksum type at offset %u: %llu\n", + offset, BSET_CSUM_TYPE(i)); + goto out; + } + + nonce = btree_nonce(i, offset << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, n_ondisk); + + if (bch2_crc_cmp(csum, n_ondisk->csum)) { + prt_printf(out, "invalid checksum\n"); + goto out; + } + + bset_encrypt(c, i, offset << 9); + + sectors = vstruct_sectors(n_ondisk, c->block_bits); + } else { + struct btree_node_entry *bne = (void *) n_ondisk + (offset << 9); + + i = &bne->keys; + + if (i->seq != n_ondisk->keys.seq) + break; + + if (!bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i))) { + prt_printf(out, "unknown checksum type at offset %u: %llu\n", + offset, BSET_CSUM_TYPE(i)); + goto out; + } + + nonce = btree_nonce(i, offset << 9); + csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + + if (bch2_crc_cmp(csum, bne->csum)) { + prt_printf(out, "invalid checksum"); + goto out; + } + + bset_encrypt(c, i, offset << 9); + + sectors = vstruct_sectors(bne, c->block_bits); + } + + prt_printf(out, " offset %u version %u, journal seq %llu\n", + offset, + le16_to_cpu(i->version), + le64_to_cpu(i->journal_seq)); + offset += sectors; + + printbuf_indent_add(out, 4); + + for (k = i->start; k != vstruct_last(i); k = bkey_p_next(k)) { + struct bkey u; + + bch2_bkey_val_to_text(out, c, bkey_disassemble(b, k, &u)); + prt_newline(out); + } + + printbuf_indent_sub(out, 4); + } +out: + if (bio) + bio_put(bio); + kvpfree(n_ondisk, btree_bytes(c)); + percpu_ref_put(&ca->io_ref); +} + +#ifdef CONFIG_DEBUG_FS + +/* XXX: bch_fs refcounting */ + +struct dump_iter { + struct bch_fs *c; + enum btree_id id; + struct 
bpos from; + struct bpos prev_node; + u64 iter; + + struct printbuf buf; + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ + ssize_t ret; /* bytes read so far */ +}; + +static ssize_t flush_buf(struct dump_iter *i) +{ + if (i->buf.pos) { + size_t bytes = min_t(size_t, i->buf.pos, i->size); + int err = copy_to_user(i->ubuf, i->buf.buf, bytes); + + if (err) + return err; + + i->ret += bytes; + i->ubuf += bytes; + i->size -= bytes; + i->buf.pos -= bytes; + memmove(i->buf.buf, i->buf.buf + bytes, i->buf.pos); + } + + return i->size ? 0 : i->ret; +} + +static int bch2_dump_open(struct inode *inode, struct file *file) +{ + struct btree_debug *bd = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + if (!i) + return -ENOMEM; + + file->private_data = i; + i->from = POS_MIN; + i->iter = 0; + i->c = container_of(bd, struct bch_fs, btree_debug[bd->id]); + i->id = bd->id; + i->buf = PRINTBUF; + + return 0; +} + +static int bch2_dump_release(struct inode *inode, struct file *file) +{ + struct dump_iter *i = file->private_data; + + printbuf_exit(&i->buf); + kfree(i); + return 0; +} + +static ssize_t bch2_read_btree(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + ssize_t ret; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + ret = flush_buf(i); + if (ret) + return ret; + + bch2_trans_init(&trans, i->c, 0, 0); + ret = for_each_btree_key2(&trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + bch2_bkey_val_to_text(&i->buf, i->c, k); + prt_newline(&i->buf); + drop_locks_do(&trans, flush_buf(i)); + })); + i->from = iter.pos; + + bch2_trans_exit(&trans); + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_btree, +}; + +static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_trans trans; + struct btree_iter iter; + struct btree *b; + ssize_t ret; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + ret = flush_buf(i); + if (ret) + return ret; + + if (bpos_eq(SPOS_MAX, i->from)) + return i->ret; + + bch2_trans_init(&trans, i->c, 0, 0); +retry: + bch2_trans_begin(&trans); + + for_each_btree_node(&trans, iter, i->id, i->from, 0, b, ret) { + bch2_btree_node_to_text(&i->buf, i->c, b); + i->from = !bpos_eq(SPOS_MAX, b->key.k.p) + ? 
bpos_successor(b->key.k.p) + : b->key.k.p; + + ret = drop_locks_do(&trans, flush_buf(i)); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_format_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_btree_formats, +}; + +static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + ssize_t ret; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + ret = flush_buf(i); + if (ret) + return ret; + + bch2_trans_init(&trans, i->c, 0, 0); + + ret = for_each_btree_key2(&trans, iter, i->id, i->from, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ({ + struct btree_path_level *l = &iter.path->l[0]; + struct bkey_packed *_k = + bch2_btree_node_iter_peek(&l->iter, l->b); + + if (bpos_gt(l->b->key.k.p, i->prev_node)) { + bch2_btree_node_to_text(&i->buf, i->c, l->b); + i->prev_node = l->b->key.k.p; + } + + bch2_bfloat_to_text(&i->buf, l->b, _k); + drop_locks_do(&trans, flush_buf(i)); + })); + i->from = iter.pos; + + bch2_trans_exit(&trans); + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations bfloat_failed_debug_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_read_bfloat_failed, +}; + +static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs *c, + struct btree *b) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_printf(out, "%px btree=%s l=%u ", + b, + bch2_btree_ids[b->c.btree_id], + b->c.level); + prt_newline(out); + + printbuf_indent_add(out, 2); + + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(&b->key)); + prt_newline(out); + + prt_printf(out, "flags: "); + prt_tab(out); + prt_bitflags(out, bch2_btree_node_flags, b->flags); + prt_newline(out); + + prt_printf(out, "pcpu read locks: "); + prt_tab(out); + prt_printf(out, "%u", b->c.lock.readers != NULL); + prt_newline(out); + + prt_printf(out, "written:"); + prt_tab(out); + prt_printf(out, "%u", b->written); + prt_newline(out); + + prt_printf(out, "writes blocked:"); + prt_tab(out); + prt_printf(out, "%u", !list_empty_careful(&b->write_blocked)); + prt_newline(out); + + prt_printf(out, "will make reachable:"); + prt_tab(out); + prt_printf(out, "%lx", b->will_make_reachable); + prt_newline(out); + + prt_printf(out, "journal pin %px:", &b->writes[0].journal); + prt_tab(out); + prt_printf(out, "%llu", b->writes[0].journal.seq); + prt_newline(out); + + prt_printf(out, "journal pin %px:", &b->writes[1].journal); + prt_tab(out); + prt_printf(out, "%llu", b->writes[1].journal.seq); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { + struct bucket_table *tbl; + struct rhash_head *pos; + struct btree *b; + + ret = flush_buf(i); + if (ret) + return ret; + + rcu_read_lock(); + i->buf.atomic++; + tbl = rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); + 
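/* One hash table bucket per pass: dump every cached btree node in bucket i->iter under RCU, flushing accumulated output to userspace between passes */ +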
if (i->iter < tbl->size) { + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); + i->iter++; + } else { + done = true; + } + --i->buf.atomic; + rcu_read_unlock(); + } while (!done); + + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations cached_btree_nodes_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_cached_btree_nodes_read, +}; + +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS +static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + struct btree_trans *trans; + ssize_t ret = 0; + u32 seq; + + i->ubuf = buf; + i->size = size; + i->ret = 0; +restart: + seqmutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + if (trans->locking_wait.task->pid <= i->iter) + continue; + + closure_get(&trans->ref); + seq = seqmutex_seq(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); + + ret = flush_buf(i); + if (ret) { + closure_put(&trans->ref); + goto unlocked; + } + + bch2_btree_trans_to_text(&i->buf, trans); + + prt_printf(&i->buf, "backtrace:"); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); + bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task); + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + + i->iter = trans->locking_wait.task->pid; + + closure_put(&trans->ref); + + if (!seqmutex_relock(&c->btree_trans_lock, seq)) + goto restart; + } + seqmutex_unlock(&c->btree_trans_lock); +unlocked: + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_transactions_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_transactions_read, +}; +#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */ + +static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + bool done = false; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + do { + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + done = bch2_journal_seq_pins_to_text(&i->buf, &c->journal, &i->iter); + i->iter++; + } while (!done); + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations journal_pins_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_journal_pins_read, +}; + +static int lock_held_stats_open(struct inode *inode, struct file *file) +{ + struct bch_fs *c = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + + if (!i) + return -ENOMEM; + + i->iter = 0; + i->c = c; + i->buf = PRINTBUF; + file->private_data = i; + + return 0; +} + +static int lock_held_stats_release(struct inode *inode, struct file *file) +{ + struct dump_iter *i = file->private_data; + + printbuf_exit(&i->buf); + kfree(i); + + return 0; +} + +static ssize_t lock_held_stats_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + int err; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + while (1) { + struct 
btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; + + err = flush_buf(i); + if (err) + return err; + + if (!i->size) + break; + + if (i->iter == ARRAY_SIZE(bch2_btree_transaction_fns) || + !bch2_btree_transaction_fns[i->iter]) + break; + + prt_printf(&i->buf, "%s: ", bch2_btree_transaction_fns[i->iter]); + prt_newline(&i->buf); + printbuf_indent_add(&i->buf, 2); + + mutex_lock(&s->lock); + + prt_printf(&i->buf, "Max mem used: %u", s->max_mem); + prt_newline(&i->buf); + + if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) { + prt_printf(&i->buf, "Lock hold times:"); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + bch2_time_stats_to_text(&i->buf, &s->lock_hold_times); + printbuf_indent_sub(&i->buf, 2); + } + + if (s->max_paths_text) { + prt_printf(&i->buf, "Maximum allocated btree paths (%u):", s->nr_max_paths); + prt_newline(&i->buf); + + printbuf_indent_add(&i->buf, 2); + prt_str_indented(&i->buf, s->max_paths_text); + printbuf_indent_sub(&i->buf, 2); + } + + mutex_unlock(&s->lock); + + printbuf_indent_sub(&i->buf, 2); + prt_newline(&i->buf); + i->iter++; + } + + if (i->buf.allocation_failure) + return -ENOMEM; + + return i->ret; +} + +static const struct file_operations lock_held_stats_op = { + .owner = THIS_MODULE, + .open = lock_held_stats_open, + .release = lock_held_stats_release, + .read = lock_held_stats_read, +}; + +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct bch_fs *c = i->c; + struct btree_trans *trans; + ssize_t ret = 0; + u32 seq; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + if (i->iter) + goto out; +restart: + seqmutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + if (trans->locking_wait.task->pid <= i->iter) + continue; + + closure_get(&trans->ref); + seq = seqmutex_seq(&c->btree_trans_lock); + seqmutex_unlock(&c->btree_trans_lock); + + ret = flush_buf(i); + if (ret) { + closure_put(&trans->ref); + goto out; + } + + bch2_check_for_deadlock(trans, &i->buf); + + i->iter = trans->locking_wait.task->pid; + + closure_put(&trans->ref); + + if (!seqmutex_relock(&c->btree_trans_lock, seq)) + goto restart; + } + seqmutex_unlock(&c->btree_trans_lock); +out: + if (i->buf.allocation_failure) + ret = -ENOMEM; + + if (!ret) + ret = flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations btree_deadlock_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_btree_deadlock_read, +}; + +void bch2_fs_debug_exit(struct bch_fs *c) +{ + if (!IS_ERR_OR_NULL(c->fs_debug_dir)) + debugfs_remove_recursive(c->fs_debug_dir); +} + +void bch2_fs_debug_init(struct bch_fs *c) +{ + struct btree_debug *bd; + char name[100]; + + if (IS_ERR_OR_NULL(bch_debug)) + return; + + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); + if (IS_ERR_OR_NULL(c->fs_debug_dir)) + return; + + debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir, + c->btree_debug, &cached_btree_nodes_ops); + +#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS + debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir, + c->btree_debug, &btree_transactions_ops); +#endif + + debugfs_create_file("journal_pins", 0400, c->fs_debug_dir, + c->btree_debug, &journal_pins_ops); + + debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir, + c, &lock_held_stats_op); + + 
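/* btree_deadlock: dumps any lock cycle the btree deadlock checker currently finds between running transactions */ +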
debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, + c->btree_debug, &btree_deadlock_ops); + + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); + if (IS_ERR_OR_NULL(c->btree_debug_dir)) + return; + + for (bd = c->btree_debug; + bd < c->btree_debug + ARRAY_SIZE(c->btree_debug); + bd++) { + bd->id = bd - c->btree_debug; + debugfs_create_file(bch2_btree_ids[bd->id], + 0400, c->btree_debug_dir, bd, + &btree_debug_ops); + + snprintf(name, sizeof(name), "%s-formats", + bch2_btree_ids[bd->id]); + + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &btree_format_debug_ops); + + snprintf(name, sizeof(name), "%s-bfloat-failed", + bch2_btree_ids[bd->id]); + + debugfs_create_file(name, 0400, c->btree_debug_dir, bd, + &bfloat_failed_debug_ops); + } +} + +#endif + +void bch2_debug_exit(void) +{ + if (!IS_ERR_OR_NULL(bch_debug)) + debugfs_remove_recursive(bch_debug); +} + +int __init bch2_debug_init(void) +{ + int ret = 0; + + bch_debug = debugfs_create_dir("bcachefs", NULL); + return ret; +} diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h new file mode 100644 index 000000000..2c37143b5 --- /dev/null +++ b/fs/bcachefs/debug.h @@ -0,0 +1,32 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DEBUG_H +#define _BCACHEFS_DEBUG_H + +#include "bcachefs.h" + +struct bio; +struct btree; +struct bch_fs; + +void __bch2_btree_verify(struct bch_fs *, struct btree *); +void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, + const struct btree *); + +static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) +{ + if (bch2_verify_btree_ondisk) + __bch2_btree_verify(c, b); +} + +#ifdef CONFIG_DEBUG_FS +void bch2_fs_debug_exit(struct bch_fs *); +void bch2_fs_debug_init(struct bch_fs *); +#else +static inline void bch2_fs_debug_exit(struct bch_fs *c) {} +static inline void bch2_fs_debug_init(struct bch_fs *c) {} +#endif + +void bch2_debug_exit(void); +int bch2_debug_init(void); + +#endif /* _BCACHEFS_DEBUG_H */ diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c new file mode 100644 index 000000000..065ea59ee --- /dev/null +++ b/fs/bcachefs/dirent.c @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "extents.h" +#include "dirent.h" +#include "fs.h" +#include "keylist.h" +#include "str_hash.h" +#include "subvolume.h" + +#include + +unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) +{ + unsigned len = bkey_val_bytes(d.k) - + offsetof(struct bch_dirent, d_name); + + return strnlen(d.v->d_name, len); +} + +static u64 bch2_dirent_hash(const struct bch_hash_info *info, + const struct qstr *name) +{ + struct bch_str_hash_ctx ctx; + + bch2_str_hash_init(&ctx, info); + bch2_str_hash_update(&ctx, info, name->name, name->len); + + /* [0,2) reserved for dots */ + return max_t(u64, bch2_str_hash_end(&ctx, info), 2); +} + +static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) +{ + return bch2_dirent_hash(info, key); +} + +static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + struct qstr name = QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + + return bch2_dirent_hash(info, &name); +} + +static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + int len = bch2_dirent_name_bytes(l); + const struct qstr *r = _r; + + return len - r->len ?: memcmp(l.v->d_name, 
r->name, len); +} + +static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); + struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); + int l_len = bch2_dirent_name_bytes(l); + int r_len = bch2_dirent_name_bytes(r); + + return l_len - r_len ?: memcmp(l.v->d_name, r.v->d_name, l_len); +} + +static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + if (d.v->d_type == DT_SUBVOL) + return le32_to_cpu(d.v->d_parent_subvol) == inum.subvol; + return true; +} + +const struct bch_hash_desc bch2_dirent_hash_desc = { + .btree_id = BTREE_ID_dirents, + .key_type = KEY_TYPE_dirent, + .hash_key = dirent_hash_key, + .hash_bkey = dirent_hash_bkey, + .cmp_key = dirent_cmp_key, + .cmp_bkey = dirent_cmp_bkey, + .is_visible = dirent_is_visible, +}; + +int bch2_dirent_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + unsigned len; + + len = bch2_dirent_name_bytes(d); + if (!len) { + prt_printf(err, "empty name"); + return -BCH_ERR_invalid_bkey; + } + + if (bkey_val_u64s(k.k) > dirent_val_u64s(len)) { + prt_printf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), dirent_val_u64s(len)); + return -BCH_ERR_invalid_bkey; + } + + if (len > BCH_NAME_MAX) { + prt_printf(err, "dirent name too big (%u > %u)", + len, BCH_NAME_MAX); + return -BCH_ERR_invalid_bkey; + } + + if (len == 1 && !memcmp(d.v->d_name, ".", 1)) { + prt_printf(err, "invalid name"); + return -BCH_ERR_invalid_bkey; + } + + if (len == 2 && !memcmp(d.v->d_name, "..", 2)) { + prt_printf(err, "invalid name"); + return -BCH_ERR_invalid_bkey; + } + + if (memchr(d.v->d_name, '/', len)) { + prt_printf(err, "invalid name"); + return -BCH_ERR_invalid_bkey; + } + + if (d.v->d_type != DT_SUBVOL && + le64_to_cpu(d.v->d_inum) == d.k->p.inode) { + prt_printf(err, "dirent points to own directory"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + prt_printf(out, "%.*s -> %llu type %s", + bch2_dirent_name_bytes(d), + d.v->d_name, + d.v->d_type != DT_SUBVOL + ? 
le64_to_cpu(d.v->d_inum) + : le32_to_cpu(d.v->d_child_subvol), + bch2_d_type_str(d.v->d_type)); +} + +static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + subvol_inum dir, u8 type, + const struct qstr *name, u64 dst) +{ + struct bkey_i_dirent *dirent; + unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); + + if (name->len > BCH_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + BUG_ON(u64s > U8_MAX); + + dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(dirent)) + return dirent; + + bkey_dirent_init(&dirent->k_i); + dirent->k.u64s = u64s; + + if (type != DT_SUBVOL) { + dirent->v.d_inum = cpu_to_le64(dst); + } else { + dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); + dirent->v.d_child_subvol = cpu_to_le32(dst); + } + + dirent->v.d_type = type; + + memcpy(dirent->v.d_name, name->name, name->len); + memset(dirent->v.d_name + name->len, 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); + + EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len); + + return dirent; +} + +int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, + const struct bch_hash_info *hash_info, + u8 type, const struct qstr *name, u64 dst_inum, + u64 *dir_offset, int flags) +{ + struct bkey_i_dirent *dirent; + int ret; + + dirent = dirent_create_key(trans, dir, type, name, dst_inum); + ret = PTR_ERR_OR_ZERO(dirent); + if (ret) + return ret; + + ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info, + dir, &dirent->k_i, flags); + *dir_offset = dirent->k.p.offset; + + return ret; +} + +static void dirent_copy_target(struct bkey_i_dirent *dst, + struct bkey_s_c_dirent src) +{ + dst->v.d_inum = src.v->d_inum; + dst->v.d_type = src.v->d_type; +} + +int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, + struct bkey_s_c_dirent d, subvol_inum *target) +{ + struct bch_subvolume s; + int ret = 0; + + if (d.v->d_type == DT_SUBVOL && + le32_to_cpu(d.v->d_parent_subvol) != dir.subvol) + return 1; + + if (likely(d.v->d_type != DT_SUBVOL)) { + target->subvol = dir.subvol; + target->inum = le64_to_cpu(d.v->d_inum); + } else { + target->subvol = le32_to_cpu(d.v->d_child_subvol); + + ret = bch2_subvolume_get(trans, target->subvol, true, BTREE_ITER_CACHED, &s); + + target->inum = le64_to_cpu(s.inode); + } + + return ret; +} + +int bch2_dirent_rename(struct btree_trans *trans, + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, + const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, + const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, + enum bch_rename_mode mode) +{ + struct btree_iter src_iter = { NULL }; + struct btree_iter dst_iter = { NULL }; + struct bkey_s_c old_src, old_dst = bkey_s_c_null; + struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; + struct bpos dst_pos = + POS(dst_dir.inum, bch2_dirent_hash(dst_hash, dst_name)); + unsigned src_type = 0, dst_type = 0, src_update_flags = 0; + int ret = 0; + + if (src_dir.subvol != dst_dir.subvol) + return -EXDEV; + + memset(src_inum, 0, sizeof(*src_inum)); + memset(dst_inum, 0, sizeof(*dst_inum)); + + /* Lookup src: */ + ret = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, + src_hash, src_dir, src_name, + BTREE_ITER_INTENT); + if (ret) + goto out; + + old_src = bch2_btree_iter_peek_slot(&src_iter); + ret = bkey_err(old_src); + if (ret) + goto out; + + ret = bch2_dirent_read_target(trans, src_dir, + bkey_s_c_to_dirent(old_src), src_inum); + if (ret) + goto 
out; + + src_type = bkey_s_c_to_dirent(old_src).v->d_type; + + if (src_type == DT_SUBVOL && mode == BCH_RENAME_EXCHANGE) + return -EOPNOTSUPP; + + + /* Lookup dst: */ + if (mode == BCH_RENAME) { + /* + * Note that we're _not_ checking if the target already exists - + * we're relying on the VFS to do that check for us for + * correctness: + */ + ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name); + if (ret) + goto out; + } else { + ret = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, + dst_hash, dst_dir, dst_name, + BTREE_ITER_INTENT); + if (ret) + goto out; + + old_dst = bch2_btree_iter_peek_slot(&dst_iter); + ret = bkey_err(old_dst); + if (ret) + goto out; + + ret = bch2_dirent_read_target(trans, dst_dir, + bkey_s_c_to_dirent(old_dst), dst_inum); + if (ret) + goto out; + + dst_type = bkey_s_c_to_dirent(old_dst).v->d_type; + + if (dst_type == DT_SUBVOL) + return -EOPNOTSUPP; + } + + if (mode != BCH_RENAME_EXCHANGE) + *src_offset = dst_iter.pos.offset; + + /* Create new dst key: */ + new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); + ret = PTR_ERR_OR_ZERO(new_dst); + if (ret) + goto out; + + dirent_copy_target(new_dst, bkey_s_c_to_dirent(old_src)); + new_dst->k.p = dst_iter.pos; + + /* Create new src key: */ + if (mode == BCH_RENAME_EXCHANGE) { + new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; + + dirent_copy_target(new_src, bkey_s_c_to_dirent(old_dst)); + new_src->k.p = src_iter.pos; + } else { + new_src = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + ret = PTR_ERR_OR_ZERO(new_src); + if (ret) + goto out; + + bkey_init(&new_src->k); + new_src->k.p = src_iter.pos; + + if (bkey_le(dst_pos, src_iter.pos) && + bkey_lt(src_iter.pos, dst_iter.pos)) { + /* + * We have a hash collision for the new dst key, + * and new_src - the key we're deleting - is between + * new_dst's hashed slot and the slot we're going to be + * inserting it into - oops. 
This will break the hash + * table if we don't deal with it: + */ + if (mode == BCH_RENAME) { + /* + * If we're not overwriting, we can just insert + * new_dst at the src position: + */ + new_src = new_dst; + new_src->k.p = src_iter.pos; + goto out_set_src; + } else { + /* If we're overwriting, we can't insert new_dst + * at a different slot because it has to + * overwrite old_dst - just make sure to use a + * whiteout when deleting src: + */ + new_src->k.type = KEY_TYPE_hash_whiteout; + } + } else { + /* Check if we need a whiteout to delete src: */ + ret = bch2_hash_needs_whiteout(trans, bch2_dirent_hash_desc, + src_hash, &src_iter); + if (ret < 0) + goto out; + + if (ret) + new_src->k.type = KEY_TYPE_hash_whiteout; + } + } + + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); + if (ret) + goto out; +out_set_src: + + /* + * If we're deleting a subvolume, we need to really delete the dirent, + * not just emit a whiteout in the current snapshot: + */ + if (src_type == DT_SUBVOL) { + bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(&src_iter); + if (ret) + goto out; + + new_src->k.p = src_iter.pos; + src_update_flags |= BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE; + } + + ret = bch2_trans_update(trans, &src_iter, &new_src->k_i, src_update_flags); + if (ret) + goto out; + + if (mode == BCH_RENAME_EXCHANGE) + *src_offset = new_src->k.p.offset; + *dst_offset = new_dst->k.p.offset; +out: + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; +} + +int __bch2_dirent_lookup_trans(struct btree_trans *trans, + struct btree_iter *iter, + subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum, + unsigned flags) +{ + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + + ret = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, + hash_info, dir, name, flags); + if (ret) + return ret; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + goto err; + + d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, inum); + if (ret > 0) + ret = -ENOENT; +err: + if (ret) + bch2_trans_iter_exit(trans, iter); + + return ret; +} + +u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, + const struct bch_hash_info *hash_info, + const struct qstr *name, subvol_inum *inum) +{ + struct btree_trans trans; + struct btree_iter iter; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = __bch2_dirent_lookup_trans(&trans, &iter, dir, hash_info, + name, inum, 0); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (!ret) + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +int bch2_empty_dir_trans(struct btree_trans *trans, subvol_inum dir) +{ + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_upto_norestart(trans, iter, BTREE_ID_dirents, + SPOS(dir.inum, 0, snapshot), + POS(dir.inum, U64_MAX), 0, k, ret) + if (k.k->type == KEY_TYPE_dirent) { + ret = -ENOTEMPTY; + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + 
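/* sk: scratch copy of the current dirent, taken so btree locks can be dropped before dir_emit(), which may fault */ +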
struct bkey_s_c_dirent dirent; + subvol_inum target; + u32 snapshot; + struct bkey_buf sk; + int ret; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_dirents, + SPOS(inum.inum, ctx->pos, snapshot), + POS(inum.inum, U64_MAX), 0, k, ret) { + if (k.k->type != KEY_TYPE_dirent) + continue; + + dirent = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(&trans, inum, dirent, &target); + if (ret < 0) + break; + if (ret) + continue; + + /* dir_emit() can fault and block: */ + bch2_bkey_buf_reassemble(&sk, c, k); + dirent = bkey_i_to_s_c_dirent(sk.k); + bch2_trans_unlock(&trans); + + ctx->pos = dirent.k->p.offset; + if (!dir_emit(ctx, dirent.v->d_name, + bch2_dirent_name_bytes(dirent), + target.inum, + vfs_d_type(dirent.v->d_type))) + break; + ctx->pos = dirent.k->p.offset + 1; + + /* + * read_target looks up subvolumes, we can overflow paths if the + * directory has many subvolumes in it + */ + ret = btree_trans_too_many_iters(&trans); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h new file mode 100644 index 000000000..b42f4a13b --- /dev/null +++ b/fs/bcachefs/dirent.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DIRENT_H +#define _BCACHEFS_DIRENT_H + +#include "str_hash.h" + +enum bkey_invalid_flags; +extern const struct bch_hash_desc bch2_dirent_hash_desc; + +int bch2_dirent_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_dirent_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_dirent ((struct bkey_ops) { \ + .key_invalid = bch2_dirent_invalid, \ + .val_to_text = bch2_dirent_to_text, \ + .min_val_size = 16, \ +}) + +struct qstr; +struct file; +struct dir_context; +struct bch_fs; +struct bch_hash_info; +struct bch_inode_info; + +unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent); + +static inline unsigned dirent_val_u64s(unsigned len) +{ + return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, + sizeof(u64)); +} + +int bch2_dirent_read_target(struct btree_trans *, subvol_inum, + struct bkey_s_c_dirent, subvol_inum *); + +int bch2_dirent_create(struct btree_trans *, subvol_inum, + const struct bch_hash_info *, u8, + const struct qstr *, u64, u64 *, int); + +static inline unsigned vfs_d_type(unsigned type) +{ + return type == DT_SUBVOL ? 
DT_DIR : type; +} + +enum bch_rename_mode { + BCH_RENAME, + BCH_RENAME_OVERWRITE, + BCH_RENAME_EXCHANGE, +}; + +int bch2_dirent_rename(struct btree_trans *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, + const struct qstr *, subvol_inum *, u64 *, + const struct qstr *, subvol_inum *, u64 *, + enum bch_rename_mode); + +int __bch2_dirent_lookup_trans(struct btree_trans *, struct btree_iter *, + subvol_inum, const struct bch_hash_info *, + const struct qstr *, subvol_inum *, unsigned); +u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, + const struct bch_hash_info *, + const struct qstr *, subvol_inum *); + +int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); +int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); + +#endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c new file mode 100644 index 000000000..de14ca3a9 --- /dev/null +++ b/fs/bcachefs/disk_groups.c @@ -0,0 +1,555 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "disk_groups.h" +#include "super-io.h" + +#include + +static int group_cmp(const void *_l, const void *_r) +{ + const struct bch_disk_group *l = _l; + const struct bch_disk_group *r = _r; + + return ((BCH_GROUP_DELETED(l) > BCH_GROUP_DELETED(r)) - + (BCH_GROUP_DELETED(l) < BCH_GROUP_DELETED(r))) ?: + ((BCH_GROUP_PARENT(l) > BCH_GROUP_PARENT(r)) - + (BCH_GROUP_PARENT(l) < BCH_GROUP_PARENT(r))) ?: + strncmp(l->label, r->label, sizeof(l->label)); +} + +static int bch2_sb_disk_groups_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g, *sorted = NULL; + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + unsigned nr_groups = disk_groups_nr(groups); + unsigned i, len; + int ret = 0; + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + unsigned g; + + if (!BCH_MEMBER_GROUP(m)) + continue; + + g = BCH_MEMBER_GROUP(m) - 1; + + if (g >= nr_groups) { + prt_printf(err, "disk %u has invalid label %u (have %u)", + i, g, nr_groups); + return -BCH_ERR_invalid_sb_disk_groups; + } + + if (BCH_GROUP_DELETED(&groups->entries[g])) { + prt_printf(err, "disk %u has deleted label %u", i, g); + return -BCH_ERR_invalid_sb_disk_groups; + } + } + + if (!nr_groups) + return 0; + + for (i = 0; i < nr_groups; i++) { + g = groups->entries + i; + + if (BCH_GROUP_DELETED(g)) + continue; + + len = strnlen(g->label, sizeof(g->label)); + if (!len) { + prt_printf(err, "label %u empty", i); + return -BCH_ERR_invalid_sb_disk_groups; + } + } + + sorted = kmalloc_array(nr_groups, sizeof(*sorted), GFP_KERNEL); + if (!sorted) + return -BCH_ERR_ENOMEM_disk_groups_validate; + + memcpy(sorted, groups->entries, nr_groups * sizeof(*sorted)); + sort(sorted, nr_groups, sizeof(*sorted), group_cmp, NULL); + + for (g = sorted; g + 1 < sorted + nr_groups; g++) + if (!BCH_GROUP_DELETED(g) && + !group_cmp(&g[0], &g[1])) { + prt_printf(err, "duplicate label %llu.%.*s", + BCH_GROUP_PARENT(g), + (int) sizeof(g->label), g->label); + ret = -BCH_ERR_invalid_sb_disk_groups; + goto err; + } +err: + kfree(sorted); + return ret; +} + +void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_disk_groups_cpu *g; + struct bch_dev *ca; + int i; + unsigned iter; + + out->atomic++; + rcu_read_lock(); + + g = rcu_dereference(c->disk_groups); + if (!g) + goto out; + + for (i = 0; i < g->nr; i++) { + if (i) + 
prt_printf(out, " "); + + if (g->entries[i].deleted) { + prt_printf(out, "[deleted]"); + continue; + } + + prt_printf(out, "[parent %d devs", g->entries[i].parent); + for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs) + prt_printf(out, " %s", ca->name); + prt_printf(out, "]"); + } + +out: + rcu_read_unlock(); + out->atomic--; +} + +static void bch2_sb_disk_groups_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_disk_groups *groups = + field_to_type(f, disk_groups); + struct bch_disk_group *g; + unsigned nr_groups = disk_groups_nr(groups); + + for (g = groups->entries; + g < groups->entries + nr_groups; + g++) { + if (g != groups->entries) + prt_printf(out, " "); + + if (BCH_GROUP_DELETED(g)) + prt_printf(out, "[deleted]"); + else + prt_printf(out, "[parent %llu name %s]", + BCH_GROUP_PARENT(g), g->label); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_disk_groups = { + .validate = bch2_sb_disk_groups_validate, + .to_text = bch2_sb_disk_groups_to_text +}; + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) +{ + struct bch_sb_field_members *mi; + struct bch_sb_field_disk_groups *groups; + struct bch_disk_groups_cpu *cpu_g, *old_g; + unsigned i, g, nr_groups; + + lockdep_assert_held(&c->sb_lock); + + mi = bch2_sb_get_members(c->disk_sb.sb); + groups = bch2_sb_get_disk_groups(c->disk_sb.sb); + nr_groups = disk_groups_nr(groups); + + if (!groups) + return 0; + + cpu_g = kzalloc(sizeof(*cpu_g) + + sizeof(cpu_g->entries[0]) * nr_groups, GFP_KERNEL); + if (!cpu_g) + return -BCH_ERR_ENOMEM_disk_groups_to_cpu; + + cpu_g->nr = nr_groups; + + for (i = 0; i < nr_groups; i++) { + struct bch_disk_group *src = &groups->entries[i]; + struct bch_disk_group_cpu *dst = &cpu_g->entries[i]; + + dst->deleted = BCH_GROUP_DELETED(src); + dst->parent = BCH_GROUP_PARENT(src); + } + + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + struct bch_disk_group_cpu *dst = + &cpu_g->entries[BCH_MEMBER_GROUP(m)]; + + if (!bch2_member_exists(m)) + continue; + + g = BCH_MEMBER_GROUP(m); + while (g) { + dst = &cpu_g->entries[g - 1]; + __set_bit(i, dst->devs.d); + g = dst->parent; + } + } + + old_g = rcu_dereference_protected(c->disk_groups, + lockdep_is_held(&c->sb_lock)); + rcu_assign_pointer(c->disk_groups, cpu_g); + if (old_g) + kfree_rcu(old_g, rcu); + + return 0; +} + +const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) +{ + struct target t = target_decode(target); + struct bch_devs_mask *devs; + + rcu_read_lock(); + + switch (t.type) { + case TARGET_NULL: + devs = NULL; + break; + case TARGET_DEV: { + struct bch_dev *ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + devs = ca ? &ca->self : NULL; + break; + } + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + devs = g && t.group < g->nr && !g->entries[t.group].deleted + ? &g->entries[t.group].devs + : NULL; + break; + } + default: + BUG(); + } + + rcu_read_unlock(); + + return devs; +} + +bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) +{ + struct target t = target_decode(target); + + switch (t.type) { + case TARGET_NULL: + return false; + case TARGET_DEV: + return dev == t.dev; + case TARGET_GROUP: { + struct bch_disk_groups_cpu *g; + const struct bch_devs_mask *m; + bool ret; + + rcu_read_lock(); + g = rcu_dereference(c->disk_groups); + m = g && t.group < g->nr && !g->entries[t.group].deleted + ? 
&g->entries[t.group].devs + : NULL; + + ret = m ? test_bit(dev, m->d) : false; + rcu_read_unlock(); + + return ret; + } + default: + BUG(); + } +} + +static int __bch2_disk_group_find(struct bch_sb_field_disk_groups *groups, + unsigned parent, + const char *name, unsigned namelen) +{ + unsigned i, nr_groups = disk_groups_nr(groups); + + if (!namelen || namelen > BCH_SB_LABEL_SIZE) + return -EINVAL; + + for (i = 0; i < nr_groups; i++) { + struct bch_disk_group *g = groups->entries + i; + + if (BCH_GROUP_DELETED(g)) + continue; + + if (!BCH_GROUP_DELETED(g) && + BCH_GROUP_PARENT(g) == parent && + strnlen(g->label, sizeof(g->label)) == namelen && + !memcmp(name, g->label, namelen)) + return i; + } + + return -1; +} + +static int __bch2_disk_group_add(struct bch_sb_handle *sb, unsigned parent, + const char *name, unsigned namelen) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_get_disk_groups(sb->sb); + unsigned i, nr_groups = disk_groups_nr(groups); + struct bch_disk_group *g; + + if (!namelen || namelen > BCH_SB_LABEL_SIZE) + return -EINVAL; + + for (i = 0; + i < nr_groups && !BCH_GROUP_DELETED(&groups->entries[i]); + i++) + ; + + if (i == nr_groups) { + unsigned u64s = + (sizeof(struct bch_sb_field_disk_groups) + + sizeof(struct bch_disk_group) * (nr_groups + 1)) / + sizeof(u64); + + groups = bch2_sb_resize_disk_groups(sb, u64s); + if (!groups) + return -BCH_ERR_ENOSPC_disk_label_add; + + nr_groups = disk_groups_nr(groups); + } + + BUG_ON(i >= nr_groups); + + g = &groups->entries[i]; + + memcpy(g->label, name, namelen); + if (namelen < sizeof(g->label)) + g->label[namelen] = '\0'; + SET_BCH_GROUP_DELETED(g, 0); + SET_BCH_GROUP_PARENT(g, parent); + SET_BCH_GROUP_DATA_ALLOWED(g, ~0); + + return i; +} + +int bch2_disk_path_find(struct bch_sb_handle *sb, const char *name) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_get_disk_groups(sb->sb); + int v = -1; + + do { + const char *next = strchrnul(name, '.'); + unsigned len = next - name; + + if (*next == '.') + next++; + + v = __bch2_disk_group_find(groups, v + 1, name, len); + name = next; + } while (*name && v >= 0); + + return v; +} + +int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) +{ + struct bch_sb_field_disk_groups *groups; + unsigned parent = 0; + int v = -1; + + do { + const char *next = strchrnul(name, '.'); + unsigned len = next - name; + + if (*next == '.') + next++; + + groups = bch2_sb_get_disk_groups(sb->sb); + + v = __bch2_disk_group_find(groups, parent, name, len); + if (v < 0) + v = __bch2_disk_group_add(sb, parent, name, len); + if (v < 0) + return v; + + parent = v + 1; + name = next; + } while (*name && v >= 0); + + return v; +} + +void bch2_disk_path_to_text(struct printbuf *out, struct bch_sb *sb, unsigned v) +{ + struct bch_sb_field_disk_groups *groups = + bch2_sb_get_disk_groups(sb); + struct bch_disk_group *g; + unsigned nr = 0; + u16 path[32]; + + while (1) { + if (nr == ARRAY_SIZE(path)) + goto inval; + + if (v >= disk_groups_nr(groups)) + goto inval; + + g = groups->entries + v; + + if (BCH_GROUP_DELETED(g)) + goto inval; + + path[nr++] = v; + + if (!BCH_GROUP_PARENT(g)) + break; + + v = BCH_GROUP_PARENT(g) - 1; + } + + while (nr) { + v = path[--nr]; + g = groups->entries + v; + + prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + if (nr) + prt_printf(out, "."); + } + return; +inval: + prt_printf(out, "invalid label %u", v); +} + +int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +{ + struct bch_member *mi; + int ret, v = -1; + 
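/* an empty label or "none" leaves the device's group assignment unchanged */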
+ if (!strlen(name) || !strcmp(name, "none")) + return 0; + + v = bch2_disk_path_find_or_create(&c->disk_sb, name); + if (v < 0) + return v; + + ret = bch2_sb_disk_groups_to_cpu(c); + if (ret) + return ret; + + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + SET_BCH_MEMBER_GROUP(mi, v + 1); + return 0; +} + +int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) +{ + int ret; + + mutex_lock(&c->sb_lock); + ret = __bch2_dev_group_set(c, ca, name) ?: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_opt_target_parse(struct bch_fs *c, const char *val, u64 *res, + struct printbuf *err) +{ + struct bch_dev *ca; + int g; + + if (!val) + return -EINVAL; + + if (!c) + return 0; + + if (!strlen(val) || !strcmp(val, "none")) { + *res = 0; + return 0; + } + + /* Is it a device? */ + ca = bch2_dev_lookup(c, val); + if (!IS_ERR(ca)) { + *res = dev_to_target(ca->dev_idx); + percpu_ref_put(&ca->ref); + return 0; + } + + mutex_lock(&c->sb_lock); + g = bch2_disk_path_find(&c->disk_sb, val); + mutex_unlock(&c->sb_lock); + + if (g >= 0) { + *res = group_to_target(g); + return 0; + } + + return -EINVAL; +} + +void bch2_opt_target_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + struct target t = target_decode(v); + + switch (t.type) { + case TARGET_NULL: + prt_printf(out, "none"); + break; + case TARGET_DEV: + if (c) { + struct bch_dev *ca; + + rcu_read_lock(); + ca = t.dev < c->sb.nr_devices + ? rcu_dereference(c->devs[t.dev]) + : NULL; + + if (ca && percpu_ref_tryget(&ca->io_ref)) { + prt_printf(out, "/dev/%pg", ca->disk_sb.bdev); + percpu_ref_put(&ca->io_ref); + } else if (ca) { + prt_printf(out, "offline device %u", t.dev); + } else { + prt_printf(out, "invalid device %u", t.dev); + } + + rcu_read_unlock(); + } else { + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + struct bch_member *m = mi->members + t.dev; + + if (bch2_dev_exists(sb, mi, t.dev)) { + prt_printf(out, "Device "); + pr_uuid(out, m->uuid.b); + prt_printf(out, " (%u)", t.dev); + } else { + prt_printf(out, "Bad device %u", t.dev); + } + } + break; + case TARGET_GROUP: + if (c) { + mutex_lock(&c->sb_lock); + bch2_disk_path_to_text(out, c->disk_sb.sb, t.group); + mutex_unlock(&c->sb_lock); + } else { + bch2_disk_path_to_text(out, sb, t.group); + } + break; + default: + BUG(); + } +} diff --git a/fs/bcachefs/disk_groups.h b/fs/bcachefs/disk_groups.h new file mode 100644 index 000000000..bd7711767 --- /dev/null +++ b/fs/bcachefs/disk_groups.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_DISK_GROUPS_H +#define _BCACHEFS_DISK_GROUPS_H + +extern const struct bch_sb_field_ops bch_sb_field_ops_disk_groups; + +static inline unsigned disk_groups_nr(struct bch_sb_field_disk_groups *groups) +{ + return groups + ? 
(vstruct_end(&groups->field) - + (void *) &groups->entries[0]) / sizeof(struct bch_disk_group) + : 0; +} + +struct target { + enum { + TARGET_NULL, + TARGET_DEV, + TARGET_GROUP, + } type; + union { + unsigned dev; + unsigned group; + }; +}; + +#define TARGET_DEV_START 1 +#define TARGET_GROUP_START (256 + TARGET_DEV_START) + +static inline u16 dev_to_target(unsigned dev) +{ + return TARGET_DEV_START + dev; +} + +static inline u16 group_to_target(unsigned group) +{ + return TARGET_GROUP_START + group; +} + +static inline struct target target_decode(unsigned target) +{ + if (target >= TARGET_GROUP_START) + return (struct target) { + .type = TARGET_GROUP, + .group = target - TARGET_GROUP_START + }; + + if (target >= TARGET_DEV_START) + return (struct target) { + .type = TARGET_DEV, + .group = target - TARGET_DEV_START + }; + + return (struct target) { .type = TARGET_NULL }; +} + +const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *, unsigned); + +static inline struct bch_devs_mask target_rw_devs(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask devs = c->rw_devs[data_type]; + const struct bch_devs_mask *t = bch2_target_to_mask(c, target); + + if (t) + bitmap_and(devs.d, devs.d, t->d, BCH_SB_MEMBERS_MAX); + return devs; +} + +static inline bool bch2_target_accepts_data(struct bch_fs *c, + enum bch_data_type data_type, + u16 target) +{ + struct bch_devs_mask rw_devs = target_rw_devs(c, data_type, target); + return !bitmap_empty(rw_devs.d, BCH_SB_MEMBERS_MAX); +} + +bool bch2_dev_in_target(struct bch_fs *, unsigned, unsigned); + +int bch2_disk_path_find(struct bch_sb_handle *, const char *); + +/* Exported for userspace bcachefs-tools: */ +int bch2_disk_path_find_or_create(struct bch_sb_handle *, const char *); + +void bch2_disk_path_to_text(struct printbuf *, struct bch_sb *, unsigned); + +int bch2_opt_target_parse(struct bch_fs *, const char *, u64 *, struct printbuf *); +void bch2_opt_target_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); + +#define bch2_opt_target (struct bch_opt_fn) { \ + .parse = bch2_opt_target_parse, \ + .to_text = bch2_opt_target_to_text, \ +} + +int bch2_sb_disk_groups_to_cpu(struct bch_fs *); + +int __bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); +int bch2_dev_group_set(struct bch_fs *, struct bch_dev *, const char *); + +const char *bch2_sb_validate_disk_groups(struct bch_sb *, + struct bch_sb_field *); + +void bch2_disk_groups_to_text(struct printbuf *, struct bch_fs *); + +#endif /* _BCACHEFS_DISK_GROUPS_H */ diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c new file mode 100644 index 000000000..efbb7cf7a --- /dev/null +++ b/fs/bcachefs/ec.c @@ -0,0 +1,1960 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* erasure coding */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "backpointers.h" +#include "bkey_buf.h" +#include "bset.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io.h" +#include "keylist.h" +#include "recovery.h" +#include "replicas.h" +#include "super-io.h" +#include "util.h" + +#include + +#ifdef __KERNEL__ + +#include +#include + +static void raid5_recov(unsigned disks, unsigned failed_idx, + size_t size, void **data) +{ + unsigned i = 2, nr; + + BUG_ON(failed_idx >= disks); + + swap(data[0], data[failed_idx]); + memcpy(data[0], data[1], size); + + while (i < disks) { + nr = min_t(unsigned, disks - i, MAX_XOR_BLOCKS); 
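+ /* xor_blocks() accepts at most MAX_XOR_BLOCKS sources per call, so fold the remaining blocks into data[0] in batches */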
+ xor_blocks(nr, size, data[0], data + i); + i += nr; + } + + swap(data[0], data[failed_idx]); +} + +static void raid_gen(int nd, int np, size_t size, void **v) +{ + if (np >= 1) + raid5_recov(nd + np, nd, size, v); + if (np >= 2) + raid6_call.gen_syndrome(nd + np, size, v); + BUG_ON(np > 2); +} + +static void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v) +{ + switch (nr) { + case 0: + break; + case 1: + if (ir[0] < nd + 1) + raid5_recov(nd + 1, ir[0], size, v); + else + raid6_call.gen_syndrome(nd + np, size, v); + break; + case 2: + if (ir[1] < nd) { + /* data+data failure. */ + raid6_2data_recov(nd + np, size, ir[0], ir[1], v); + } else if (ir[0] < nd) { + /* data + p/q failure */ + + if (ir[1] == nd) /* data + p failure */ + raid6_datap_recov(nd + np, size, ir[0], v); + else { /* data + q failure */ + raid5_recov(nd + 1, ir[0], size, v); + raid6_call.gen_syndrome(nd + np, size, v); + } + } else { + raid_gen(nd, np, size, v); + } + break; + default: + BUG(); + } +} + +#else + +#include + +#endif + +struct ec_bio { + struct bch_dev *ca; + struct ec_stripe_buf *buf; + size_t idx; + struct bio bio; +}; + +/* Stripes btree keys: */ + +int bch2_stripe_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + + if (bkey_eq(k.k->p, POS_MIN)) { + prt_printf(err, "stripe at POS_MIN"); + return -BCH_ERR_invalid_bkey; + } + + if (k.k->p.inode) { + prt_printf(err, "nonzero inode field"); + return -BCH_ERR_invalid_bkey; + } + + if (bkey_val_u64s(k.k) < stripe_val_u64s(s)) { + prt_printf(err, "incorrect value size (%zu < %u)", + bkey_val_u64s(k.k), stripe_val_u64s(s)); + return -BCH_ERR_invalid_bkey; + } + + return bch2_bkey_ptrs_invalid(c, k, flags, err); +} + +void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bch_stripe *s = bkey_s_c_to_stripe(k).v; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + + prt_printf(out, "algo %u sectors %u blocks %u:%u csum %u gran %u", + s->algorithm, + le16_to_cpu(s->sectors), + nr_data, + s->nr_redundant, + s->csum_type, + 1U << s->csum_granularity_bits); + + for (i = 0; i < s->nr_blocks; i++) { + const struct bch_extent_ptr *ptr = s->ptrs + i; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, " %u:%llu:%u", ptr->dev, b, offset); + if (i < nr_data) + prt_printf(out, "#%u", stripe_blockcount_get(s, i)); + if (ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } +} + +/* returns blocknr in stripe that we matched: */ +static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, + struct bkey_s_c k, unsigned *block) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + unsigned i, nr_data = s->nr_blocks - s->nr_redundant; + + bkey_for_each_ptr(ptrs, ptr) + for (i = 0; i < nr_data; i++) + if (__bch2_ptr_matches_stripe(&s->ptrs[i], ptr, + le16_to_cpu(s->sectors))) { + *block = i; + return ptr; + } + + return NULL; +} + +static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) +{ + switch (k.k->type) { + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const union bch_extent_entry *entry; + + extent_for_each_entry(e, entry) + if (extent_entry_type(entry) == + BCH_EXTENT_ENTRY_stripe_ptr && + entry->stripe_ptr.idx == idx) + return true; + + break; + } + } + + return false; +} + +/* Stripe bufs: */ 
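+/* An ec_stripe_buf holds one stripe's data in memory over the sector range [offset, offset + size): a buffer per block (data and parity) plus a bitmap of blocks that were read back valid */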
+ +static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) +{ + unsigned i; + + for (i = 0; i < buf->key.v.nr_blocks; i++) { + kvpfree(buf->data[i], buf->size << 9); + buf->data[i] = NULL; + } +} + +/* XXX: this is a non-mempoolified memory allocation: */ +static int ec_stripe_buf_init(struct ec_stripe_buf *buf, + unsigned offset, unsigned size) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1U << v->csum_granularity_bits; + unsigned end = offset + size; + unsigned i; + + BUG_ON(end > le16_to_cpu(v->sectors)); + + offset = round_down(offset, csum_granularity); + end = min_t(unsigned, le16_to_cpu(v->sectors), + round_up(end, csum_granularity)); + + buf->offset = offset; + buf->size = end - offset; + + memset(buf->valid, 0xFF, sizeof(buf->valid)); + + for (i = 0; i < buf->key.v.nr_blocks; i++) { + buf->data[i] = kvpmalloc(buf->size << 9, GFP_KERNEL); + if (!buf->data[i]) + goto err; + } + + return 0; +err: + ec_stripe_buf_exit(buf); + return -BCH_ERR_ENOMEM_stripe_buf; +} + +/* Checksumming: */ + +static struct bch_csum ec_block_checksum(struct ec_stripe_buf *buf, + unsigned block, unsigned offset) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned end = buf->offset + buf->size; + unsigned len = min(csum_granularity, end - offset); + + BUG_ON(offset >= end); + BUG_ON(offset < buf->offset); + BUG_ON(offset & (csum_granularity - 1)); + BUG_ON(offset + len != le16_to_cpu(v->sectors) && + (len & (csum_granularity - 1))); + + return bch2_checksum(NULL, v->csum_type, + null_nonce(), + buf->data[block] + ((offset - buf->offset) << 9), + len << 9); +} + +static void ec_generate_checksums(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned i, j, csums_per_device = stripe_csums_per_device(v); + + if (!v->csum_type) + return; + + BUG_ON(buf->offset); + BUG_ON(buf->size != le16_to_cpu(v->sectors)); + + for (i = 0; i < v->nr_blocks; i++) + for (j = 0; j < csums_per_device; j++) + stripe_csum_set(v, i, j, + ec_block_checksum(buf, i, j << v->csum_granularity_bits)); +} + +static void ec_validate_checksums(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned csum_granularity = 1 << v->csum_granularity_bits; + unsigned i; + + if (!v->csum_type) + return; + + for (i = 0; i < v->nr_blocks; i++) { + unsigned offset = buf->offset; + unsigned end = buf->offset + buf->size; + + if (!test_bit(i, buf->valid)) + continue; + + while (offset < end) { + unsigned j = offset >> v->csum_granularity_bits; + unsigned len = min(csum_granularity, end - offset); + struct bch_csum want = stripe_csum_get(v, i, j); + struct bch_csum got = ec_block_checksum(buf, i, offset); + + if (bch2_crc_cmp(want, got)) { + struct printbuf buf2 = PRINTBUF; + + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(&buf->key.k_i)); + + bch_err_ratelimited(c, + "stripe checksum error for %ps at %u:%u: csum type %u, expected %llx got %llx\n%s", + (void *) _RET_IP_, i, j, v->csum_type, + want.lo, got.lo, buf2.buf); + printbuf_exit(&buf2); + clear_bit(i, buf->valid); + break; + } + + offset += len; + } + } +} + +/* Erasure coding: */ + +static void ec_generate_ec(struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = le16_to_cpu(v->sectors) << 9; + + raid_gen(nr_data, v->nr_redundant, bytes, buf->data); +} + +static unsigned ec_nr_failed(struct ec_stripe_buf *buf) +{ + return buf->key.v.nr_blocks - + 
bitmap_weight(buf->valid, buf->key.v.nr_blocks); +} + +static int ec_do_recov(struct bch_fs *c, struct ec_stripe_buf *buf) +{ + struct bch_stripe *v = &buf->key.v; + unsigned i, failed[BCH_BKEY_PTRS_MAX], nr_failed = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; + unsigned bytes = buf->size << 9; + + if (ec_nr_failed(buf) > v->nr_redundant) { + bch_err_ratelimited(c, + "error doing reconstruct read: unable to read enough blocks"); + return -1; + } + + for (i = 0; i < nr_data; i++) + if (!test_bit(i, buf->valid)) + failed[nr_failed++] = i; + + raid_rec(nr_failed, failed, nr_data, v->nr_redundant, bytes, buf->data); + return 0; +} + +/* IO: */ + +static void ec_block_endio(struct bio *bio) +{ + struct ec_bio *ec_bio = container_of(bio, struct ec_bio, bio); + struct bch_stripe *v = &ec_bio->buf->key.v; + struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; + struct bch_dev *ca = ec_bio->ca; + struct closure *cl = bio->bi_private; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "erasure coding %s error: %s", + bio_data_dir(bio) ? "write" : "read", + bch2_blk_status_to_str(bio->bi_status))) + clear_bit(ec_bio->idx, ec_bio->buf->valid); + + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(ca->fs, + "error %s stripe: stale pointer after io", + bio_data_dir(bio) == READ ? "reading from" : "writing to"); + clear_bit(ec_bio->idx, ec_bio->buf->valid); + } + + bio_put(&ec_bio->bio); + percpu_ref_put(&ca->io_ref); + closure_put(cl); +} + +static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, + blk_opf_t opf, unsigned idx, struct closure *cl) +{ + struct bch_stripe *v = &buf->key.v; + unsigned offset = 0, bytes = buf->size << 9; + struct bch_extent_ptr *ptr = &v->ptrs[idx]; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev); + enum bch_data_type data_type = idx < buf->key.v.nr_blocks - buf->key.v.nr_redundant + ? BCH_DATA_user + : BCH_DATA_parity; + int rw = op_is_write(opf); + + if (ptr_stale(ca, ptr)) { + bch_err_ratelimited(c, + "error %s stripe: stale pointer", + rw == READ ? 
"reading from" : "writing to"); + clear_bit(idx, buf->valid); + return; + } + + if (!bch2_dev_get_ioref(ca, rw)) { + clear_bit(idx, buf->valid); + return; + } + + this_cpu_add(ca->io_done->sectors[rw][data_type], buf->size); + + while (offset < bytes) { + unsigned nr_iovecs = min_t(size_t, BIO_MAX_VECS, + DIV_ROUND_UP(bytes, PAGE_SIZE)); + unsigned b = min_t(size_t, bytes - offset, + nr_iovecs << PAGE_SHIFT); + struct ec_bio *ec_bio; + + ec_bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, + nr_iovecs, + opf, + GFP_KERNEL, + &c->ec_bioset), + struct ec_bio, bio); + + ec_bio->ca = ca; + ec_bio->buf = buf; + ec_bio->idx = idx; + + ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); + ec_bio->bio.bi_end_io = ec_block_endio; + ec_bio->bio.bi_private = cl; + + bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); + + closure_get(cl); + percpu_ref_get(&ca->io_ref); + + submit_bio(&ec_bio->bio); + + offset += b; + } + + percpu_ref_put(&ca->io_ref); +} + +static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, + struct ec_stripe_buf *stripe) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + POS(0, idx), BTREE_ITER_SLOTS); + ret = bkey_err(k); + if (ret) + goto err; + if (k.k->type != KEY_TYPE_stripe) { + ret = -ENOENT; + goto err; + } + bkey_reassemble(&stripe->key.k_i, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int get_stripe_key(struct bch_fs *c, u64 idx, struct ec_stripe_buf *stripe) +{ + return bch2_trans_run(c, get_stripe_key_trans(&trans, idx, stripe)); +} + +/* recovery read path: */ +int bch2_ec_read_extent(struct bch_fs *c, struct bch_read_bio *rbio) +{ + struct ec_stripe_buf *buf; + struct closure cl; + struct bch_stripe *v; + unsigned i, offset; + int ret = 0; + + closure_init_stack(&cl); + + BUG_ON(!rbio->pick.has_ec); + + buf = kzalloc(sizeof(*buf), GFP_NOFS); + if (!buf) + return -BCH_ERR_ENOMEM_ec_read_extent; + + ret = get_stripe_key(c, rbio->pick.ec.idx, buf); + if (ret) { + bch_err_ratelimited(c, + "error doing reconstruct read: error %i looking up stripe", ret); + kfree(buf); + return -EIO; + } + + v = &buf->key.v; + + if (!bch2_ptr_matches_stripe(v, rbio->pick)) { + bch_err_ratelimited(c, + "error doing reconstruct read: pointer doesn't match stripe"); + ret = -EIO; + goto err; + } + + offset = rbio->bio.bi_iter.bi_sector - v->ptrs[rbio->pick.ec.block].offset; + if (offset + bio_sectors(&rbio->bio) > le16_to_cpu(v->sectors)) { + bch_err_ratelimited(c, + "error doing reconstruct read: read is bigger than stripe"); + ret = -EIO; + goto err; + } + + ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); + if (ret) + goto err; + + for (i = 0; i < v->nr_blocks; i++) + ec_block_io(c, buf, REQ_OP_READ, i, &cl); + + closure_sync(&cl); + + if (ec_nr_failed(buf) > v->nr_redundant) { + bch_err_ratelimited(c, + "error doing reconstruct read: unable to read enough blocks"); + ret = -EIO; + goto err; + } + + ec_validate_checksums(c, buf); + + ret = ec_do_recov(c, buf); + if (ret) + goto err; + + memcpy_to_bio(&rbio->bio, rbio->bio.bi_iter, + buf->data[rbio->pick.ec.block] + ((offset - buf->offset) << 9)); +err: + ec_stripe_buf_exit(buf); + kfree(buf); + return ret; +} + +/* stripe bucket accounting: */ + +static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) +{ + ec_stripes_heap n, *h = &c->ec_stripes_heap; + + if (idx >= h->size) { + if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) + return 
-BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + + mutex_lock(&c->ec_stripes_heap_lock); + if (n.size > h->size) { + memcpy(n.data, h->data, h->used * sizeof(h->data[0])); + n.used = h->used; + swap(*h, n); + } + mutex_unlock(&c->ec_stripes_heap_lock); + + free_heap(&n); + } + + if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + + if (c->gc_pos.phase != GC_PHASE_NOT_RUNNING && + !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) + return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + + return 0; +} + +static int ec_stripe_mem_alloc(struct btree_trans *trans, + struct btree_iter *iter) +{ + return allocate_dropping_locks_errcode(trans, + __ec_stripe_mem_alloc(trans->c, iter->pos.offset, _gfp)); +} + +/* + * Hash table of open stripes: + * Stripes that are being created or modified are kept in a hash table, so that + * stripe deletion can skip them. + */ + +static bool __bch2_stripe_is_open(struct bch_fs *c, u64 idx) +{ + unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); + struct ec_stripe_new *s; + + hlist_for_each_entry(s, &c->ec_stripes_new[hash], hash) + if (s->idx == idx) + return true; + return false; +} + +static bool bch2_stripe_is_open(struct bch_fs *c, u64 idx) +{ + bool ret = false; + + spin_lock(&c->ec_stripes_new_lock); + ret = __bch2_stripe_is_open(c, idx); + spin_unlock(&c->ec_stripes_new_lock); + + return ret; +} + +static bool bch2_try_open_stripe(struct bch_fs *c, + struct ec_stripe_new *s, + u64 idx) +{ + bool ret; + + spin_lock(&c->ec_stripes_new_lock); + ret = !__bch2_stripe_is_open(c, idx); + if (ret) { + unsigned hash = hash_64(idx, ilog2(ARRAY_SIZE(c->ec_stripes_new))); + + s->idx = idx; + hlist_add_head(&s->hash, &c->ec_stripes_new[hash]); + } + spin_unlock(&c->ec_stripes_new_lock); + + return ret; +} + +static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) +{ + BUG_ON(!s->idx); + + spin_lock(&c->ec_stripes_new_lock); + hlist_del_init(&s->hash); + spin_unlock(&c->ec_stripes_new_lock); + + s->idx = 0; +} + +/* Heap of all existing stripes, ordered by blocks_nonempty */ + +static u64 stripe_idx_to_delete(struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + + lockdep_assert_held(&c->ec_stripes_heap_lock); + + if (h->used && + h->data[0].blocks_nonempty == 0 && + !bch2_stripe_is_open(c, h->data[0].idx)) + return h->data[0].idx; + + return 0; +} + +static inline int ec_stripes_heap_cmp(ec_stripes_heap *h, + struct ec_stripe_heap_entry l, + struct ec_stripe_heap_entry r) +{ + return ((l.blocks_nonempty > r.blocks_nonempty) - + (l.blocks_nonempty < r.blocks_nonempty)); +} + +static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, + size_t i) +{ + struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); + + genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; +} + +static void heap_verify_backpointer(struct bch_fs *c, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m = genradix_ptr(&c->stripes, idx); + + BUG_ON(m->heap_idx >= h->used); + BUG_ON(h->data[m->heap_idx].idx != idx); +} + +void bch2_stripes_heap_del(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + mutex_lock(&c->ec_stripes_heap_lock); + heap_verify_backpointer(c, idx); + + heap_del(&c->ec_stripes_heap, m->heap_idx, + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + mutex_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_stripes_heap_insert(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + mutex_lock(&c->ec_stripes_heap_lock); + 
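	/*
	 * heap_add(), heap_del() and the sift helpers here are passed
	 * ec_stripes_heap_set_backpointer(), so the owning stripe's heap_idx
	 * is kept pointing at the entry's current slot as entries move; that
	 * is the invariant heap_verify_backpointer() asserts, and it is what
	 * lets bch2_stripes_heap_update() re-sift a single entry by index
	 * instead of scanning the heap.
	 */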
BUG_ON(heap_full(&c->ec_stripes_heap)); + + heap_add(&c->ec_stripes_heap, ((struct ec_stripe_heap_entry) { + .idx = idx, + .blocks_nonempty = m->blocks_nonempty, + }), + ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + mutex_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_stripes_heap_update(struct bch_fs *c, + struct stripe *m, size_t idx) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + bool do_deletes; + size_t i; + + mutex_lock(&c->ec_stripes_heap_lock); + heap_verify_backpointer(c, idx); + + h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; + + i = m->heap_idx; + heap_sift_up(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + heap_sift_down(h, i, ec_stripes_heap_cmp, + ec_stripes_heap_set_backpointer); + + heap_verify_backpointer(c, idx); + + do_deletes = stripe_idx_to_delete(c) != 0; + mutex_unlock(&c->ec_stripes_heap_lock); + + if (do_deletes) + bch2_do_stripe_deletes(c); +} + +/* stripe deletion */ + +static int ec_stripe_delete(struct btree_trans *trans, u64 idx) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_stripe s; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_stripe) { + bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); + ret = -EINVAL; + goto err; + } + + s = bkey_s_c_to_stripe(k); + for (unsigned i = 0; i < s.v->nr_blocks; i++) + if (stripe_blockcount_get(s.v, i)) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); + printbuf_exit(&buf); + ret = -EINVAL; + goto err; + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void ec_stripe_delete_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, ec_stripe_delete_work); + struct btree_trans trans; + int ret; + u64 idx; + + bch2_trans_init(&trans, c, 0, 0); + + while (1) { + mutex_lock(&c->ec_stripes_heap_lock); + idx = stripe_idx_to_delete(c); + mutex_unlock(&c->ec_stripes_heap_lock); + + if (!idx) + break; + + ret = commit_do(&trans, NULL, NULL, BTREE_INSERT_NOFAIL, + ec_stripe_delete(&trans, idx)); + if (ret) { + bch_err_fn(c, ret); + break; + } + } + + bch2_trans_exit(&trans); + + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); +} + +void bch2_do_stripe_deletes(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && + !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); +} + +/* stripe creation: */ + +static int ec_stripe_key_update(struct btree_trans *trans, + struct bkey_i_stripe *new, + bool create) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, + new->k.p, BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != (create ? KEY_TYPE_deleted : KEY_TYPE_stripe)) { + bch2_fs_inconsistent(c, "error %s stripe: got existing key type %s", + create ? 
"creating" : "updating", + bch2_bkey_types[k.k->type]); + ret = -EINVAL; + goto err; + } + + if (k.k->type == KEY_TYPE_stripe) { + const struct bch_stripe *old = bkey_s_c_to_stripe(k).v; + unsigned i; + + if (old->nr_blocks != new->v.nr_blocks) { + bch_err(c, "error updating stripe: nr_blocks does not match"); + ret = -EINVAL; + goto err; + } + + for (i = 0; i < new->v.nr_blocks; i++) { + unsigned v = stripe_blockcount_get(old, i); + + BUG_ON(v && + (old->ptrs[i].dev != new->v.ptrs[i].dev || + old->ptrs[i].gen != new->v.ptrs[i].gen || + old->ptrs[i].offset != new->v.ptrs[i].offset)); + + stripe_blockcount_set(&new->v, i, v); + } + } + + ret = bch2_trans_update(trans, &iter, &new->k_i, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int ec_stripe_update_extent(struct btree_trans *trans, + struct bpos bucket, u8 gen, + struct ec_stripe_buf *s, + struct bpos *bp_pos) +{ + struct bch_fs *c = trans->c; + struct bch_backpointer bp; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_extent_ptr *ptr_c; + struct bch_extent_ptr *ptr, *ec_ptr = NULL; + struct bch_extent_stripe_ptr stripe_ptr; + struct bkey_i *n; + int ret, dev, block; + + ret = bch2_get_next_backpointer(trans, bucket, gen, + bp_pos, &bp, BTREE_ITER_CACHED); + if (ret) + return ret; + if (bpos_eq(*bp_pos, SPOS_MAX)) + return 0; + + if (bp.level) { + struct printbuf buf = PRINTBUF; + struct btree_iter node_iter; + struct btree *b; + + b = bch2_backpointer_get_node(trans, &node_iter, *bp_pos, bp); + bch2_trans_iter_exit(trans, &node_iter); + + if (!b) + return 0; + + prt_printf(&buf, "found btree node in erasure coded bucket: b=%px\n", b); + bch2_backpointer_to_text(&buf, &bp); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + return -EIO; + } + + k = bch2_backpointer_get_key(trans, &iter, *bp_pos, bp, BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + return ret; + if (!k.k) { + /* + * extent no longer exists - we could flush the btree + * write buffer and retry to verify, but no need: + */ + return 0; + } + + if (extent_has_stripe_ptr(k, s->key.k.p.offset)) + goto out; + + ptr_c = bkey_matches_stripe(&s->key.v, k, &block); + /* + * It doesn't generally make sense to erasure code cached ptrs: + * XXX: should we be incrementing a counter? 
+ */ + if (!ptr_c || ptr_c->cached) + goto out; + + dev = s->key.v.ptrs[block].dev; + + n = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + sizeof(stripe_ptr)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto out; + + bkey_reassemble(n, k); + + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, ptr->dev != dev); + ec_ptr = bch2_bkey_has_device(bkey_i_to_s(n), dev); + BUG_ON(!ec_ptr); + + stripe_ptr = (struct bch_extent_stripe_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_stripe_ptr, + .block = block, + .redundancy = s->key.v.nr_redundant, + .idx = s->key.k.p.offset, + }; + + __extent_entry_insert(n, + (union bch_extent_entry *) ec_ptr, + (union bch_extent_entry *) &stripe_ptr); + + ret = bch2_trans_update(trans, &iter, n, 0); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_buf *s, + unsigned block) +{ + struct bch_fs *c = trans->c; + struct bch_extent_ptr bucket = s->key.v.ptrs[block]; + struct bpos bucket_pos = PTR_BUCKET_POS(c, &bucket); + struct bpos bp_pos = POS_MIN; + int ret = 0; + + while (1) { + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, + ec_stripe_update_extent(trans, bucket_pos, bucket.gen, + s, &bp_pos)); + if (ret) + break; + if (bkey_eq(bp_pos, POS_MAX)) + break; + + bp_pos = bpos_nosnap_successor(bp_pos); + } + + return ret; +} + +static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) +{ + struct btree_trans trans; + struct bch_stripe *v = &s->key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + ret = bch2_btree_write_buffer_flush(&trans); + if (ret) + goto err; + + for (i = 0; i < nr_data; i++) { + ret = ec_stripe_update_bucket(&trans, s, i); + if (ret) + break; + } +err: + bch2_trans_exit(&trans); + + return ret; +} + +static void zero_out_rest_of_ec_bucket(struct bch_fs *c, + struct ec_stripe_new *s, + unsigned block, + struct open_bucket *ob) +{ + struct bch_dev *ca = bch_dev_bkey_exists(c, ob->dev); + unsigned offset = ca->mi.bucket_size - ob->sectors_free; + int ret; + + if (!bch2_dev_get_ioref(ca, WRITE)) { + s->err = -BCH_ERR_erofs_no_writes; + return; + } + + memset(s->new_stripe.data[block] + (offset << 9), + 0, + ob->sectors_free << 9); + + ret = blkdev_issue_zeroout(ca->disk_sb.bdev, + ob->bucket * ca->mi.bucket_size + offset, + ob->sectors_free, + GFP_KERNEL, 0); + + percpu_ref_put(&ca->io_ref); + + if (ret) + s->err = ret; +} + +void bch2_ec_stripe_new_free(struct bch_fs *c, struct ec_stripe_new *s) +{ + if (s->idx) + bch2_stripe_close(c, s); + kfree(s); +} + +/* + * data buckets of new stripe all written: create the stripe + */ +static void ec_stripe_create(struct ec_stripe_new *s) +{ + struct bch_fs *c = s->c; + struct open_bucket *ob; + struct bch_stripe *v = &s->new_stripe.key.v; + unsigned i, nr_data = v->nr_blocks - v->nr_redundant; + int ret; + + BUG_ON(s->h->s == s); + + closure_sync(&s->iodone); + + if (!s->err) { + for (i = 0; i < nr_data; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (ob->sectors_free) + zero_out_rest_of_ec_bucket(c, s, i, ob); + } + } + + if (s->err) { + if (!bch2_err_matches(s->err, EROFS)) + bch_err(c, "error creating stripe: error writing data buckets"); + goto err; + } + + if (s->have_existing_stripe) { + ec_validate_checksums(c, &s->existing_stripe); + + if (ec_do_recov(c, &s->existing_stripe)) { + bch_err(c, "error creating stripe: error reading existing stripe"); + goto err; + } + + for (i = 0; i < 
nr_data; i++) + if (stripe_blockcount_get(&s->existing_stripe.key.v, i)) + swap(s->new_stripe.data[i], + s->existing_stripe.data[i]); + + ec_stripe_buf_exit(&s->existing_stripe); + } + + BUG_ON(!s->allocated); + BUG_ON(!s->idx); + + ec_generate_ec(&s->new_stripe); + + ec_generate_checksums(&s->new_stripe); + + /* write p/q: */ + for (i = nr_data; i < v->nr_blocks; i++) + ec_block_io(c, &s->new_stripe, REQ_OP_WRITE, i, &s->iodone); + closure_sync(&s->iodone); + + if (ec_nr_failed(&s->new_stripe)) { + bch_err(c, "error creating stripe: error writing redundancy buckets"); + goto err; + } + + ret = bch2_trans_do(c, &s->res, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL, + ec_stripe_key_update(&trans, &s->new_stripe.key, + !s->have_existing_stripe)); + if (ret) { + bch_err(c, "error creating stripe: error creating stripe key"); + goto err; + } + + ret = ec_stripe_update_extents(c, &s->new_stripe); + if (ret) { + bch_err(c, "error creating stripe: error updating pointers: %s", + bch2_err_str(ret)); + goto err; + } +err: + bch2_disk_reservation_put(c, &s->res); + + for (i = 0; i < v->nr_blocks; i++) + if (s->blocks[i]) { + ob = c->open_buckets + s->blocks[i]; + + if (i < nr_data) { + ob->ec = NULL; + __bch2_open_bucket_put(c, ob); + } else { + bch2_open_bucket_put(c, ob); + } + } + + mutex_lock(&c->ec_stripe_new_lock); + list_del(&s->list); + mutex_unlock(&c->ec_stripe_new_lock); + wake_up(&c->ec_stripe_new_wait); + + ec_stripe_buf_exit(&s->existing_stripe); + ec_stripe_buf_exit(&s->new_stripe); + closure_debug_destroy(&s->iodone); + + ec_stripe_new_put(c, s, STRIPE_REF_stripe); +} + +static struct ec_stripe_new *get_pending_stripe(struct bch_fs *c) +{ + struct ec_stripe_new *s; + + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(s, &c->ec_stripe_new_list, list) + if (!atomic_read(&s->ref[STRIPE_REF_io])) + goto out; + s = NULL; +out: + mutex_unlock(&c->ec_stripe_new_lock); + + return s; +} + +static void ec_stripe_create_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, + struct bch_fs, ec_stripe_create_work); + struct ec_stripe_new *s; + + while ((s = get_pending_stripe(c))) + ec_stripe_create(s); + + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); +} + +void bch2_ec_do_stripe_creates(struct bch_fs *c) +{ + bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); + + if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); +} + +static void ec_stripe_set_pending(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct ec_stripe_new *s = h->s; + + BUG_ON(!s->allocated && !s->err); + + h->s = NULL; + s->pending = true; + + mutex_lock(&c->ec_stripe_new_lock); + list_add(&s->list, &c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + ec_stripe_new_put(c, s, STRIPE_REF_io); +} + +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +{ + struct ec_stripe_new *s = ob->ec; + + s->err = -EIO; +} + +void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) +{ + struct open_bucket *ob = ec_open_bucket(c, &wp->ptrs); + struct bch_dev *ca; + unsigned offset; + + if (!ob) + return NULL; + + BUG_ON(!ob->ec->new_stripe.data[ob->ec_idx]); + + ca = bch_dev_bkey_exists(c, ob->dev); + offset = ca->mi.bucket_size - ob->sectors_free; + + return ob->ec->new_stripe.data[ob->ec_idx] + (offset << 9); +} + +static int unsigned_cmp(const void *_l, const void *_r) +{ + unsigned l = *((const unsigned *) _l); + unsigned r = *((const unsigned *) _r); + + return cmp_int(l, r); +} + +/* 
pick most common bucket size: */ +static unsigned pick_blocksize(struct bch_fs *c, + struct bch_devs_mask *devs) +{ + struct bch_dev *ca; + unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX]; + struct { + unsigned nr, size; + } cur = { 0, 0 }, best = { 0, 0 }; + + for_each_member_device_rcu(ca, c, i, devs) + sizes[nr++] = ca->mi.bucket_size; + + sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL); + + for (i = 0; i < nr; i++) { + if (sizes[i] != cur.size) { + if (cur.nr > best.nr) + best = cur; + + cur.nr = 0; + cur.size = sizes[i]; + } + + cur.nr++; + } + + if (cur.nr > best.nr) + best = cur; + + return best.size; +} + +static bool may_create_new_stripe(struct bch_fs *c) +{ + return false; +} + +static void ec_stripe_key_init(struct bch_fs *c, + struct bkey_i_stripe *s, + unsigned nr_data, + unsigned nr_parity, + unsigned stripe_size) +{ + unsigned u64s; + + bkey_stripe_init(&s->k_i); + s->v.sectors = cpu_to_le16(stripe_size); + s->v.algorithm = 0; + s->v.nr_blocks = nr_data + nr_parity; + s->v.nr_redundant = nr_parity; + s->v.csum_granularity_bits = ilog2(c->opts.encoded_extent_max >> 9); + s->v.csum_type = BCH_CSUM_crc32c; + s->v.pad = 0; + + while ((u64s = stripe_val_u64s(&s->v)) > BKEY_VAL_U64s_MAX) { + BUG_ON(1 << s->v.csum_granularity_bits >= + le16_to_cpu(s->v.sectors) || + s->v.csum_granularity_bits == U8_MAX); + s->v.csum_granularity_bits++; + } + + set_bkey_val_u64s(&s->k, u64s); +} + +static int ec_new_stripe_alloc(struct bch_fs *c, struct ec_stripe_head *h) +{ + struct ec_stripe_new *s; + + lockdep_assert_held(&h->lock); + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + + mutex_init(&s->lock); + closure_init(&s->iodone, NULL); + atomic_set(&s->ref[STRIPE_REF_stripe], 1); + atomic_set(&s->ref[STRIPE_REF_io], 1); + s->c = c; + s->h = h; + s->nr_data = min_t(unsigned, h->nr_active_devs, + BCH_BKEY_PTRS_MAX) - h->redundancy; + s->nr_parity = h->redundancy; + + ec_stripe_key_init(c, &s->new_stripe.key, s->nr_data, + s->nr_parity, h->blocksize); + + h->s = s; + return 0; +} + +static struct ec_stripe_head * +ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target, + unsigned algo, unsigned redundancy, + enum bch_watermark watermark) +{ + struct ec_stripe_head *h; + struct bch_dev *ca; + unsigned i; + + h = kzalloc(sizeof(*h), GFP_KERNEL); + if (!h) + return NULL; + + mutex_init(&h->lock); + BUG_ON(!mutex_trylock(&h->lock)); + + h->target = target; + h->algo = algo; + h->redundancy = redundancy; + h->watermark = watermark; + + rcu_read_lock(); + h->devs = target_rw_devs(c, BCH_DATA_user, target); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (!ca->mi.durability) + __clear_bit(i, h->devs.d); + + h->blocksize = pick_blocksize(c, &h->devs); + + for_each_member_device_rcu(ca, c, i, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + + rcu_read_unlock(); + list_add(&h->list, &c->ec_stripe_head_list); + return h; +} + +void bch2_ec_stripe_head_put(struct bch_fs *c, struct ec_stripe_head *h) +{ + if (h->s && + h->s->allocated && + bitmap_weight(h->s->blocks_allocated, + h->s->nr_data) == h->s->nr_data) + ec_stripe_set_pending(c, h); + + mutex_unlock(&h->lock); +} + +static struct ec_stripe_head * +__bch2_ec_stripe_head_get(struct btree_trans *trans, + unsigned target, + unsigned algo, + unsigned redundancy, + enum bch_watermark watermark) +{ + struct bch_fs *c = trans->c; + struct ec_stripe_head *h; + int ret; + + if (!redundancy) + return NULL; + + ret = bch2_trans_mutex_lock(trans, 
&c->ec_stripe_head_lock); + if (ret) + return ERR_PTR(ret); + + if (test_bit(BCH_FS_GOING_RO, &c->flags)) { + h = ERR_PTR(-BCH_ERR_erofs_no_writes); + goto found; + } + + list_for_each_entry(h, &c->ec_stripe_head_list, list) + if (h->target == target && + h->algo == algo && + h->redundancy == redundancy && + h->watermark == watermark) { + ret = bch2_trans_mutex_lock(trans, &h->lock); + if (ret) + h = ERR_PTR(ret); + goto found; + } + + h = ec_new_stripe_head_alloc(c, target, algo, redundancy, watermark); +found: + mutex_unlock(&c->ec_stripe_head_lock); + return h; +} + +static int new_stripe_alloc_buckets(struct btree_trans *trans, struct ec_stripe_head *h, + enum bch_watermark watermark, struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct bch_devs_mask devs = h->devs; + struct open_bucket *ob; + struct open_buckets buckets; + unsigned i, j, nr_have_parity = 0, nr_have_data = 0; + bool have_cache = true; + int ret = 0; + + BUG_ON(h->s->new_stripe.key.v.nr_blocks != h->s->nr_data + h->s->nr_parity); + BUG_ON(h->s->new_stripe.key.v.nr_redundant != h->s->nr_parity); + + for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { + __clear_bit(h->s->new_stripe.key.v.ptrs[i].dev, devs.d); + if (i < h->s->nr_data) + nr_have_data++; + else + nr_have_parity++; + } + + BUG_ON(nr_have_data > h->s->nr_data); + BUG_ON(nr_have_parity > h->s->nr_parity); + + buckets.nr = 0; + if (nr_have_parity < h->s->nr_parity) { + ret = bch2_bucket_alloc_set_trans(trans, &buckets, + &h->parity_stripe, + &devs, + h->s->nr_parity, + &nr_have_parity, + &have_cache, 0, + BCH_DATA_parity, + watermark, + cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data + h->s->nr_parity, + h->s->nr_data); + BUG_ON(j >= h->s->nr_data + h->s->nr_parity); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + + if (ret) + return ret; + } + + buckets.nr = 0; + if (nr_have_data < h->s->nr_data) { + ret = bch2_bucket_alloc_set_trans(trans, &buckets, + &h->block_stripe, + &devs, + h->s->nr_data, + &nr_have_data, + &have_cache, 0, + BCH_DATA_user, + watermark, + cl); + + open_bucket_for_each(c, &buckets, ob, i) { + j = find_next_zero_bit(h->s->blocks_gotten, + h->s->nr_data, 0); + BUG_ON(j >= h->s->nr_data); + + h->s->blocks[j] = buckets.v[i]; + h->s->new_stripe.key.v.ptrs[j] = bch2_ob_ptr(c, ob); + __set_bit(j, h->s->blocks_gotten); + } + + if (ret) + return ret; + } + + return 0; +} + +/* XXX: doesn't obey target: */ +static s64 get_existing_stripe(struct bch_fs *c, + struct ec_stripe_head *head) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; + size_t heap_idx; + u64 stripe_idx; + s64 ret = -1; + + if (may_create_new_stripe(c)) + return -1; + + mutex_lock(&c->ec_stripes_heap_lock); + for (heap_idx = 0; heap_idx < h->used; heap_idx++) { + /* No blocks worth reusing, stripe will just be deleted: */ + if (!h->data[heap_idx].blocks_nonempty) + continue; + + stripe_idx = h->data[heap_idx].idx; + + m = genradix_ptr(&c->stripes, stripe_idx); + + if (m->algorithm == head->algo && + m->nr_redundant == head->redundancy && + m->sectors == head->blocksize && + m->blocks_nonempty < m->nr_blocks - m->nr_redundant && + bch2_try_open_stripe(c, head->s, stripe_idx)) { + ret = stripe_idx; + break; + } + } + mutex_unlock(&c->ec_stripes_heap_lock); + return ret; +} + +static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stripe_head *h) +{ + struct bch_fs 
*c = trans->c; + unsigned i; + s64 idx; + int ret; + + /* + * If we can't allocate a new stripe, and there's no stripes with empty + * blocks for us to reuse, that means we have to wait on copygc: + */ + idx = get_existing_stripe(c, h); + if (idx < 0) + return -BCH_ERR_stripe_alloc_blocked; + + ret = get_stripe_key_trans(trans, idx, &h->s->existing_stripe); + if (ret) { + bch2_stripe_close(c, h->s); + if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch2_fs_fatal_error(c, "error reading stripe key: %s", bch2_err_str(ret)); + return ret; + } + + BUG_ON(h->s->existing_stripe.key.v.nr_redundant != h->s->nr_parity); + h->s->nr_data = h->s->existing_stripe.key.v.nr_blocks - + h->s->existing_stripe.key.v.nr_redundant; + + ret = ec_stripe_buf_init(&h->s->existing_stripe, 0, h->blocksize); + if (ret) { + bch2_stripe_close(c, h->s); + return ret; + } + + BUG_ON(h->s->existing_stripe.size != h->blocksize); + BUG_ON(h->s->existing_stripe.size != le16_to_cpu(h->s->existing_stripe.key.v.sectors)); + + /* + * Free buckets we initially allocated - they might conflict with + * blocks from the stripe we're reusing: + */ + for_each_set_bit(i, h->s->blocks_gotten, h->s->new_stripe.key.v.nr_blocks) { + bch2_open_bucket_put(c, c->open_buckets + h->s->blocks[i]); + h->s->blocks[i] = 0; + } + memset(h->s->blocks_gotten, 0, sizeof(h->s->blocks_gotten)); + memset(h->s->blocks_allocated, 0, sizeof(h->s->blocks_allocated)); + + for (i = 0; i < h->s->existing_stripe.key.v.nr_blocks; i++) { + if (stripe_blockcount_get(&h->s->existing_stripe.key.v, i)) { + __set_bit(i, h->s->blocks_gotten); + __set_bit(i, h->s->blocks_allocated); + } + + ec_block_io(c, &h->s->existing_stripe, READ, i, &h->s->iodone); + } + + bkey_copy(&h->s->new_stripe.key.k_i, &h->s->existing_stripe.key.k_i); + h->s->have_existing_stripe = true; + + return 0; +} + +static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_stripe_head *h) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bpos min_pos = POS(0, 1); + struct bpos start_pos = bpos_max(min_pos, POS(0, c->ec_stripe_hint)); + int ret; + + if (!h->s->res.sectors) { + ret = bch2_disk_reservation_get(c, &h->s->res, + h->blocksize, + h->s->nr_parity, + BCH_DISK_RESERVATION_NOFAIL); + if (ret) + return ret; + } + + for_each_btree_key_norestart(trans, iter, BTREE_ID_stripes, start_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (bkey_gt(k.k->p, POS(0, U32_MAX))) { + if (start_pos.offset) { + start_pos = min_pos; + bch2_btree_iter_set_pos(&iter, start_pos); + continue; + } + + ret = -BCH_ERR_ENOSPC_stripe_create; + break; + } + + if (bkey_deleted(k.k) && + bch2_try_open_stripe(c, h->s, k.k->p.offset)) + break; + } + + c->ec_stripe_hint = iter.pos.offset; + + if (ret) + goto err; + + ret = ec_stripe_mem_alloc(trans, &iter); + if (ret) { + bch2_stripe_close(c, h->s); + goto err; + } + + h->s->new_stripe.key.k.p = iter.pos; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +err: + bch2_disk_reservation_put(c, &h->s->res); + goto out; +} + +struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, + unsigned target, + unsigned algo, + unsigned redundancy, + enum bch_watermark watermark, + struct closure *cl) +{ + struct bch_fs *c = trans->c; + struct ec_stripe_head *h; + bool waiting = false; + int ret; + + h = __bch2_ec_stripe_head_get(trans, target, algo, redundancy, watermark); + if (!h) + bch_err(c, "no stripe head"); + if (IS_ERR_OR_NULL(h)) + return h; + + if (!h->s) { + ret = 
ec_new_stripe_alloc(c, h); + if (ret) { + bch_err(c, "failed to allocate new stripe"); + goto err; + } + } + + if (h->s->allocated) + goto allocated; + + if (h->s->have_existing_stripe) + goto alloc_existing; + + /* First, try to allocate a full stripe: */ + ret = new_stripe_alloc_buckets(trans, h, BCH_WATERMARK_stripe, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h); + if (!ret) + goto allocate_buf; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, ENOMEM)) + goto err; + + /* + * Not enough buckets available for a full stripe: we must reuse an + * existing stripe: + */ + while (1) { + ret = __bch2_ec_stripe_head_reuse(trans, h); + if (!ret) + break; + if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) + goto err; + + if (watermark == BCH_WATERMARK_copygc) { + ret = new_stripe_alloc_buckets(trans, h, watermark, NULL) ?: + __bch2_ec_stripe_head_reserve(trans, h); + if (ret) + goto err; + goto allocate_buf; + } + + /* XXX freelist_wait? */ + closure_wait(&c->freelist_wait, cl); + waiting = true; + } + + if (waiting) + closure_wake_up(&c->freelist_wait); +alloc_existing: + /* + * Retry allocating buckets, with the watermark for this + * particular write: + */ + ret = new_stripe_alloc_buckets(trans, h, watermark, cl); + if (ret) + goto err; + +allocate_buf: + ret = ec_stripe_buf_init(&h->s->new_stripe, 0, h->blocksize); + if (ret) + goto err; + + h->s->allocated = true; +allocated: + BUG_ON(!h->s->idx); + BUG_ON(!h->s->new_stripe.data[0]); + BUG_ON(trans->restarted); + return h; +err: + bch2_ec_stripe_head_put(c, h); + return ERR_PTR(ret); +} + +static void __bch2_ec_stop(struct bch_fs *c, struct bch_dev *ca) +{ + struct ec_stripe_head *h; + struct open_bucket *ob; + unsigned i; + + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { + mutex_lock(&h->lock); + if (!h->s) + goto unlock; + + if (!ca) + goto found; + + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) { + if (!h->s->blocks[i]) + continue; + + ob = c->open_buckets + h->s->blocks[i]; + if (ob->dev == ca->dev_idx) + goto found; + } + goto unlock; +found: + h->s->err = -BCH_ERR_erofs_no_writes; + ec_stripe_set_pending(c, h); +unlock: + mutex_unlock(&h->lock); + } + mutex_unlock(&c->ec_stripe_head_lock); +} + +void bch2_ec_stop_dev(struct bch_fs *c, struct bch_dev *ca) +{ + __bch2_ec_stop(c, ca); +} + +void bch2_fs_ec_stop(struct bch_fs *c) +{ + __bch2_ec_stop(c, NULL); +} + +static bool bch2_fs_ec_flush_done(struct bch_fs *c) +{ + bool ret; + + mutex_lock(&c->ec_stripe_new_lock); + ret = list_empty(&c->ec_stripe_new_list); + mutex_unlock(&c->ec_stripe_new_lock); + + return ret; +} + +void bch2_fs_ec_flush(struct bch_fs *c) +{ + wait_event(c->ec_stripe_new_wait, bch2_fs_ec_flush_done(c)); +} + +int bch2_stripes_read(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + const struct bch_stripe *s; + struct stripe *m; + unsigned i; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_stripes, POS_MIN, + BTREE_ITER_PREFETCH, k, ret) { + if (k.k->type != KEY_TYPE_stripe) + continue; + + ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); + if (ret) + break; + + s = bkey_s_c_to_stripe(k).v; + + m = genradix_ptr(&c->stripes, k.k->p.offset); + m->sectors = le16_to_cpu(s->sectors); + m->algorithm = s->algorithm; + m->nr_blocks = s->nr_blocks; + m->nr_redundant = s->nr_redundant; + m->blocks_nonempty = 0; + + for (i = 0; i < s->nr_blocks; i++) + 
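			/*
			 * A block counts as nonempty while any of its sectors
			 * still hold live data; blocks_nonempty is what orders
			 * the stripes heap, so empty stripes sort to the front
			 * for deletion and partly-empty ones can be picked for
			 * reuse.
			 */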
m->blocks_nonempty += !!stripe_blockcount_get(s, i); + + bch2_stripes_heap_insert(c, m, k.k->p.offset); + } + bch2_trans_iter_exit(&trans, &iter); + + bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); + + return ret; +} + +void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) +{ + ec_stripes_heap *h = &c->ec_stripes_heap; + struct stripe *m; + size_t i; + + mutex_lock(&c->ec_stripes_heap_lock); + for (i = 0; i < min_t(size_t, h->used, 50); i++) { + m = genradix_ptr(&c->stripes, h->data[i].idx); + + prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, + h->data[i].blocks_nonempty, + m->nr_blocks - m->nr_redundant, + m->nr_redundant); + if (bch2_stripe_is_open(c, h->data[i].idx)) + prt_str(out, " open"); + prt_newline(out); + } + mutex_unlock(&c->ec_stripes_heap_lock); +} + +void bch2_new_stripes_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct ec_stripe_head *h; + struct ec_stripe_new *s; + + mutex_lock(&c->ec_stripe_head_lock); + list_for_each_entry(h, &c->ec_stripe_head_list, list) { + prt_printf(out, "target %u algo %u redundancy %u %s:\n", + h->target, h->algo, h->redundancy, + bch2_watermarks[h->watermark]); + + if (h->s) + prt_printf(out, "\tidx %llu blocks %u+%u allocated %u\n", + h->s->idx, h->s->nr_data, h->s->nr_parity, + bitmap_weight(h->s->blocks_allocated, + h->s->nr_data)); + } + mutex_unlock(&c->ec_stripe_head_lock); + + prt_printf(out, "in flight:\n"); + + mutex_lock(&c->ec_stripe_new_lock); + list_for_each_entry(s, &c->ec_stripe_new_list, list) { + prt_printf(out, "\tidx %llu blocks %u+%u ref %u %u %s\n", + s->idx, s->nr_data, s->nr_parity, + atomic_read(&s->ref[STRIPE_REF_io]), + atomic_read(&s->ref[STRIPE_REF_stripe]), + bch2_watermarks[s->h->watermark]); + } + mutex_unlock(&c->ec_stripe_new_lock); +} + +void bch2_fs_ec_exit(struct bch_fs *c) +{ + struct ec_stripe_head *h; + unsigned i; + + while (1) { + mutex_lock(&c->ec_stripe_head_lock); + h = list_first_entry_or_null(&c->ec_stripe_head_list, + struct ec_stripe_head, list); + if (h) + list_del(&h->list); + mutex_unlock(&c->ec_stripe_head_lock); + if (!h) + break; + + if (h->s) { + for (i = 0; i < h->s->new_stripe.key.v.nr_blocks; i++) + BUG_ON(h->s->blocks[i]); + + kfree(h->s); + } + kfree(h); + } + + BUG_ON(!list_empty(&c->ec_stripe_new_list)); + + free_heap(&c->ec_stripes_heap); + genradix_free(&c->stripes); + bioset_exit(&c->ec_bioset); +} + +void bch2_fs_ec_init_early(struct bch_fs *c) +{ + spin_lock_init(&c->ec_stripes_new_lock); + mutex_init(&c->ec_stripes_heap_lock); + + INIT_LIST_HEAD(&c->ec_stripe_head_list); + mutex_init(&c->ec_stripe_head_lock); + + INIT_LIST_HEAD(&c->ec_stripe_new_list); + mutex_init(&c->ec_stripe_new_lock); + init_waitqueue_head(&c->ec_stripe_new_wait); + + INIT_WORK(&c->ec_stripe_create_work, ec_stripe_create_work); + INIT_WORK(&c->ec_stripe_delete_work, ec_stripe_delete_work); +} + +int bch2_fs_ec_init(struct bch_fs *c) +{ + return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), + BIOSET_NEED_BVECS); +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h new file mode 100644 index 000000000..1b1848e5f --- /dev/null +++ b/fs/bcachefs/ec.h @@ -0,0 +1,263 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_H +#define _BCACHEFS_EC_H + +#include "ec_types.h" +#include "buckets_types.h" +#include "extents_types.h" + +enum bkey_invalid_flags; + +int bch2_stripe_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_stripe_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + 
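The helpers that follow encode the variable-length layout of a stripe key's value: the fixed bch_stripe header, then nr_blocks extent pointers, then one checksum array per block, then one u16 live-sector count per block. As a rough stand-alone illustration of that arithmetic (an editor's sketch, not part of the patch: it mirrors the shape of stripe_csums_per_device() and stripe_csum_offset() with assumed sizes rather than calling them), a 4+2 stripe of 256 sectors with 64-sector checksum granularity and crc32c checksums works out as follows:

/* demo_stripe_layout.c -- editor's sketch; all sizes below are assumptions */
#include <stdio.h>

#define DEMO_HDR_BYTES	32U	/* stands in for sizeof(struct bch_stripe) */
#define DEMO_PTR_BYTES	8U	/* stands in for sizeof(struct bch_extent_ptr) */
#define DEMO_CSUM_BYTES	4U	/* crc32c checksum width */

/* same shape as stripe_csums_per_device(): one csum per granularity chunk */
static unsigned demo_csums_per_block(unsigned sectors, unsigned gran_bits)
{
	return (sectors + (1U << gran_bits) - 1) >> gran_bits;
}

/* same shape as stripe_csum_offset(): header, ptrs, then packed csum arrays */
static unsigned demo_csum_offset(unsigned nr_blocks, unsigned sectors,
				 unsigned gran_bits, unsigned block,
				 unsigned csum_idx)
{
	return DEMO_HDR_BYTES +
	       DEMO_PTR_BYTES * nr_blocks +
	       (block * demo_csums_per_block(sectors, gran_bits) + csum_idx) *
	       DEMO_CSUM_BYTES;
}

int main(void)
{
	unsigned nr_blocks = 6, sectors = 256, gran_bits = 6; /* 4+2, 64-sector chunks */

	/* the per-block u16 counts begin where block nr_blocks's csums would start */
	unsigned blockcounts = demo_csum_offset(nr_blocks, sectors, gran_bits,
						nr_blocks, 0);

	printf("checksums per block: %u\n",
	       demo_csums_per_block(sectors, gran_bits));	/* 4 */
	printf("block counts start at byte %u\n", blockcounts);	/* 32 + 48 + 96 = 176 */
	return 0;
}

This layout is also why ec_stripe_key_init() above bumps csum_granularity_bits until stripe_val_u64s() fits in BKEY_VAL_U64s_MAX: a larger granularity means fewer checksums per block and a smaller key value.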
+#define bch2_bkey_ops_stripe ((struct bkey_ops) { \ + .key_invalid = bch2_stripe_invalid, \ + .val_to_text = bch2_stripe_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_stripe, \ + .atomic_trigger = bch2_mark_stripe, \ + .min_val_size = 8, \ +}) + +static inline unsigned stripe_csums_per_device(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(le16_to_cpu(s->sectors), + 1 << s->csum_granularity_bits); +} + +static inline unsigned stripe_csum_offset(const struct bch_stripe *s, + unsigned dev, unsigned csum_idx) +{ + unsigned csum_bytes = bch_crc_bytes[s->csum_type]; + + return sizeof(struct bch_stripe) + + sizeof(struct bch_extent_ptr) * s->nr_blocks + + (dev * stripe_csums_per_device(s) + csum_idx) * csum_bytes; +} + +static inline unsigned stripe_blockcount_offset(const struct bch_stripe *s, + unsigned idx) +{ + return stripe_csum_offset(s, s->nr_blocks, 0) + + sizeof(u16) * idx; +} + +static inline unsigned stripe_blockcount_get(const struct bch_stripe *s, + unsigned idx) +{ + return le16_to_cpup((void *) s + stripe_blockcount_offset(s, idx)); +} + +static inline void stripe_blockcount_set(struct bch_stripe *s, + unsigned idx, unsigned v) +{ + __le16 *p = (void *) s + stripe_blockcount_offset(s, idx); + + *p = cpu_to_le16(v); +} + +static inline unsigned stripe_val_u64s(const struct bch_stripe *s) +{ + return DIV_ROUND_UP(stripe_blockcount_offset(s, s->nr_blocks), + sizeof(u64)); +} + +static inline void *stripe_csum(struct bch_stripe *s, + unsigned block, unsigned csum_idx) +{ + EBUG_ON(block >= s->nr_blocks); + EBUG_ON(csum_idx >= stripe_csums_per_device(s)); + + return (void *) s + stripe_csum_offset(s, block, csum_idx); +} + +static inline struct bch_csum stripe_csum_get(struct bch_stripe *s, + unsigned block, unsigned csum_idx) +{ + struct bch_csum csum = { 0 }; + + memcpy(&csum, stripe_csum(s, block, csum_idx), bch_crc_bytes[s->csum_type]); + return csum; +} + +static inline void stripe_csum_set(struct bch_stripe *s, + unsigned block, unsigned csum_idx, + struct bch_csum csum) +{ + memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); +} + +static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, + const struct bch_extent_ptr *data_ptr, + unsigned sectors) +{ + return data_ptr->dev == stripe_ptr->dev && + data_ptr->gen == stripe_ptr->gen && + data_ptr->offset >= stripe_ptr->offset && + data_ptr->offset < stripe_ptr->offset + sectors; +} + +static inline bool bch2_ptr_matches_stripe(const struct bch_stripe *s, + struct extent_ptr_decoded p) +{ + unsigned nr_data = s->nr_blocks - s->nr_redundant; + + BUG_ON(!p.has_ec); + + if (p.ec.block >= nr_data) + return false; + + return __bch2_ptr_matches_stripe(&s->ptrs[p.ec.block], &p.ptr, + le16_to_cpu(s->sectors)); +} + +static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, + struct extent_ptr_decoded p) +{ + unsigned nr_data = m->nr_blocks - m->nr_redundant; + + BUG_ON(!p.has_ec); + + if (p.ec.block >= nr_data) + return false; + + return __bch2_ptr_matches_stripe(&m->ptrs[p.ec.block], &p.ptr, + m->sectors); +} + +struct bch_read_bio; + +struct ec_stripe_buf { + /* might not be buffering the entire stripe: */ + unsigned offset; + unsigned size; + unsigned long valid[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + + void *data[BCH_BKEY_PTRS_MAX]; + + union { + struct bkey_i_stripe key; + u64 pad[255]; + }; +}; + +struct ec_stripe_head; + +enum ec_stripe_ref { + STRIPE_REF_io, + STRIPE_REF_stripe, + STRIPE_REF_NR +}; + +struct ec_stripe_new { + struct 
bch_fs *c; + struct ec_stripe_head *h; + struct mutex lock; + struct list_head list; + + struct hlist_node hash; + u64 idx; + + struct closure iodone; + + atomic_t ref[STRIPE_REF_NR]; + + int err; + + u8 nr_data; + u8 nr_parity; + bool allocated; + bool pending; + bool have_existing_stripe; + + unsigned long blocks_gotten[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + unsigned long blocks_allocated[BITS_TO_LONGS(BCH_BKEY_PTRS_MAX)]; + open_bucket_idx_t blocks[BCH_BKEY_PTRS_MAX]; + struct disk_reservation res; + + struct ec_stripe_buf new_stripe; + struct ec_stripe_buf existing_stripe; +}; + +struct ec_stripe_head { + struct list_head list; + struct mutex lock; + + unsigned target; + unsigned algo; + unsigned redundancy; + enum bch_watermark watermark; + + struct bch_devs_mask devs; + unsigned nr_active_devs; + + unsigned blocksize; + + struct dev_stripe_state block_stripe; + struct dev_stripe_state parity_stripe; + + struct ec_stripe_new *s; +}; + +int bch2_ec_read_extent(struct bch_fs *, struct bch_read_bio *); + +void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); + +void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); + +int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); + +void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); +struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, + unsigned, unsigned, unsigned, + enum bch_watermark, struct closure *); + +void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); +void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); +void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); + +void bch2_do_stripe_deletes(struct bch_fs *); +void bch2_ec_do_stripe_creates(struct bch_fs *); +void bch2_ec_stripe_new_free(struct bch_fs *, struct ec_stripe_new *); + +static inline void ec_stripe_new_get(struct ec_stripe_new *s, + enum ec_stripe_ref ref) +{ + atomic_inc(&s->ref[ref]); +} + +static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, + enum ec_stripe_ref ref) +{ + BUG_ON(atomic_read(&s->ref[ref]) <= 0); + + if (atomic_dec_and_test(&s->ref[ref])) + switch (ref) { + case STRIPE_REF_stripe: + bch2_ec_stripe_new_free(c, s); + break; + case STRIPE_REF_io: + bch2_ec_do_stripe_creates(c); + break; + default: + unreachable(); + } +} + +void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); +void bch2_fs_ec_stop(struct bch_fs *); +void bch2_fs_ec_flush(struct bch_fs *); + +int bch2_stripes_read(struct bch_fs *); + +void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); +void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); + +void bch2_fs_ec_exit(struct bch_fs *); +void bch2_fs_ec_init_early(struct bch_fs *); +int bch2_fs_ec_init(struct bch_fs *); + +#endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h new file mode 100644 index 000000000..e2b02a82d --- /dev/null +++ b/fs/bcachefs/ec_types.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EC_TYPES_H +#define _BCACHEFS_EC_TYPES_H + +#include "bcachefs_format.h" + +struct bch_replicas_padded { + struct bch_replicas_entry e; + u8 pad[BCH_BKEY_PTRS_MAX]; +}; + +struct stripe { + size_t heap_idx; + u16 sectors; + u8 algorithm; + u8 nr_blocks; + u8 nr_redundant; + u8 blocks_nonempty; +}; + +struct gc_stripe { + u16 sectors; + + u8 nr_blocks; + u8 nr_redundant; + + unsigned alive:1; /* does a corresponding key exist in stripes btree? 
*/ + u16 block_sectors[BCH_BKEY_PTRS_MAX]; + struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; + + struct bch_replicas_padded r; +}; + +struct ec_stripe_heap_entry { + size_t idx; + unsigned blocks_nonempty; +}; + +typedef HEAP(struct ec_stripe_heap_entry) ec_stripes_heap; + +#endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c new file mode 100644 index 000000000..dc906fc91 --- /dev/null +++ b/fs/bcachefs/errcode.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "errcode.h" + +#include + +static const char * const bch2_errcode_strs[] = { +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = #err, + BCH_ERRCODES() +#undef x + NULL +}; + +#define BCH_ERR_0 0 + +static unsigned bch2_errcode_parents[] = { +#define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, + BCH_ERRCODES() +#undef x +}; + +const char *bch2_err_str(int err) +{ + const char *errstr; + + err = abs(err); + + BUG_ON(err >= BCH_ERR_MAX); + + if (err >= BCH_ERR_START) + errstr = bch2_errcode_strs[err - BCH_ERR_START]; + else if (err) + errstr = errname(err); + else + errstr = "(No error)"; + return errstr ?: "(Invalid error)"; +} + +bool __bch2_err_matches(int err, int class) +{ + err = abs(err); + class = abs(class); + + BUG_ON(err >= BCH_ERR_MAX); + BUG_ON(class >= BCH_ERR_MAX); + + while (err >= BCH_ERR_START && err != class) + err = bch2_errcode_parents[err - BCH_ERR_START]; + + return err == class; +} + +int __bch2_err_class(int err) +{ + err = -err; + BUG_ON((unsigned) err >= BCH_ERR_MAX); + + while (err >= BCH_ERR_START && bch2_errcode_parents[err - BCH_ERR_START]) + err = bch2_errcode_parents[err - BCH_ERR_START]; + + return -err; +} diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h new file mode 100644 index 000000000..d5277ec73 --- /dev/null +++ b/fs/bcachefs/errcode.h @@ -0,0 +1,246 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERRCODE_H +#define _BCACHEFS_ERRCODE_H + +#define BCH_ERRCODES() \ + x(ENOMEM, ENOMEM_stripe_buf) \ + x(ENOMEM, ENOMEM_replicas_table) \ + x(ENOMEM, ENOMEM_cpu_replicas) \ + x(ENOMEM, ENOMEM_replicas_gc) \ + x(ENOMEM, ENOMEM_disk_groups_validate) \ + x(ENOMEM, ENOMEM_disk_groups_to_cpu) \ + x(ENOMEM, ENOMEM_mark_snapshot) \ + x(ENOMEM, ENOMEM_mark_stripe) \ + x(ENOMEM, ENOMEM_mark_stripe_ptr) \ + x(ENOMEM, ENOMEM_btree_key_cache_create) \ + x(ENOMEM, ENOMEM_btree_key_cache_fill) \ + x(ENOMEM, ENOMEM_btree_key_cache_insert) \ + x(ENOMEM, ENOMEM_trans_kmalloc) \ + x(ENOMEM, ENOMEM_trans_log_msg) \ + x(ENOMEM, ENOMEM_do_encrypt) \ + x(ENOMEM, ENOMEM_ec_read_extent) \ + x(ENOMEM, ENOMEM_ec_stripe_mem_alloc) \ + x(ENOMEM, ENOMEM_ec_new_stripe_alloc) \ + x(ENOMEM, ENOMEM_fs_btree_cache_init) \ + x(ENOMEM, ENOMEM_fs_btree_key_cache_init) \ + x(ENOMEM, ENOMEM_fs_counters_init) \ + x(ENOMEM, ENOMEM_fs_btree_write_buffer_init) \ + x(ENOMEM, ENOMEM_io_clock_init) \ + x(ENOMEM, ENOMEM_blacklist_table_init) \ + x(ENOMEM, ENOMEM_sb_realloc_injected) \ + x(ENOMEM, ENOMEM_sb_bio_realloc) \ + x(ENOMEM, ENOMEM_sb_buf_realloc) \ + x(ENOMEM, ENOMEM_sb_journal_validate) \ + x(ENOMEM, ENOMEM_sb_journal_v2_validate) \ + x(ENOMEM, ENOMEM_journal_entry_add) \ + x(ENOMEM, ENOMEM_journal_read_buf_realloc) \ + x(ENOMEM, ENOMEM_btree_interior_update_worker_init)\ + x(ENOMEM, ENOMEM_btree_interior_update_pool_init) \ + x(ENOMEM, ENOMEM_bio_read_init) \ + x(ENOMEM, ENOMEM_bio_read_split_init) \ + x(ENOMEM, ENOMEM_bio_write_init) \ + x(ENOMEM, ENOMEM_bio_bounce_pages_init) \ + x(ENOMEM, 
ENOMEM_writepage_bioset_init) \ + x(ENOMEM, ENOMEM_dio_read_bioset_init) \ + x(ENOMEM, ENOMEM_dio_write_bioset_init) \ + x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ + x(ENOMEM, ENOMEM_promote_table_init) \ + x(ENOMEM, ENOMEM_compression_bounce_read_init) \ + x(ENOMEM, ENOMEM_compression_bounce_write_init) \ + x(ENOMEM, ENOMEM_compression_workspace_init) \ + x(ENOMEM, ENOMEM_decompression_workspace_init) \ + x(ENOMEM, ENOMEM_bucket_gens) \ + x(ENOMEM, ENOMEM_buckets_nouse) \ + x(ENOMEM, ENOMEM_usage_init) \ + x(ENOMEM, ENOMEM_btree_node_read_all_replicas) \ + x(ENOMEM, ENOMEM_btree_node_reclaim) \ + x(ENOMEM, ENOMEM_btree_node_mem_alloc) \ + x(ENOMEM, ENOMEM_btree_cache_cannibalize_lock) \ + x(ENOMEM, ENOMEM_buckets_waiting_for_journal_init)\ + x(ENOMEM, ENOMEM_buckets_waiting_for_journal_set) \ + x(ENOMEM, ENOMEM_set_nr_journal_buckets) \ + x(ENOMEM, ENOMEM_dev_journal_init) \ + x(ENOMEM, ENOMEM_journal_pin_fifo) \ + x(ENOMEM, ENOMEM_journal_buf) \ + x(ENOMEM, ENOMEM_gc_start) \ + x(ENOMEM, ENOMEM_gc_alloc_start) \ + x(ENOMEM, ENOMEM_gc_reflink_start) \ + x(ENOMEM, ENOMEM_gc_gens) \ + x(ENOMEM, ENOMEM_gc_repair_key) \ + x(ENOMEM, ENOMEM_fsck_extent_ends_at) \ + x(ENOMEM, ENOMEM_fsck_add_nlink) \ + x(ENOMEM, ENOMEM_journal_key_insert) \ + x(ENOMEM, ENOMEM_journal_keys_sort) \ + x(ENOMEM, ENOMEM_journal_replay) \ + x(ENOMEM, ENOMEM_read_superblock_clean) \ + x(ENOMEM, ENOMEM_fs_alloc) \ + x(ENOMEM, ENOMEM_fs_name_alloc) \ + x(ENOMEM, ENOMEM_fs_other_alloc) \ + x(ENOMEM, ENOMEM_dev_alloc) \ + x(ENOSPC, ENOSPC_disk_reservation) \ + x(ENOSPC, ENOSPC_bucket_alloc) \ + x(ENOSPC, ENOSPC_disk_label_add) \ + x(ENOSPC, ENOSPC_stripe_create) \ + x(ENOSPC, ENOSPC_inode_create) \ + x(ENOSPC, ENOSPC_str_hash_create) \ + x(ENOSPC, ENOSPC_snapshot_create) \ + x(ENOSPC, ENOSPC_subvolume_create) \ + x(ENOSPC, ENOSPC_sb) \ + x(ENOSPC, ENOSPC_sb_journal) \ + x(ENOSPC, ENOSPC_sb_journal_seq_blacklist) \ + x(ENOSPC, ENOSPC_sb_quota) \ + x(ENOSPC, ENOSPC_sb_replicas) \ + x(ENOSPC, ENOSPC_sb_members) \ + x(ENOSPC, ENOSPC_sb_crypt) \ + x(ENOSPC, ENOSPC_btree_slot) \ + x(ENOSPC, ENOSPC_snapshot_tree) \ + x(ENOENT, ENOENT_bkey_type_mismatch) \ + x(ENOENT, ENOENT_str_hash_lookup) \ + x(ENOENT, ENOENT_str_hash_set_must_replace) \ + x(ENOENT, ENOENT_inode) \ + x(ENOENT, ENOENT_not_subvol) \ + x(ENOENT, ENOENT_directory_dead) \ + x(ENOENT, ENOENT_subvolume) \ + x(ENOENT, ENOENT_snapshot_tree) \ + x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ + x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_idx_not_found) \ + x(0, open_buckets_empty) \ + x(0, freelist_empty) \ + x(BCH_ERR_freelist_empty, no_buckets_found) \ + x(0, transaction_restart) \ + x(BCH_ERR_transaction_restart, transaction_restart_fault_inject) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_path_intent) \ + x(BCH_ERR_transaction_restart, transaction_restart_relock_after_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_too_many_iters) \ + x(BCH_ERR_transaction_restart, transaction_restart_lock_node_reused) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_relock) \ + x(BCH_ERR_transaction_restart, transaction_restart_fill_mem_alloc_fail)\ + x(BCH_ERR_transaction_restart, transaction_restart_mem_realloced) \ + x(BCH_ERR_transaction_restart, transaction_restart_in_traverse_all) \ + x(BCH_ERR_transaction_restart, transaction_restart_would_deadlock) \ + x(BCH_ERR_transaction_restart, 
transaction_restart_would_deadlock_write)\ + x(BCH_ERR_transaction_restart, transaction_restart_deadlock_recursion_limit)\ + x(BCH_ERR_transaction_restart, transaction_restart_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_upgrade) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_fill) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_raced) \ + x(BCH_ERR_transaction_restart, transaction_restart_key_cache_realloced)\ + x(BCH_ERR_transaction_restart, transaction_restart_journal_preres_get) \ + x(BCH_ERR_transaction_restart, transaction_restart_split_race) \ + x(BCH_ERR_transaction_restart, transaction_restart_write_buffer_flush) \ + x(BCH_ERR_transaction_restart, transaction_restart_nested) \ + x(0, no_btree_node) \ + x(BCH_ERR_no_btree_node, no_btree_node_relock) \ + x(BCH_ERR_no_btree_node, no_btree_node_upgrade) \ + x(BCH_ERR_no_btree_node, no_btree_node_drop) \ + x(BCH_ERR_no_btree_node, no_btree_node_lock_root) \ + x(BCH_ERR_no_btree_node, no_btree_node_up) \ + x(BCH_ERR_no_btree_node, no_btree_node_down) \ + x(BCH_ERR_no_btree_node, no_btree_node_init) \ + x(BCH_ERR_no_btree_node, no_btree_node_cached) \ + x(BCH_ERR_no_btree_node, no_btree_node_srcu_reset) \ + x(0, btree_insert_fail) \ + x(BCH_ERR_btree_insert_fail, btree_insert_btree_node_full) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \ + x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \ + x(0, backpointer_to_overwritten_btree_node) \ + x(0, lock_fail_root_changed) \ + x(0, journal_reclaim_would_deadlock) \ + x(EINVAL, fsck) \ + x(BCH_ERR_fsck, fsck_fix) \ + x(BCH_ERR_fsck, fsck_ignore) \ + x(BCH_ERR_fsck, fsck_errors_not_fixed) \ + x(BCH_ERR_fsck, fsck_repair_unimplemented) \ + x(BCH_ERR_fsck, fsck_repair_impossible) \ + x(0, need_snapshot_cleanup) \ + x(0, need_topology_repair) \ + x(0, unwritten_extent_update) \ + x(EINVAL, device_state_not_allowed) \ + x(EINVAL, member_info_missing) \ + x(EINVAL, mismatched_block_size) \ + x(EINVAL, block_size_too_small) \ + x(EINVAL, bucket_size_too_small) \ + x(EINVAL, device_size_too_small) \ + x(EINVAL, device_not_a_member_of_filesystem) \ + x(EINVAL, device_has_been_removed) \ + x(EINVAL, device_already_online) \ + x(EINVAL, insufficient_devices_to_start) \ + x(EINVAL, invalid) \ + x(EROFS, erofs_trans_commit) \ + x(EROFS, erofs_no_writes) \ + x(EROFS, erofs_journal_err) \ + x(EROFS, erofs_sb_err) \ + x(EROFS, erofs_unfixed_errors) \ + x(EROFS, erofs_norecovery) \ + x(EROFS, erofs_nochanges) \ + x(EROFS, insufficient_devices) \ + x(0, operation_blocked) \ + x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ + x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ + x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ + x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ + x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ + x(BCH_ERR_invalid, invalid_sb) \ + x(BCH_ERR_invalid_sb, invalid_sb_magic) \ + x(BCH_ERR_invalid_sb, invalid_sb_version) \ + x(BCH_ERR_invalid_sb, invalid_sb_features) \ + x(BCH_ERR_invalid_sb, invalid_sb_too_big) \ + x(BCH_ERR_invalid_sb, invalid_sb_csum_type) \ + x(BCH_ERR_invalid_sb, invalid_sb_csum) \ + x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ + x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ + x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ + x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ + x(BCH_ERR_invalid_sb, 
invalid_sb_time_precision) \ + x(BCH_ERR_invalid_sb, invalid_sb_field_size) \ + x(BCH_ERR_invalid_sb, invalid_sb_layout) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_type) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_nr_superblocks) \ + x(BCH_ERR_invalid_sb_layout, invalid_sb_layout_superblocks_overlap) \ + x(BCH_ERR_invalid_sb, invalid_sb_members_missing) \ + x(BCH_ERR_invalid_sb, invalid_sb_members) \ + x(BCH_ERR_invalid_sb, invalid_sb_disk_groups) \ + x(BCH_ERR_invalid_sb, invalid_sb_replicas) \ + x(BCH_ERR_invalid_sb, invalid_sb_journal) \ + x(BCH_ERR_invalid_sb, invalid_sb_journal_seq_blacklist) \ + x(BCH_ERR_invalid_sb, invalid_sb_crypt) \ + x(BCH_ERR_invalid_sb, invalid_sb_clean) \ + x(BCH_ERR_invalid_sb, invalid_sb_quota) \ + x(BCH_ERR_invalid, invalid_bkey) \ + x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ + +enum bch_errcode { + BCH_ERR_START = 2048, +#define x(class, err) BCH_ERR_##err, + BCH_ERRCODES() +#undef x + BCH_ERR_MAX +}; + +const char *bch2_err_str(int); +bool __bch2_err_matches(int, int); + +static inline bool _bch2_err_matches(int err, int class) +{ + return err < 0 && __bch2_err_matches(err, class); +} + +#define bch2_err_matches(_err, _class) \ +({ \ + BUILD_BUG_ON(!__builtin_constant_p(_class)); \ + unlikely(_bch2_err_matches(_err, _class)); \ +}) + +int __bch2_err_class(int); + +static inline long bch2_err_class(long err) +{ + return err < 0 ? __bch2_err_class(err) : err; +} + +#endif /* _BCACHFES_ERRCODE_H */ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c new file mode 100644 index 000000000..685464b8c --- /dev/null +++ b/fs/bcachefs/error.c @@ -0,0 +1,297 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "error.h" +#include "io.h" +#include "super.h" + +#define FSCK_ERR_RATELIMIT_NR 10 + +bool bch2_inconsistent_error(struct bch_fs *c) +{ + set_bit(BCH_FS_ERROR, &c->flags); + + switch (c->opts.errors) { + case BCH_ON_ERROR_continue: + return false; + case BCH_ON_ERROR_ro: + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "inconsistency detected - emergency read only"); + return true; + case BCH_ON_ERROR_panic: + panic(bch2_fmt(c, "panic after error")); + return true; + default: + BUG(); + } +} + +void bch2_topology_error(struct bch_fs *c) +{ + if (!test_bit(BCH_FS_TOPOLOGY_REPAIR_DONE, &c->flags)) + return; + + set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags); + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + bch2_inconsistent_error(c); +} + +void bch2_fatal_error(struct bch_fs *c) +{ + if (bch2_fs_emergency_read_only(c)) + bch_err(c, "fatal error - emergency read only"); +} + +void bch2_io_error_work(struct work_struct *work) +{ + struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); + struct bch_fs *c = ca->fs; + bool dev; + + down_write(&c->state_lock); + dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + if (dev + ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED) + : bch2_fs_emergency_read_only(c)) + bch_err(ca, + "too many IO errors, setting %s RO", + dev ? 
"device" : "filesystem"); + up_write(&c->state_lock); +} + +void bch2_io_error(struct bch_dev *ca) +{ + //queue_work(system_long_wq, &ca->io_error_work); +} + +enum ask_yn { + YN_NO, + YN_YES, + YN_ALLNO, + YN_ALLYES, +}; + +#ifdef __KERNEL__ +#define bch2_fsck_ask_yn() YN_NO +#else + +#include "tools-util.h" + +enum ask_yn bch2_fsck_ask_yn(void) +{ + char *buf = NULL; + size_t buflen = 0; + bool ret; + + while (true) { + fputs(" (y,n, or Y,N for all errors of this type) ", stdout); + fflush(stdout); + + if (getline(&buf, &buflen, stdin) < 0) + die("error reading from standard input"); + + strim(buf); + if (strlen(buf) != 1) + continue; + + switch (buf[0]) { + case 'n': + return YN_NO; + case 'y': + return YN_YES; + case 'N': + return YN_ALLNO; + case 'Y': + return YN_ALLYES; + } + } + + free(buf); + return ret; +} + +#endif + +static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) +{ + struct fsck_err_state *s; + + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) + return NULL; + + list_for_each_entry(s, &c->fsck_errors, list) + if (s->fmt == fmt) { + /* + * move it to the head of the list: repeated fsck errors + * are common + */ + list_move(&s->list, &c->fsck_errors); + return s; + } + + s = kzalloc(sizeof(*s), GFP_NOFS); + if (!s) { + if (!c->fsck_alloc_err) + bch_err(c, "kmalloc err, cannot ratelimit fsck errs"); + c->fsck_alloc_err = true; + return NULL; + } + + INIT_LIST_HEAD(&s->list); + s->fmt = fmt; + list_add(&s->list, &c->fsck_errors); + return s; +} + +int bch2_fsck_err(struct bch_fs *c, unsigned flags, const char *fmt, ...) +{ + struct fsck_err_state *s = NULL; + va_list args; + bool print = true, suppressing = false, inconsistent = false; + struct printbuf buf = PRINTBUF, *out = &buf; + int ret = -BCH_ERR_fsck_ignore; + + va_start(args, fmt); + prt_vprintf(out, fmt, args); + va_end(args); + + mutex_lock(&c->fsck_error_lock); + s = fsck_err_get(c, fmt); + if (s) { + /* + * We may be called multiple times for the same error on + * transaction restart - this memoizes instead of asking the user + * multiple times for the same error: + */ + if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { + ret = s->ret; + mutex_unlock(&c->fsck_error_lock); + printbuf_exit(&buf); + return ret; + } + + kfree(s->last_msg); + s->last_msg = kstrdup(buf.buf, GFP_KERNEL); + + if (c->opts.ratelimit_errors && + !(flags & FSCK_NO_RATELIMIT) && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + suppressing = true; + else + print = false; + } + + s->nr++; + } + +#ifdef BCACHEFS_LOG_PREFIX + if (!strncmp(fmt, "bcachefs:", 9)) + prt_printf(out, bch2_log_msg(c, "")); +#endif + + if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) { + if (c->opts.errors != BCH_ON_ERROR_continue || + !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { + prt_str(out, ", shutting down"); + inconsistent = true; + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", continuing"); + ret = -BCH_ERR_fsck_ignore; + } + } else if (c->opts.fix_errors == FSCK_FIX_exit) { + prt_str(out, ", exiting"); + ret = -BCH_ERR_fsck_errors_not_fixed; + } else if (flags & FSCK_CAN_FIX) { + int fix = s && s->fix + ? s->fix + : c->opts.fix_errors; + + if (fix == FSCK_FIX_ask) { + int ask; + + prt_str(out, ": fix?"); + bch2_print_string_as_lines(KERN_ERR, out->buf); + print = false; + + ask = bch2_fsck_ask_yn(); + + if (ask >= YN_ALLNO && s) + s->fix = ask == YN_ALLNO + ? FSCK_FIX_no + : FSCK_FIX_yes; + + ret = ask & 1 + ? 
-BCH_ERR_fsck_fix + : -BCH_ERR_fsck_ignore; + } else if (fix == FSCK_FIX_yes || + (c->opts.nochanges && + !(flags & FSCK_CAN_IGNORE))) { + prt_str(out, ", fixing"); + ret = -BCH_ERR_fsck_fix; + } else { + prt_str(out, ", not fixing"); + } + } else if (flags & FSCK_NEED_FSCK) { + prt_str(out, " (run fsck to correct)"); + } else { + prt_str(out, " (repair unimplemented)"); + } + + if (ret == -BCH_ERR_fsck_ignore && + (c->opts.fix_errors == FSCK_FIX_exit || + !(flags & FSCK_CAN_IGNORE))) + ret = -BCH_ERR_fsck_errors_not_fixed; + + if (print) + bch2_print_string_as_lines(KERN_ERR, out->buf); + + if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) && + (ret != -BCH_ERR_fsck_fix && + ret != -BCH_ERR_fsck_ignore)) + bch_err(c, "Unable to continue, halting"); + else if (suppressing) + bch_err(c, "Ratelimiting new instances of previous error"); + + if (s) + s->ret = ret; + + mutex_unlock(&c->fsck_error_lock); + + printbuf_exit(&buf); + + if (inconsistent) + bch2_inconsistent_error(c); + + if (ret == -BCH_ERR_fsck_fix) { + set_bit(BCH_FS_ERRORS_FIXED, &c->flags); + } else { + set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags); + set_bit(BCH_FS_ERROR, &c->flags); + } + + return ret; +} + +void bch2_flush_fsck_errs(struct bch_fs *c) +{ + struct fsck_err_state *s, *n; + + mutex_lock(&c->fsck_error_lock); + + list_for_each_entry_safe(s, n, &c->fsck_errors, list) { + if (s->ratelimited && s->last_msg) + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); + + list_del(&s->list); + kfree(s->last_msg); + kfree(s); + } + + mutex_unlock(&c->fsck_error_lock); +} diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h new file mode 100644 index 000000000..7ce954005 --- /dev/null +++ b/fs/bcachefs/error.h @@ -0,0 +1,206 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ERROR_H +#define _BCACHEFS_ERROR_H + +#include +#include + +struct bch_dev; +struct bch_fs; +struct work_struct; + +/* + * XXX: separate out errors that indicate on disk data is inconsistent, and flag + * superblock as such + */ + +/* Error messages: */ + +/* + * Inconsistency errors: The on disk data is inconsistent. If these occur during + * initial recovery, they don't indicate a bug in the running code - we walk all + * the metadata before modifying anything. If they occur at runtime, they + * indicate either a bug in the running code or (less likely) data is being + * silently corrupted under us. + * + * XXX: audit all inconsistent errors and make sure they're all recoverable, in + * BCH_ON_ERROR_CONTINUE mode + */ + +bool bch2_inconsistent_error(struct bch_fs *); + +void bch2_topology_error(struct bch_fs *); + +#define bch2_fs_inconsistent(c, ...) \ +({ \ + bch_err(c, __VA_ARGS__); \ + bch2_inconsistent_error(c); \ +}) + +#define bch2_fs_inconsistent_on(cond, c, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_inconsistent(c, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Later we might want to mark only the particular device inconsistent, not the + * entire filesystem: + */ + +#define bch2_dev_inconsistent(ca, ...) \ +do { \ + bch_err(ca, __VA_ARGS__); \ + bch2_inconsistent_error((ca)->fs); \ +} while (0) + +#define bch2_dev_inconsistent_on(cond, ca, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_dev_inconsistent(ca, __VA_ARGS__); \ + _ret; \ +}) + +/* + * When a transaction update discovers or is causing a fs inconsistency, it's + * helpful to also dump the pending updates: + */ +#define bch2_trans_inconsistent(trans, ...) 
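bch2_fsck_err() above keys its ratelimit and memoization state on the format-string pointer: repeated reports of the same class of error share one fsck_err_state, the last message is remembered so the same question is not re-asked across transaction restarts, and printing is suppressed once FSCK_ERR_RATELIMIT_NR instances have been seen. A stripped-down userspace sketch of just that bookkeeping (a plain array stands in for the kernel's list and mutex; the names here are illustrative):

/* Standalone sketch of the per-format ratelimit state in bch2_fsck_err(). */
#include <stdio.h>
#include <stdbool.h>

#define RATELIMIT_NR	10
#define MAX_FMTS	32

struct err_state {
	const char	*fmt;	/* keyed on the format string pointer */
	unsigned long	nr;	/* how many times we've seen it */
};

static struct err_state states[MAX_FMTS];
static unsigned nr_states;

static struct err_state *state_get(const char *fmt)
{
	for (unsigned i = 0; i < nr_states; i++)
		if (states[i].fmt == fmt)
			return &states[i];

	if (nr_states == MAX_FMTS)
		return NULL;	/* can't track it: never suppress */

	states[nr_states] = (struct err_state) { .fmt = fmt };
	return &states[nr_states++];
}

/* Returns true if the message should actually be printed. */
static bool report_err(const char *fmt)
{
	struct err_state *s = state_get(fmt);

	if (!s)
		return true;

	s->nr++;
	if (s->nr == RATELIMIT_NR + 1)
		printf("ratelimiting further instances of: %s\n", fmt);
	return s->nr <= RATELIMIT_NR;
}

int main(void)
{
	static const char *fmt = "bucket %u has bad gen";
	unsigned printed = 0;

	for (int i = 0; i < 100; i++)
		printed += report_err(fmt);

	printf("printed %u of 100\n", printed);	/* prints 10 of 100 */
	return 0;
}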
\ +({ \ + bch_err(trans->c, __VA_ARGS__); \ + bch2_dump_trans_updates(trans); \ + bch2_inconsistent_error(trans->c); \ +}) + +#define bch2_trans_inconsistent_on(cond, trans, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_trans_inconsistent(trans, __VA_ARGS__); \ + _ret; \ +}) + +/* + * Fsck errors: inconsistency errors we detect at mount time, and should ideally + * be able to repair: + */ + +struct fsck_err_state { + struct list_head list; + const char *fmt; + u64 nr; + bool ratelimited; + int ret; + int fix; + char *last_msg; +}; + +#define FSCK_CAN_FIX (1 << 0) +#define FSCK_CAN_IGNORE (1 << 1) +#define FSCK_NEED_FSCK (1 << 2) +#define FSCK_NO_RATELIMIT (1 << 3) + +__printf(3, 4) __cold +int bch2_fsck_err(struct bch_fs *, unsigned, const char *, ...); +void bch2_flush_fsck_errs(struct bch_fs *); + +#define __fsck_err(c, _flags, msg, ...) \ +({ \ + int _ret = bch2_fsck_err(c, _flags, msg, ##__VA_ARGS__); \ + \ + if (_ret != -BCH_ERR_fsck_fix && \ + _ret != -BCH_ERR_fsck_ignore) { \ + ret = _ret; \ + goto fsck_err; \ + } \ + \ + _ret == -BCH_ERR_fsck_fix; \ +}) + +/* These macros return true if error should be fixed: */ + +/* XXX: mark in superblock that filesystem contains errors, if we ignore: */ + +#define __fsck_err_on(cond, c, _flags, ...) \ + (unlikely(cond) ? __fsck_err(c, _flags, ##__VA_ARGS__) : false) + +#define need_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) + +#define need_fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_IGNORE|FSCK_NEED_FSCK, ##__VA_ARGS__) + +#define mustfix_fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_FIX, ##__VA_ARGS__) + +#define mustfix_fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX, ##__VA_ARGS__) + +#define fsck_err(c, ...) \ + __fsck_err(c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) + +#define fsck_err_on(cond, c, ...) \ + __fsck_err_on(cond, c, FSCK_CAN_FIX|FSCK_CAN_IGNORE, ##__VA_ARGS__) + +/* + * Fatal errors: these don't indicate a bug, but we can't continue running in RW + * mode - pretty much just due to metadata IO errors: + */ + +void bch2_fatal_error(struct bch_fs *); + +#define bch2_fs_fatal_error(c, ...) \ +do { \ + bch_err(c, __VA_ARGS__); \ + bch2_fatal_error(c); \ +} while (0) + +#define bch2_fs_fatal_err_on(cond, c, ...) \ +({ \ + bool _ret = unlikely(!!(cond)); \ + \ + if (_ret) \ + bch2_fs_fatal_error(c, __VA_ARGS__); \ + _ret; \ +}) + +/* + * IO errors: either recoverable metadata IO (because we have replicas), or data + * IO - we need to log it and print out a message, but we don't (necessarily) + * want to shut down the fs: + */ + +void bch2_io_error_work(struct work_struct *); + +/* Does the error handling without logging a message */ +void bch2_io_error(struct bch_dev *); + +#define bch2_dev_io_err_on(cond, ca, ...) \ +({ \ + bool _ret = (cond); \ + \ + if (_ret) { \ + bch_err_dev_ratelimited(ca, __VA_ARGS__); \ + bch2_io_error(ca); \ + } \ + _ret; \ +}) + +#define bch2_dev_inum_io_err_on(cond, ca, ...) 
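The __fsck_err()/fsck_err_on() macros above rely on a calling convention: the macro evaluates to true when the error should be repaired, and on any answer other than fix/ignore it stores the error code in the caller's ret and jumps to a fsck_err: label that every caller provides. A standalone sketch of that control flow, using GNU C statement expressions as the kernel does; ask_to_fix() and the inode check are made up for illustration:

/* Standalone sketch of the fsck_err_on() goto-label convention. */
#include <stdio.h>
#include <stdbool.h>

enum { ERR_fsck_fix = 1, ERR_fsck_ignore = 2, ERR_fatal = 3 };

/* Stand-in for the policy/user decision; here we always choose to fix. */
static int ask_to_fix(const char *msg)
{
	printf("fsck error: %s, fixing\n", msg);
	return ERR_fsck_fix;
}

/*
 * Evaluates to true if the error should be repaired; on a fatal answer it
 * stashes an error code in the caller's `ret` and jumps to its fsck_err
 * label, mirroring __fsck_err()/fsck_err_on() above.
 */
#define fsck_err_on(cond, msg)						\
({									\
	bool _fix = false;						\
	if (cond) {							\
		int _ret = ask_to_fix(msg);				\
		if (_ret != ERR_fsck_fix && _ret != ERR_fsck_ignore) {	\
			ret = -_ret;					\
			goto fsck_err;					\
		}							\
		_fix = _ret == ERR_fsck_fix;				\
	}								\
	_fix;								\
})

static int check_inode(unsigned long i_size, unsigned long i_sectors)
{
	int ret = 0;

	if (fsck_err_on(i_sectors > i_size >> 9,
			"inode i_sectors greater than i_size"))
		i_sectors = i_size >> 9;	/* repair */

	printf("inode ok: %lu sectors\n", i_sectors);
fsck_err:
	return ret;
}

int main(void)
{
	return check_inode(4096, 100);
}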
\ +({ \ + bool _ret = (cond); \ + \ + if (_ret) { \ + bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ + bch2_io_error(ca); \ + } \ + _ret; \ +}) + +#endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c new file mode 100644 index 000000000..21af6fb8c --- /dev/null +++ b/fs/bcachefs/extent_update.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "debug.h" +#include "extents.h" +#include "extent_update.h" + +/* + * This counts the number of iterators to the alloc & ec btrees we'll need + * inserting/removing this extent: + */ +static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + unsigned ret = 0, lru = 0; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + /* Might also be updating LRU btree */ + if (entry->ptr.cached) + lru++; + + fallthrough; + case BCH_EXTENT_ENTRY_stripe_ptr: + ret++; + } + } + + /* + * Updating keys in the alloc btree may also update keys in the + * freespace or discard btrees: + */ + return lru + ret * 2; +} + +static int count_iters_for_insert(struct btree_trans *trans, + struct bkey_s_c k, + unsigned offset, + struct bpos *end, + unsigned *nr_iters, + unsigned max_iters) +{ + int ret = 0, ret2 = 0; + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } + + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + *nr_iters += bch2_bkey_nr_alloc_ptrs(k); + + if (*nr_iters >= max_iters) { + *end = bpos_min(*end, k.k->p); + ret = 1; + } + + break; + case KEY_TYPE_reflink_p: { + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + u64 idx = le64_to_cpu(p.v->idx); + unsigned sectors = bpos_min(*end, p.k->p).offset - + bkey_start_offset(p.k); + struct btree_iter iter; + struct bkey_s_c r_k; + + for_each_btree_key_norestart(trans, iter, + BTREE_ID_reflink, POS(0, idx + offset), + BTREE_ITER_SLOTS, r_k, ret2) { + if (bkey_ge(bkey_start_pos(r_k.k), POS(0, idx + sectors))) + break; + + /* extent_update_to_keys(), for the reflink_v update */ + *nr_iters += 1; + + *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); + + if (*nr_iters >= max_iters) { + struct bpos pos = bkey_start_pos(k.k); + pos.offset += min_t(u64, k.k->size, + r_k.k->p.offset - idx); + + *end = bpos_min(*end, pos); + ret = 1; + break; + } + } + bch2_trans_iter_exit(trans, &iter); + + break; + } + } + + return ret2 ?: ret; +} + +#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3) + +int bch2_extent_atomic_end(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *insert, + struct bpos *end) +{ + struct btree_iter copy; + struct bkey_s_c k; + unsigned nr_iters = 0; + int ret; + + ret = bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + *end = insert->k.p; + + /* extent_update_to_keys(): */ + nr_iters += 1; + + ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, + &nr_iters, EXTENT_ITERS_MAX / 2); + if (ret < 0) + return ret; + + bch2_trans_copy_iter(©, iter); + + for_each_btree_key_upto_continue_norestart(copy, insert->k.p, 0, k, ret) { + unsigned offset = 0; + + if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) + offset = bkey_start_offset(&insert->k) - + bkey_start_offset(k.k); + + /* extent_handle_overwrites(): */ + switch (bch2_extent_overlap(&insert->k, k.k)) { + case 
BCH_EXTENT_OVERLAP_ALL: + case BCH_EXTENT_OVERLAP_FRONT: + nr_iters += 1; + break; + case BCH_EXTENT_OVERLAP_BACK: + case BCH_EXTENT_OVERLAP_MIDDLE: + nr_iters += 2; + break; + } + + ret = count_iters_for_insert(trans, k, offset, end, + &nr_iters, EXTENT_ITERS_MAX); + if (ret) + break; + } + + bch2_trans_iter_exit(trans, ©); + return ret < 0 ? ret : 0; +} + +int bch2_extent_trim_atomic(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *k) +{ + struct bpos end; + int ret; + + ret = bch2_extent_atomic_end(trans, iter, k, &end); + if (ret) + return ret; + + bch2_cut_back(end, k); + return 0; +} diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h new file mode 100644 index 000000000..6f5cf4493 --- /dev/null +++ b/fs/bcachefs/extent_update.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENT_UPDATE_H +#define _BCACHEFS_EXTENT_UPDATE_H + +#include "bcachefs.h" + +int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, + struct bkey_i *, struct bpos *); +int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, + struct bkey_i *); + +#endif /* _BCACHEFS_EXTENT_UPDATE_H */ diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c new file mode 100644 index 000000000..c13e0afc6 --- /dev/null +++ b/fs/bcachefs/extents.c @@ -0,0 +1,1394 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2010 Kent Overstreet + * + * Code for managing the extent btree and dynamically updating the writeback + * dirty sector count. + */ + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_gc.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "buckets.h" +#include "checksum.h" +#include "debug.h" +#include "disk_groups.h" +#include "error.h" +#include "extents.h" +#include "inode.h" +#include "journal.h" +#include "replicas.h" +#include "super.h" +#include "super-io.h" +#include "trace.h" +#include "util.h" + +static unsigned bch2_crc_field_size_max[] = { + [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, + [BCH_EXTENT_ENTRY_crc128] = CRC128_SIZE_MAX, +}; + +static void bch2_extent_crc_pack(union bch_extent_crc *, + struct bch_extent_crc_unpacked, + enum bch_extent_entry_type); + +static struct bch_dev_io_failures *dev_io_failures(struct bch_io_failures *f, + unsigned dev) +{ + struct bch_dev_io_failures *i; + + for (i = f->devs; i < f->devs + f->nr; i++) + if (i->dev == dev) + return i; + + return NULL; +} + +void bch2_mark_io_failure(struct bch_io_failures *failed, + struct extent_ptr_decoded *p) +{ + struct bch_dev_io_failures *f = dev_io_failures(failed, p->ptr.dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); + + f = &failed->devs[failed->nr++]; + f->dev = p->ptr.dev; + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else if (p->idx != f->idx) { + f->idx = p->idx; + f->nr_failed = 1; + f->nr_retries = 0; + } else { + f->nr_failed++; + } +} + +/* + * returns true if p1 is better than p2: + */ +static inline bool ptr_better(struct bch_fs *c, + const struct extent_ptr_decoded p1, + const struct extent_ptr_decoded p2) +{ + if (likely(!p1.idx && !p2.idx)) { + struct bch_dev *dev1 = bch_dev_bkey_exists(c, p1.ptr.dev); + struct bch_dev *dev2 = bch_dev_bkey_exists(c, p2.ptr.dev); + + u64 l1 = atomic64_read(&dev1->cur_latency[READ]); + u64 l2 = atomic64_read(&dev2->cur_latency[READ]); + + /* Pick at random, biased in favor of the faster device: */ + + return bch2_rand_range(l1 + l2) > l1; + } + + if 
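bch2_extent_atomic_end() above walks the existing extents the insert overlaps, adds up how many btree iterators the resulting alloc/reflink updates will need, and clamps *end once the budget is reached; bch2_extent_trim_atomic() then cuts the insert back to that point so a single transaction never needs more iterators than it can hold. A simplified standalone sketch of the clamping idea; the cost model and ITER_BUDGET below are invented for illustration:

/* Standalone sketch of trimming an extent insert to a per-transaction budget. */
#include <stdio.h>

struct extent { unsigned long start, end, nr_ptrs; };

#define ITER_BUDGET 32

/*
 * Walk existing extents overlapping [insert_start, *insert_end); charge a
 * cost per overlapped extent (its pointer count, standing in for the alloc
 * btree updates it will need) and clamp *insert_end once the budget is hit.
 */
static void trim_to_budget(const struct extent *existing, unsigned nr,
			   unsigned long insert_start, unsigned long *insert_end)
{
	unsigned cost = 1;	/* the insert itself */

	for (unsigned i = 0; i < nr; i++) {
		const struct extent *e = &existing[i];

		if (e->end <= insert_start || e->start >= *insert_end)
			continue;	/* no overlap */

		cost += e->nr_ptrs;
		if (cost >= ITER_BUDGET) {
			/* stop at the end of this extent (or our own end) */
			if (e->end < *insert_end)
				*insert_end = e->end;
			break;
		}
	}
}

int main(void)
{
	struct extent existing[] = {
		{ 0, 64, 2 }, { 64, 128, 30 }, { 128, 256, 2 },
	};
	unsigned long end = 256;

	trim_to_budget(existing, 3, 32, &end);
	printf("insert trimmed to end at %lu\n", end);	/* 128 */
	return 0;
}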
(bch2_force_reconstruct_read) + return p1.idx > p2.idx; + + return p1.idx < p2.idx; +} + +/* + * This picks a non-stale pointer, preferably from a device other than @avoid. + * Avoid can be NULL, meaning pick any. If there are no non-stale pointers to + * other devices, it will still pick a pointer from avoid. + */ +int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_failures *failed, + struct extent_ptr_decoded *pick) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + struct bch_dev_io_failures *f; + struct bch_dev *ca; + int ret = 0; + + if (k.k->type == KEY_TYPE_error) + return -EIO; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Unwritten extent: no need to actually read, treat it as a + * hole and return 0s: + */ + if (p.ptr.unwritten) + return 0; + + ca = bch_dev_bkey_exists(c, p.ptr.dev); + + /* + * If there are any dirty pointers it's an error if we can't + * read: + */ + if (!ret && !p.ptr.cached) + ret = -EIO; + + if (p.ptr.cached && ptr_stale(ca, &p.ptr)) + continue; + + f = failed ? dev_io_failures(failed, p.ptr.dev) : NULL; + if (f) + p.idx = f->nr_failed < f->nr_retries + ? f->idx + : f->idx + 1; + + if (!p.idx && + !bch2_dev_is_readable(ca)) + p.idx++; + + if (bch2_force_reconstruct_read && + !p.idx && p.has_ec) + p.idx++; + + if (p.idx >= (unsigned) p.has_ec + 1) + continue; + + if (ret > 0 && !ptr_better(c, p, *pick)) + continue; + + *pick = p; + ret = 1; + } + + return ret; +} + +/* KEY_TYPE_btree_ptr: */ + +int bch2_btree_ptr_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (bkey_val_u64s(k.k) > BCH_REPLICAS_MAX) { + prt_printf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), BCH_REPLICAS_MAX); + return -BCH_ERR_invalid_bkey; + } + + return bch2_bkey_ptrs_invalid(c, k, flags, err); +} + +void bch2_btree_ptr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + bch2_bkey_ptrs_to_text(out, c, k); +} + +int bch2_btree_ptr_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (bkey_val_u64s(k.k) > BKEY_BTREE_PTR_VAL_U64s_MAX) { + prt_printf(err, "value too big (%zu > %zu)", + bkey_val_u64s(k.k), BKEY_BTREE_PTR_VAL_U64s_MAX); + return -BCH_ERR_invalid_bkey; + } + + return bch2_bkey_ptrs_invalid(c, k, flags, err); +} + +void bch2_btree_ptr_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_btree_ptr_v2 bp = bkey_s_c_to_btree_ptr_v2(k); + + prt_printf(out, "seq %llx written %u min_key %s", + le64_to_cpu(bp.v->seq), + le16_to_cpu(bp.v->sectors_written), + BTREE_PTR_RANGE_UPDATED(bp.v) ? "R " : ""); + + bch2_bpos_to_text(out, bp.v->min_key); + prt_printf(out, " "); + bch2_bkey_ptrs_to_text(out, c, k); +} + +void bch2_btree_ptr_v2_compat(enum btree_id btree_id, unsigned version, + unsigned big_endian, int write, + struct bkey_s k) +{ + struct bkey_s_btree_ptr_v2 bp = bkey_s_to_btree_ptr_v2(k); + + compat_bpos(0, btree_id, version, big_endian, write, &bp.v->min_key); + + if (version < bcachefs_metadata_version_inode_btree_change && + btree_id_is_extents(btree_id) && + !bkey_eq(bp.v->min_key, POS_MIN)) + bp.v->min_key = write + ? 
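When several replicas are readable, ptr_better() above does not simply take the lowest-latency device: bch2_rand_range(l1 + l2) > l1 selects replica 1 with probability roughly l2/(l1+l2), so the faster device serves most reads while the slower one still gets sampled and keeps its latency estimate current. A standalone sketch of that biased choice, with rand() standing in for bch2_rand_range():

/* Standalone sketch of the latency-biased replica choice in ptr_better(). */
#include <stdio.h>
#include <stdlib.h>

/*
 * Given two replicas with recent read latencies l1 and l2 (any unit),
 * prefer replica 1 with probability l2 / (l1 + l2).
 */
static int pick_replica(unsigned l1, unsigned l2)
{
	unsigned long r = (unsigned long) rand() % (l1 + l2);

	return r >= l1 ? 1 : 2;
}

int main(void)
{
	unsigned picks[3] = { 0 };

	srand(42);
	for (int i = 0; i < 100000; i++)
		picks[pick_replica(2, 8)]++;	/* device 1 is 4x faster */

	/* expect roughly 80% / 20% */
	printf("replica 1: %u, replica 2: %u\n", picks[1], picks[2]);
	return 0;
}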
bpos_nosnap_predecessor(bp.v->min_key) + : bpos_nosnap_successor(bp.v->min_key); +} + +/* KEY_TYPE_extent: */ + +bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) +{ + struct bkey_ptrs l_ptrs = bch2_bkey_ptrs(l); + struct bkey_ptrs_c r_ptrs = bch2_bkey_ptrs_c(r); + union bch_extent_entry *en_l; + const union bch_extent_entry *en_r; + struct extent_ptr_decoded lp, rp; + bool use_right_ptr; + struct bch_dev *ca; + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_type(en_l) != extent_entry_type(en_r)) + return false; + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + if (en_l < l_ptrs.end || en_r < r_ptrs.end) + return false; + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + lp.crc = bch2_extent_crc_unpack(l.k, NULL); + rp.crc = bch2_extent_crc_unpack(r.k, NULL); + + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && + __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { + if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != + rp.ptr.offset + rp.crc.offset || + lp.ptr.dev != rp.ptr.dev || + lp.ptr.gen != rp.ptr.gen || + lp.ptr.unwritten != rp.ptr.unwritten || + lp.has_ec != rp.has_ec) + return false; + + /* Extents may not straddle buckets: */ + ca = bch_dev_bkey_exists(c, lp.ptr.dev); + if (PTR_BUCKET_NR(ca, &lp.ptr) != PTR_BUCKET_NR(ca, &rp.ptr)) + return false; + + if (lp.has_ec != rp.has_ec || + (lp.has_ec && + (lp.ec.block != rp.ec.block || + lp.ec.redundancy != rp.ec.redundancy || + lp.ec.idx != rp.ec.idx))) + return false; + + if (lp.crc.compression_type != rp.crc.compression_type || + lp.crc.nonce != rp.crc.nonce) + return false; + + if (lp.crc.offset + lp.crc.live_size + rp.crc.live_size <= + lp.crc.uncompressed_size) { + /* can use left extent's crc entry */ + } else if (lp.crc.live_size <= rp.crc.offset) { + /* can use right extent's crc entry */ + } else { + /* check if checksums can be merged: */ + if (lp.crc.csum_type != rp.crc.csum_type || + lp.crc.nonce != rp.crc.nonce || + crc_is_compressed(lp.crc) || + !bch2_checksum_mergeable(lp.crc.csum_type)) + return false; + + if (lp.crc.offset + lp.crc.live_size != lp.crc.compressed_size || + rp.crc.offset) + return false; + + if (lp.crc.csum_type && + lp.crc.uncompressed_size + + rp.crc.uncompressed_size > (c->opts.encoded_extent_max >> 9)) + return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end && en_r < r_ptrs.end) { + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + if (crc_l.uncompressed_size + crc_r.uncompressed_size > + bch2_crc_field_size_max[extent_entry_type(en_l)]) + return false; + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + use_right_ptr = false; + en_l = l_ptrs.start; + en_r = r_ptrs.start; + while (en_l < l_ptrs.end) { + if (extent_entry_type(en_l) == BCH_EXTENT_ENTRY_ptr && + use_right_ptr) + en_l->ptr = en_r->ptr; + + if (extent_entry_is_crc(en_l)) { + struct bch_extent_crc_unpacked crc_l = + bch2_extent_crc_unpack(l.k, entry_to_crc(en_l)); + struct bch_extent_crc_unpacked crc_r = + bch2_extent_crc_unpack(r.k, entry_to_crc(en_r)); + + use_right_ptr = false; + + if (crc_l.offset + crc_l.live_size + crc_r.live_size <= + crc_l.uncompressed_size) { + /* can use left extent's crc 
entry */ + } else if (crc_l.live_size <= crc_r.offset) { + /* can use right extent's crc entry */ + crc_r.offset -= crc_l.live_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_r, + extent_entry_type(en_l)); + use_right_ptr = true; + } else { + crc_l.csum = bch2_checksum_merge(crc_l.csum_type, + crc_l.csum, + crc_r.csum, + crc_r.uncompressed_size << 9); + + crc_l.uncompressed_size += crc_r.uncompressed_size; + crc_l.compressed_size += crc_r.compressed_size; + bch2_extent_crc_pack(entry_to_crc(en_l), crc_l, + extent_entry_type(en_l)); + } + } + + en_l = extent_entry_next(en_l); + en_r = extent_entry_next(en_r); + } + + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +/* KEY_TYPE_reservation: */ + +int bch2_reservation_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + if (!r.v->nr_replicas || r.v->nr_replicas > BCH_REPLICAS_MAX) { + prt_printf(err, "invalid nr_replicas (%u)", + r.v->nr_replicas); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_reservation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(k); + + prt_printf(out, "generation %u replicas %u", + le32_to_cpu(r.v->generation), + r.v->nr_replicas); +} + +bool bch2_reservation_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reservation l = bkey_s_to_reservation(_l); + struct bkey_s_c_reservation r = bkey_s_c_to_reservation(_r); + + if (l.v->generation != r.v->generation || + l.v->nr_replicas != r.v->nr_replicas) + return false; + + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +/* Extent checksum entries: */ + +/* returns true if not equal */ +static inline bool bch2_crc_unpacked_cmp(struct bch_extent_crc_unpacked l, + struct bch_extent_crc_unpacked r) +{ + return (l.csum_type != r.csum_type || + l.compression_type != r.compression_type || + l.compressed_size != r.compressed_size || + l.uncompressed_size != r.uncompressed_size || + l.offset != r.offset || + l.live_size != r.live_size || + l.nonce != r.nonce || + bch2_crc_cmp(l.csum, r.csum)); +} + +static inline bool can_narrow_crc(struct bch_extent_crc_unpacked u, + struct bch_extent_crc_unpacked n) +{ + return !crc_is_compressed(u) && + u.csum_type && + u.uncompressed_size > u.live_size && + bch2_csum_type_is_encryption(u.csum_type) == + bch2_csum_type_is_encryption(n.csum_type); +} + +bool bch2_can_narrow_extent_crcs(struct bkey_s_c k, + struct bch_extent_crc_unpacked n) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + if (!n.csum_type) + return false; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (can_narrow_crc(crc, n)) + return true; + + return false; +} + +/* + * We're writing another replica for this extent, so while we've got the data in + * memory we'll be computing a new checksum for the currently live data. 
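bch2_reservation_merge() above shows the general shape of the per-key-type merge hooks: the two values must describe the same thing (here generation and nr_replicas), and the merge itself is just bch2_key_resize() on the left key. A standalone sketch of the same rule; adjacency is checked inline here, whereas in the kernel, as I read it, the generic merge path compares positions before calling the hook:

/* Standalone sketch of the merge rule used by bch2_reservation_merge(). */
#include <stdio.h>
#include <stdbool.h>

struct reservation {
	unsigned long	offset;		/* end position, like bkey.p.offset */
	unsigned	size;		/* sectors covered, ending at offset */
	unsigned	generation;
	unsigned	nr_replicas;
};

/* Merge r into l if they are adjacent and describe the same reservation. */
static bool try_merge(struct reservation *l, const struct reservation *r)
{
	if (l->offset != r->offset - r->size)	/* must be adjacent */
		return false;
	if (l->generation != r->generation ||
	    l->nr_replicas != r->nr_replicas)
		return false;

	l->size += r->size;
	l->offset = r->offset;
	return true;
}

int main(void)
{
	struct reservation a = { .offset = 64,  .size = 64, .generation = 1, .nr_replicas = 2 };
	struct reservation b = { .offset = 128, .size = 64, .generation = 1, .nr_replicas = 2 };

	printf("merged: %d, size now %u\n", try_merge(&a, &b), a.size);
	return 0;
}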
+ * + * If there are other replicas we aren't moving, and they are checksummed but + * not compressed, we can modify them to point to only the data that is + * currently live (so that readers won't have to bounce) while we've got the + * checksum we need: + */ +bool bch2_bkey_narrow_crcs(struct bkey_i *k, struct bch_extent_crc_unpacked n) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked u; + struct extent_ptr_decoded p; + union bch_extent_entry *i; + bool ret = false; + + /* Find a checksum entry that covers only live data: */ + if (!n.csum_type) { + bkey_for_each_crc(&k->k, ptrs, u, i) + if (!crc_is_compressed(u) && + u.csum_type && + u.live_size == u.uncompressed_size) { + n = u; + goto found; + } + return false; + } +found: + BUG_ON(crc_is_compressed(n)); + BUG_ON(n.offset); + BUG_ON(n.live_size != k->k.size); + +restart_narrow_pointers: + ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + bkey_for_each_ptr_decode(&k->k, ptrs, p, i) + if (can_narrow_crc(p.crc, n)) { + bch2_bkey_drop_ptr_noerror(bkey_i_to_s(k), &i->ptr); + p.ptr.offset += p.crc.offset; + p.crc = n; + bch2_extent_ptr_decoded_append(k, &p); + ret = true; + goto restart_narrow_pointers; + } + + return ret; +} + +static void bch2_extent_crc_pack(union bch_extent_crc *dst, + struct bch_extent_crc_unpacked src, + enum bch_extent_entry_type type) +{ +#define set_common_fields(_dst, _src) \ + _dst.type = 1 << type; \ + _dst.csum_type = _src.csum_type, \ + _dst.compression_type = _src.compression_type, \ + _dst._compressed_size = _src.compressed_size - 1, \ + _dst._uncompressed_size = _src.uncompressed_size - 1, \ + _dst.offset = _src.offset + + switch (type) { + case BCH_EXTENT_ENTRY_crc32: + set_common_fields(dst->crc32, src); + memcpy(&dst->crc32.csum, &src.csum.lo, sizeof(dst->crc32.csum)); + break; + case BCH_EXTENT_ENTRY_crc64: + set_common_fields(dst->crc64, src); + dst->crc64.nonce = src.nonce; + dst->crc64.csum_lo = src.csum.lo; + dst->crc64.csum_hi = *((__le16 *) &src.csum.hi); + break; + case BCH_EXTENT_ENTRY_crc128: + set_common_fields(dst->crc128, src); + dst->crc128.nonce = src.nonce; + dst->crc128.csum = src.csum; + break; + default: + BUG(); + } +#undef set_common_fields +} + +void bch2_extent_crc_append(struct bkey_i *k, + struct bch_extent_crc_unpacked new) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + union bch_extent_crc *crc = (void *) ptrs.end; + enum bch_extent_entry_type type; + + if (bch_crc_bytes[new.csum_type] <= 4 && + new.uncompressed_size <= CRC32_SIZE_MAX && + new.nonce <= CRC32_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc32; + else if (bch_crc_bytes[new.csum_type] <= 10 && + new.uncompressed_size <= CRC64_SIZE_MAX && + new.nonce <= CRC64_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc64; + else if (bch_crc_bytes[new.csum_type] <= 16 && + new.uncompressed_size <= CRC128_SIZE_MAX && + new.nonce <= CRC128_NONCE_MAX) + type = BCH_EXTENT_ENTRY_crc128; + else + BUG(); + + bch2_extent_crc_pack(crc, new, type); + + k->k.u64s += extent_entry_u64s(ptrs.end); + + EBUG_ON(bkey_val_u64s(&k->k) > BKEY_EXTENT_VAL_U64s_MAX); +} + +/* Generic code for keys with pointers: */ + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c k) +{ + return bch2_bkey_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c k) +{ + return k.k->type == KEY_TYPE_reservation + ? 
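bch2_extent_crc_append() above picks the smallest packed checksum entry (crc32, crc64 or crc128) that can hold the checksum bytes, the extent size, and the nonce. A standalone sketch of that selection; the limit values below are invented for illustration, the real maxima come from the packed bit widths in bcachefs_format.h:

/* Standalone sketch of choosing the smallest packed crc entry format. */
#include <stdio.h>
#include <stdlib.h>

enum crc_fmt { FMT_CRC32, FMT_CRC64, FMT_CRC128 };

/* Illustrative limits: max checksum bytes, max sectors, max nonce per format. */
struct fmt_limits { unsigned csum_bytes, size_max, nonce_max; };

static const struct fmt_limits limits[] = {
	[FMT_CRC32]  = {  4, 1U << 7,  0        },
	[FMT_CRC64]  = { 10, 1U << 9,  1U << 10 },
	[FMT_CRC128] = { 16, 1U << 13, 1U << 13 },
};

static enum crc_fmt pick_fmt(unsigned csum_bytes, unsigned sectors, unsigned nonce)
{
	for (int f = FMT_CRC32; f <= FMT_CRC128; f++)
		if (csum_bytes <= limits[f].csum_bytes &&
		    sectors    <= limits[f].size_max &&
		    nonce      <= limits[f].nonce_max)
			return f;	/* smallest format that fits */
	abort();			/* nothing fits: caller bug */
}

int main(void)
{
	/* 4-byte checksum over a 128-sector extent, no nonce: fits in crc32 */
	printf("%d\n", pick_fmt(4, 128, 0));
	/* an 8-byte checksum needs the crc64 entry */
	printf("%d\n", pick_fmt(8, 128, 0));
	return 0;
}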
bkey_s_c_to_reservation(k).v->nr_replicas + : bch2_bkey_dirty_devs(k).nr; +} + +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c k) +{ + unsigned ret = 0; + + if (k.k->type == KEY_TYPE_reservation) { + ret = bkey_s_c_to_reservation(k).v->nr_replicas; + } else { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + ret += !p.ptr.cached && !crc_is_compressed(p.crc); + } + + return ret; +} + +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned ret = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (!p.ptr.cached && crc_is_compressed(p.crc)) + ret += p.crc.compressed_size; + + return ret; +} + +bool bch2_bkey_is_incompressible(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + + bkey_for_each_crc(k.k, ptrs, crc, entry) + if (crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) + return true; + return false; +} + +unsigned bch2_bkey_replicas(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p = { 0 }; + unsigned replicas = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; + + if (p.has_ec) + replicas += p.ec.redundancy; + + replicas++; + + } + + return replicas; +} + +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca; + + if (p->ptr.cached) + return 0; + + ca = bch_dev_bkey_exists(c, p->ptr.dev); + + return ca->mi.durability + + (p->has_ec + ? p->ec.redundancy + : 0); +} + +unsigned bch2_extent_ptr_durability(struct bch_fs *c, struct extent_ptr_decoded *p) +{ + struct bch_dev *ca; + + if (p->ptr.cached) + return 0; + + ca = bch_dev_bkey_exists(c, p->ptr.dev); + + if (ca->mi.state == BCH_MEMBER_STATE_failed) + return 0; + + return ca->mi.durability + + (p->has_ec + ? 
p->ec.redundancy + : 0); +} + +unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + durability += bch2_extent_ptr_durability(c, &p); + + return durability; +} + +static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned durability = 0; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) + durability += bch2_extent_ptr_durability(c, &p); + + return durability; +} + +void bch2_bkey_extent_entry_drop(struct bkey_i *k, union bch_extent_entry *entry) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + union bch_extent_entry *next = extent_entry_next(entry); + + memmove_u64s(entry, next, (u64 *) end - (u64 *) next); + k->k.u64s -= extent_entry_u64s(entry); +} + +void bch2_extent_ptr_decoded_append(struct bkey_i *k, + struct extent_ptr_decoded *p) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + struct bch_extent_crc_unpacked crc = + bch2_extent_crc_unpack(&k->k, NULL); + union bch_extent_entry *pos; + + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = ptrs.start; + goto found; + } + + bkey_for_each_crc(&k->k, ptrs, crc, pos) + if (!bch2_crc_unpacked_cmp(crc, p->crc)) { + pos = extent_entry_next(pos); + goto found; + } + + bch2_extent_crc_append(k, p->crc); + pos = bkey_val_end(bkey_i_to_s(k)); +found: + p->ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ptr)); + + if (p->has_ec) { + p->ec.type = 1 << BCH_EXTENT_ENTRY_stripe_ptr; + __extent_entry_insert(k, pos, to_entry(&p->ec)); + } +} + +static union bch_extent_entry *extent_entry_prev(struct bkey_ptrs ptrs, + union bch_extent_entry *entry) +{ + union bch_extent_entry *i = ptrs.start; + + if (i == entry) + return NULL; + + while (extent_entry_next(i) != entry) + i = extent_entry_next(i); + return i; +} + +static void extent_entry_drop(struct bkey_s k, union bch_extent_entry *entry) +{ + union bch_extent_entry *next = extent_entry_next(entry); + + /* stripes have ptrs, but their layout doesn't work with this code */ + BUG_ON(k.k->type == KEY_TYPE_stripe); + + memmove_u64s_down(entry, next, + (u64 *) bkey_val_end(k) - (u64 *) next); + k.k->u64s -= (u64 *) next - (u64 *) entry; +} + +/* + * Returns pointer to the next entry after the one being dropped: + */ +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry = to_entry(ptr), *next; + union bch_extent_entry *ret = entry; + bool drop_crc = true; + + EBUG_ON(ptr < &ptrs.start->ptr || + ptr >= &ptrs.end->ptr); + EBUG_ON(ptr->type != 1 << BCH_EXTENT_ENTRY_ptr); + + for (next = extent_entry_next(entry); + next != ptrs.end; + next = extent_entry_next(next)) { + if (extent_entry_is_crc(next)) { + break; + } else if (extent_entry_is_ptr(next)) { + drop_crc = false; + break; + } + } + + extent_entry_drop(k, entry); + + while ((entry = extent_entry_prev(ptrs, entry))) { + if (extent_entry_is_ptr(entry)) + break; + + if ((extent_entry_is_crc(entry) && drop_crc) || + extent_entry_is_stripe_ptr(entry)) { + ret = (void *) ret - extent_entry_bytes(entry); + extent_entry_drop(k, entry); + } + } + + return 
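bch2_bkey_durability() above answers "how many device failures can this extent survive": cached pointers contribute nothing, each dirty pointer contributes its device's durability, and an erasure-coded pointer additionally contributes the stripe's redundancy. A standalone sketch of that sum, with a made-up device table (normally this comes from the superblock member info):

/* Standalone sketch of the durability sum in bch2_bkey_durability(). */
#include <stdio.h>
#include <stdbool.h>

struct ptr {
	unsigned dev;
	bool	 cached;	/* cached copies add no durability */
	bool	 has_ec;	/* pointer into an erasure-coded stripe */
	unsigned ec_redundancy;
};

/* per-device durability; a failed device counts as 0 */
static const unsigned dev_durability[] = { 1, 1, 0 };

static unsigned extent_durability(const struct ptr *ptrs, unsigned nr)
{
	unsigned total = 0;

	for (unsigned i = 0; i < nr; i++) {
		const struct ptr *p = &ptrs[i];

		if (p->cached)
			continue;
		total += dev_durability[p->dev] +
			 (p->has_ec ? p->ec_redundancy : 0);
	}
	return total;
}

int main(void)
{
	struct ptr ptrs[] = {
		{ .dev = 0 },					/* plain replica    */
		{ .dev = 1, .has_ec = true, .ec_redundancy = 2 },
		{ .dev = 2 },					/* 0-durability dev */
		{ .dev = 0, .cached = true },			/* ignored          */
	};

	printf("durability %u\n", extent_durability(ptrs, 4));	/* 1 + 3 + 0 = 4 */
	return 0;
}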
ret; +} + +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s k, + struct bch_extent_ptr *ptr) +{ + bool have_dirty = bch2_bkey_dirty_devs(k.s_c).nr; + union bch_extent_entry *ret = + bch2_bkey_drop_ptr_noerror(k, ptr); + + /* + * If we deleted all the dirty pointers and there's still cached + * pointers, we could set the cached pointers to dirty if they're not + * stale - but to do that correctly we'd need to grab an open_bucket + * reference so that we don't race with bucket reuse: + */ + if (have_dirty && + !bch2_bkey_dirty_devs(k.s_c).nr) { + k.k->type = KEY_TYPE_error; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } else if (!bch2_bkey_nr_ptrs(k.s_c)) { + k.k->type = KEY_TYPE_deleted; + set_bkey_val_u64s(k.k, 0); + ret = NULL; + } + + return ret; +} + +void bch2_bkey_drop_device(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, ptr->dev == dev); +} + +void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev) +{ + struct bch_extent_ptr *ptr = bch2_bkey_has_device(k, dev); + + if (ptr) + bch2_bkey_drop_ptr_noerror(k, ptr); +} + +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->dev == dev) + return ptr; + + return NULL; +} + +bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (bch2_dev_in_target(c, ptr->dev, target) && + (!ptr->cached || + !ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr))) + return true; + + return false; +} + +bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, + struct bch_extent_ptr m, u64 offset) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == m.dev && + p.ptr.gen == m.gen && + (s64) p.ptr.offset + p.crc.offset - bkey_start_offset(k.k) == + (s64) m.offset - offset) + return true; + + return false; +} + +/* + * Returns true if two extents refer to the same data: + */ +bool bch2_extents_match(struct bkey_s_c k1, struct bkey_s_c k2) +{ + if (k1.k->type != k2.k->type) + return false; + + if (bkey_extent_is_direct_data(k1.k)) { + struct bkey_ptrs_c ptrs1 = bch2_bkey_ptrs_c(k1); + struct bkey_ptrs_c ptrs2 = bch2_bkey_ptrs_c(k2); + const union bch_extent_entry *entry1, *entry2; + struct extent_ptr_decoded p1, p2; + + if (bkey_extent_is_unwritten(k1) != bkey_extent_is_unwritten(k2)) + return false; + + bkey_for_each_ptr_decode(k1.k, ptrs1, p1, entry1) + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return true; + + return false; + } else { + /* KEY_TYPE_deleted, etc. 
*/ + return true; + } +} + +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bkey_s k2) +{ + struct bkey_ptrs ptrs2 = bch2_bkey_ptrs(k2); + union bch_extent_entry *entry2; + struct extent_ptr_decoded p2; + + bkey_for_each_ptr_decode(k2.k, ptrs2, p2, entry2) + if (p1.ptr.dev == p2.ptr.dev && + p1.ptr.gen == p2.ptr.gen && + (s64) p1.ptr.offset + p1.crc.offset - bkey_start_offset(k1.k) == + (s64) p2.ptr.offset + p2.crc.offset - bkey_start_offset(k2.k)) + return &entry2->ptr; + + return NULL; +} + +void bch2_extent_ptr_set_cached(struct bkey_s k, struct bch_extent_ptr *ptr) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + union bch_extent_entry *ec = NULL; + + bkey_extent_entry_for_each(ptrs, entry) { + if (&entry->ptr == ptr) { + ptr->cached = true; + if (ec) + extent_entry_drop(k, ec); + return; + } + + if (extent_entry_is_stripe_ptr(entry)) + ec = entry; + else if (extent_entry_is_ptr(entry)) + ec = NULL; + } + + BUG(); +} + +/* + * bch_extent_normalize - clean up an extent, dropping stale pointers etc. + * + * Returns true if @k should be dropped entirely + * + * For existing keys, only called when btree nodes are being rewritten, not when + * they're merely being compacted/resorted in memory. + */ +bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) +{ + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(k, ptr, + ptr->cached && + ptr_stale(bch_dev_bkey_exists(c, ptr->dev), ptr)); + + return bkey_deleted(k.k); +} + +void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + const struct bch_extent_ptr *ptr; + const struct bch_extent_stripe_ptr *ec; + struct bch_dev *ca; + bool first = true; + + if (c) + prt_printf(out, "durability: %u ", bch2_bkey_durability_safe(c, k)); + + bkey_extent_entry_for_each(ptrs, entry) { + if (!first) + prt_printf(out, " "); + + switch (__extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ptr = entry_to_ptr(entry); + ca = c && ptr->dev < c->sb.nr_devices && c->devs[ptr->dev] + ? bch_dev_bkey_exists(c, ptr->dev) + : NULL; + + if (!ca) { + prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, + (u64) ptr->offset, ptr->gen, + ptr->cached ? 
" cached" : ""); + } else { + u32 offset; + u64 b = sector_to_bucket_and_offset(ca, ptr->offset, &offset); + + prt_printf(out, "ptr: %u:%llu:%u gen %u", + ptr->dev, b, offset, ptr->gen); + if (ptr->cached) + prt_str(out, " cached"); + if (ptr->unwritten) + prt_str(out, " unwritten"); + if (ca && ptr_stale(ca, ptr)) + prt_printf(out, " stale"); + } + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s", + crc.compressed_size, + crc.uncompressed_size, + crc.offset, crc.nonce, + bch2_csum_types[crc.csum_type], + bch2_compression_types[crc.compression_type]); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + ec = &entry->stripe_ptr; + + prt_printf(out, "ec: idx %llu block %u", + (u64) ec->idx, ec->block); + break; + default: + prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); + return; + } + + first = false; + } +} + +static int extent_ptr_invalid(const struct bch_fs *c, + struct bkey_s_c k, + const struct bch_extent_ptr *ptr, + unsigned size_ondisk, + bool metadata, + struct printbuf *err) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr2; + u64 bucket; + u32 bucket_offset; + struct bch_dev *ca; + + if (!bch2_dev_exists2(c, ptr->dev)) { + prt_printf(err, "pointer to invalid device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; + } + + ca = bch_dev_bkey_exists(c, ptr->dev); + bkey_for_each_ptr(ptrs, ptr2) + if (ptr != ptr2 && ptr->dev == ptr2->dev) { + prt_printf(err, "multiple pointers to same device (%u)", ptr->dev); + return -BCH_ERR_invalid_bkey; + } + + bucket = sector_to_bucket_and_offset(ca, ptr->offset, &bucket_offset); + + if (bucket >= ca->mi.nbuckets) { + prt_printf(err, "pointer past last bucket (%llu > %llu)", + bucket, ca->mi.nbuckets); + return -BCH_ERR_invalid_bkey; + } + + if (ptr->offset < bucket_to_sector(ca, ca->mi.first_bucket)) { + prt_printf(err, "pointer before first bucket (%llu < %u)", + bucket, ca->mi.first_bucket); + return -BCH_ERR_invalid_bkey; + } + + if (bucket_offset + size_ondisk > ca->mi.bucket_size) { + prt_printf(err, "pointer spans multiple buckets (%u + %u > %u)", + bucket_offset, size_ondisk, ca->mi.bucket_size); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +int bch2_bkey_ptrs_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct bch_extent_crc_unpacked crc; + unsigned size_ondisk = k.k->size; + unsigned nonce = UINT_MAX; + unsigned nr_ptrs = 0; + bool unwritten = false, have_ec = false, crc_since_last_ptr = false; + int ret; + + if (bkey_is_btree_ptr(k.k)) + size_ondisk = btree_sectors(c); + + bkey_extent_entry_for_each(ptrs, entry) { + if (__extent_entry_type(entry) >= BCH_EXTENT_ENTRY_MAX) { + prt_printf(err, "invalid extent entry type (got %u, max %u)", + __extent_entry_type(entry), BCH_EXTENT_ENTRY_MAX); + return -BCH_ERR_invalid_bkey; + } + + if (bkey_is_btree_ptr(k.k) && + !extent_entry_is_ptr(entry)) { + prt_printf(err, "has non ptr field"); + return -BCH_ERR_invalid_bkey; + } + + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + ret = extent_ptr_invalid(c, k, &entry->ptr, size_ondisk, + false, err); + if (ret) + return ret; + + if (nr_ptrs && unwritten != entry->ptr.unwritten) { + prt_printf(err, "extent with unwritten and 
written ptrs"); + return -BCH_ERR_invalid_bkey; + } + + if (k.k->type != KEY_TYPE_extent && entry->ptr.unwritten) { + prt_printf(err, "has unwritten ptrs"); + return -BCH_ERR_invalid_bkey; + } + + if (entry->ptr.cached && have_ec) { + prt_printf(err, "cached, erasure coded ptr"); + return -BCH_ERR_invalid_bkey; + } + + unwritten = entry->ptr.unwritten; + have_ec = false; + crc_since_last_ptr = false; + nr_ptrs++; + break; + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + crc = bch2_extent_crc_unpack(k.k, entry_to_crc(entry)); + + if (crc.offset + crc.live_size > + crc.uncompressed_size) { + prt_printf(err, "checksum offset + key size > uncompressed size"); + return -BCH_ERR_invalid_bkey; + } + + size_ondisk = crc.compressed_size; + + if (!bch2_checksum_type_valid(c, crc.csum_type)) { + prt_printf(err, "invalid checksum type"); + return -BCH_ERR_invalid_bkey; + } + + if (crc.compression_type >= BCH_COMPRESSION_TYPE_NR) { + prt_printf(err, "invalid compression type"); + return -BCH_ERR_invalid_bkey; + } + + if (bch2_csum_type_is_encryption(crc.csum_type)) { + if (nonce == UINT_MAX) + nonce = crc.offset + crc.nonce; + else if (nonce != crc.offset + crc.nonce) { + prt_printf(err, "incorrect nonce"); + return -BCH_ERR_invalid_bkey; + } + } + + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + crc_since_last_ptr = true; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } + have_ec = true; + break; + case BCH_EXTENT_ENTRY_rebalance: + break; + } + } + + if (!nr_ptrs) { + prt_str(err, "no ptrs"); + return -BCH_ERR_invalid_bkey; + } + + if (nr_ptrs >= BCH_BKEY_PTRS_MAX) { + prt_str(err, "too many ptrs"); + return -BCH_ERR_invalid_bkey; + } + + if (crc_since_last_ptr) { + prt_printf(err, "redundant crc entry"); + return -BCH_ERR_invalid_bkey; + } + + if (have_ec) { + prt_printf(err, "redundant stripe entry"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_ptr_swab(struct bkey_s k) +{ + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + union bch_extent_entry *entry; + u64 *d; + + for (d = (u64 *) ptrs.start; + d != (u64 *) ptrs.end; + d++) + *d = swab64(*d); + + for (entry = ptrs.start; + entry < ptrs.end; + entry = extent_entry_next(entry)) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.csum = swab32(entry->crc32.csum); + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.csum_hi = swab16(entry->crc64.csum_hi); + entry->crc64.csum_lo = swab64(entry->crc64.csum_lo); + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.csum.hi = (__force __le64) + swab64((__force u64) entry->crc128.csum.hi); + entry->crc128.csum.lo = (__force __le64) + swab64((__force u64) entry->crc128.csum.lo); + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + case BCH_EXTENT_ENTRY_rebalance: + break; + } + } +} + +/* Generic extent code: */ + +int bch2_cut_front_s(struct bpos where, struct bkey_s k) +{ + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 sub; + + if (bkey_le(where, bkey_start_pos(k.k))) + return 0; + + EBUG_ON(bkey_gt(where, k.k->p)); + + sub = where.offset - bkey_start_offset(k.k); + + k.k->size -= sub; + + if (!k.k->size) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } + + switch (k.k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: { + struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + 
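extent_ptr_invalid() above rejects pointers to nonexistent devices, pointers whose bucket falls outside [first_bucket, nbuckets), and extents that would straddle a bucket boundary. A standalone sketch of those bounds checks, using plain division in place of sector_to_bucket_and_offset() and a made-up device geometry:

/* Standalone sketch of the bounds checks in extent_ptr_invalid(). */
#include <stdio.h>

struct dev_geom {
	unsigned long	first_bucket, nbuckets;
	unsigned	bucket_size;	/* sectors per bucket */
};

static const char *check_ptr(const struct dev_geom *d,
			     unsigned long offset, unsigned size_ondisk)
{
	unsigned long bucket	    = offset / d->bucket_size;
	unsigned	bucket_offset = offset % d->bucket_size;

	if (bucket >= d->nbuckets)
		return "pointer past last bucket";
	if (bucket < d->first_bucket)
		return "pointer before first bucket";
	if (bucket_offset + size_ondisk > d->bucket_size)
		return "pointer spans multiple buckets";
	return NULL;
}

int main(void)
{
	struct dev_geom d = { .first_bucket = 16, .nbuckets = 1024, .bucket_size = 512 };

	printf("%s\n", check_ptr(&d, 100 * 512 + 500, 32) ?: "ok");	/* spans buckets */
	printf("%s\n", check_ptr(&d, 100 * 512,       32) ?: "ok");	/* ok */
	return 0;
}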
union bch_extent_entry *entry; + bool seen_crc = false; + + bkey_extent_entry_for_each(ptrs, entry) { + switch (extent_entry_type(entry)) { + case BCH_EXTENT_ENTRY_ptr: + if (!seen_crc) + entry->ptr.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc32: + entry->crc32.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc64: + entry->crc64.offset += sub; + break; + case BCH_EXTENT_ENTRY_crc128: + entry->crc128.offset += sub; + break; + case BCH_EXTENT_ENTRY_stripe_ptr: + break; + case BCH_EXTENT_ENTRY_rebalance: + break; + } + + if (extent_entry_is_crc(entry)) + seen_crc = true; + } + + break; + } + case KEY_TYPE_reflink_p: { + struct bkey_s_reflink_p p = bkey_s_to_reflink_p(k); + + le64_add_cpu(&p.v->idx, sub); + break; + } + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: { + void *p = bkey_inline_data_p(k); + unsigned bytes = bkey_inline_data_bytes(k.k); + + sub = min_t(u64, sub << 9, bytes); + + memmove(p, p + sub, bytes - sub); + + new_val_u64s -= sub >> 3; + break; + } + } + + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; +} + +int bch2_cut_back_s(struct bpos where, struct bkey_s k) +{ + unsigned new_val_u64s = bkey_val_u64s(k.k); + int val_u64s_delta; + u64 len = 0; + + if (bkey_ge(where, k.k->p)) + return 0; + + EBUG_ON(bkey_lt(where, bkey_start_pos(k.k))); + + len = where.offset - bkey_start_offset(k.k); + + k.k->p.offset = where.offset; + k.k->size = len; + + if (!len) { + k.k->type = KEY_TYPE_deleted; + new_val_u64s = 0; + } + + switch (k.k->type) { + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: + new_val_u64s = (bkey_inline_data_offset(k.k) + + min(bkey_inline_data_bytes(k.k), k.k->size << 9)) >> 3; + break; + } + + val_u64s_delta = bkey_val_u64s(k.k) - new_val_u64s; + BUG_ON(val_u64s_delta < 0); + + set_bkey_val_u64s(k.k, new_val_u64s); + memset(bkey_val_end(k), 0, val_u64s_delta * sizeof(u64)); + return -val_u64s_delta; +} diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h new file mode 100644 index 000000000..d359b3fda --- /dev/null +++ b/fs/bcachefs/extents.h @@ -0,0 +1,757 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_H +#define _BCACHEFS_EXTENTS_H + +#include "bcachefs.h" +#include "bkey.h" +#include "extents_types.h" + +struct bch_fs; +struct btree_trans; +enum bkey_invalid_flags; + +/* extent entries: */ + +#define extent_entry_last(_e) \ + ((typeof(&(_e).v->start[0])) bkey_val_end(_e)) + +#define entry_to_ptr(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_ptr(_entry)); \ + \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const struct bch_extent_ptr *) (_entry), \ + (struct bch_extent_ptr *) (_entry)); \ +}) + +/* downcast, preserves const */ +#define to_entry(_entry) \ +({ \ + BUILD_BUG_ON(!type_is(_entry, union bch_extent_crc *) && \ + !type_is(_entry, struct bch_extent_ptr *) && \ + !type_is(_entry, struct bch_extent_stripe_ptr *)); \ + \ + __builtin_choose_expr( \ + (type_is_exact(_entry, const union bch_extent_crc *) || \ + type_is_exact(_entry, const struct bch_extent_ptr *) ||\ + type_is_exact(_entry, const struct bch_extent_stripe_ptr *)),\ + (const union bch_extent_entry *) (_entry), \ + (union bch_extent_entry *) (_entry)); \ +}) + +#define extent_entry_next(_entry) \ + ((typeof(_entry)) ((void *) (_entry) + extent_entry_bytes(_entry))) + +static inline unsigned 
+__extent_entry_type(const union bch_extent_entry *e) +{ + return e->type ? __ffs(e->type) : BCH_EXTENT_ENTRY_MAX; +} + +static inline enum bch_extent_entry_type +extent_entry_type(const union bch_extent_entry *e) +{ + int ret = __ffs(e->type); + + EBUG_ON(ret < 0 || ret >= BCH_EXTENT_ENTRY_MAX); + + return ret; +} + +static inline size_t extent_entry_bytes(const union bch_extent_entry *entry) +{ + switch (extent_entry_type(entry)) { +#define x(f, n) \ + case BCH_EXTENT_ENTRY_##f: \ + return sizeof(struct bch_extent_##f); + BCH_EXTENT_ENTRY_TYPES() +#undef x + default: + BUG(); + } +} + +static inline size_t extent_entry_u64s(const union bch_extent_entry *entry) +{ + return extent_entry_bytes(entry) / sizeof(u64); +} + +static inline void __extent_entry_insert(struct bkey_i *k, + union bch_extent_entry *dst, + union bch_extent_entry *new) +{ + union bch_extent_entry *end = bkey_val_end(bkey_i_to_s(k)); + + memmove_u64s_up_small((u64 *) dst + extent_entry_u64s(new), + dst, (u64 *) end - (u64 *) dst); + k->k.u64s += extent_entry_u64s(new); + memcpy_u64s_small(dst, new, extent_entry_u64s(new)); +} + +static inline bool extent_entry_is_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_ptr; +} + +static inline bool extent_entry_is_stripe_ptr(const union bch_extent_entry *e) +{ + return extent_entry_type(e) == BCH_EXTENT_ENTRY_stripe_ptr; +} + +static inline bool extent_entry_is_crc(const union bch_extent_entry *e) +{ + switch (extent_entry_type(e)) { + case BCH_EXTENT_ENTRY_crc32: + case BCH_EXTENT_ENTRY_crc64: + case BCH_EXTENT_ENTRY_crc128: + return true; + default: + return false; + } +} + +union bch_extent_crc { + u8 type; + struct bch_extent_crc32 crc32; + struct bch_extent_crc64 crc64; + struct bch_extent_crc128 crc128; +}; + +#define __entry_to_crc(_entry) \ + __builtin_choose_expr( \ + type_is_exact(_entry, const union bch_extent_entry *), \ + (const union bch_extent_crc *) (_entry), \ + (union bch_extent_crc *) (_entry)) + +#define entry_to_crc(_entry) \ +({ \ + EBUG_ON((_entry) && !extent_entry_is_crc(_entry)); \ + \ + __entry_to_crc(_entry); \ +}) + +static inline struct bch_extent_crc_unpacked +bch2_extent_crc_unpack(const struct bkey *k, const union bch_extent_crc *crc) +{ +#define common_fields(_crc) \ + .csum_type = _crc.csum_type, \ + .compression_type = _crc.compression_type, \ + .compressed_size = _crc._compressed_size + 1, \ + .uncompressed_size = _crc._uncompressed_size + 1, \ + .offset = _crc.offset, \ + .live_size = k->size + + if (!crc) + return (struct bch_extent_crc_unpacked) { + .compressed_size = k->size, + .uncompressed_size = k->size, + .live_size = k->size, + }; + + switch (extent_entry_type(to_entry(crc))) { + case BCH_EXTENT_ENTRY_crc32: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { + common_fields(crc->crc32), + }; + + memcpy(&ret.csum.lo, &crc->crc32.csum, sizeof(crc->crc32.csum)); + return ret; + } + case BCH_EXTENT_ENTRY_crc64: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { + common_fields(crc->crc64), + .nonce = crc->crc64.nonce, + .csum.lo = (__force __le64) crc->crc64.csum_lo, + }; + + u16 hi = crc->crc64.csum_hi; + memcpy(&ret.csum.hi, &hi, sizeof(hi)); + return ret; + } + case BCH_EXTENT_ENTRY_crc128: { + struct bch_extent_crc_unpacked ret = (struct bch_extent_crc_unpacked) { + common_fields(crc->crc128), + .nonce = crc->crc128.nonce, + .csum = crc->crc128.csum, + }; + + return ret; + } + default: + BUG(); + } +#undef common_fields +} + +static inline bool 
crc_is_compressed(struct bch_extent_crc_unpacked crc) +{ + return (crc.compression_type != BCH_COMPRESSION_TYPE_none && + crc.compression_type != BCH_COMPRESSION_TYPE_incompressible); +} + +/* bkey_ptrs: generically over any key type that has ptrs */ + +struct bkey_ptrs_c { + const union bch_extent_entry *start; + const union bch_extent_entry *end; +}; + +struct bkey_ptrs { + union bch_extent_entry *start; + union bch_extent_entry *end; +}; + +static inline struct bkey_ptrs_c bch2_bkey_ptrs_c(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr: { + struct bkey_s_c_btree_ptr e = bkey_s_c_to_btree_ptr(k); + + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) + }; + } + case KEY_TYPE_extent: { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + + return (struct bkey_ptrs_c) { + e.v->start, + extent_entry_last(e) + }; + } + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + return (struct bkey_ptrs_c) { + to_entry(&s.v->ptrs[0]), + to_entry(&s.v->ptrs[s.v->nr_blocks]), + }; + } + case KEY_TYPE_reflink_v: { + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + return (struct bkey_ptrs_c) { + r.v->start, + bkey_val_end(r), + }; + } + case KEY_TYPE_btree_ptr_v2: { + struct bkey_s_c_btree_ptr_v2 e = bkey_s_c_to_btree_ptr_v2(k); + + return (struct bkey_ptrs_c) { + to_entry(&e.v->start[0]), + to_entry(extent_entry_last(e)) + }; + } + default: + return (struct bkey_ptrs_c) { NULL, NULL }; + } +} + +static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) +{ + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k.s_c); + + return (struct bkey_ptrs) { + (void *) p.start, + (void *) p.end + }; +} + +#define __bkey_extent_entry_for_each_from(_start, _end, _entry) \ + for ((_entry) = (_start); \ + (_entry) < (_end); \ + (_entry) = extent_entry_next(_entry)) + +#define __bkey_ptr_next(_ptr, _end) \ +({ \ + typeof(_end) _entry; \ + \ + __bkey_extent_entry_for_each_from(to_entry(_ptr), _end, _entry) \ + if (extent_entry_is_ptr(_entry)) \ + break; \ + \ + _entry < (_end) ? 
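The pointer/crc/stripe entries walked by the iterators above form a self-describing sequence: each entry begins with a type field holding 1 << type, __extent_entry_type() recovers the type with __ffs(), and extent_entry_bytes() gives the size, so iteration is simply "advance by the current entry's own size". A standalone sketch of that layout and walk; the entry sizes here are invented, the real ones come from the struct definitions in bcachefs_format.h:

/* Standalone sketch of iterating variable-size, type-tagged extent entries. */
#include <stdio.h>
#include <stdint.h>
#include <strings.h>	/* ffs() */

enum entry_type { ENTRY_ptr, ENTRY_crc32, ENTRY_stripe_ptr, ENTRY_MAX };

/* Each entry begins with a type byte holding (1 << type), like bch_extent_entry. */
struct entry_hdr { uint8_t type; };

static const unsigned entry_bytes[] = {
	[ENTRY_ptr]	   = 8,
	[ENTRY_crc32]	   = 4,
	[ENTRY_stripe_ptr] = 8,
};

static enum entry_type entry_type(const struct entry_hdr *e)
{
	return ffs(e->type) - 1;	/* __ffs() in the kernel */
}

static const struct entry_hdr *entry_next(const struct entry_hdr *e)
{
	return (const struct entry_hdr *)
		((const uint8_t *) e + entry_bytes[entry_type(e)]);
}

int main(void)
{
	/* Build a small buffer: one crc32 entry, then two ptr entries. */
	uint8_t buf[4 + 8 + 8] = { 0 };

	buf[0]	= 1 << ENTRY_crc32;
	buf[4]	= 1 << ENTRY_ptr;
	buf[12] = 1 << ENTRY_ptr;

	const struct entry_hdr *e   = (const struct entry_hdr *) buf;
	const struct entry_hdr *end = (const struct entry_hdr *) (buf + sizeof(buf));

	for (; e < end; e = entry_next(e))
		printf("entry type %d, %u bytes\n",
		       entry_type(e), entry_bytes[entry_type(e)]);
	return 0;
}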
entry_to_ptr(_entry) : NULL; \ +}) + +#define bkey_extent_entry_for_each_from(_p, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, (_p).end, _entry) + +#define bkey_extent_entry_for_each(_p, _entry) \ + bkey_extent_entry_for_each_from(_p, _entry, _p.start) + +#define __bkey_for_each_ptr(_start, _end, _ptr) \ + for ((_ptr) = (_start); \ + ((_ptr) = __bkey_ptr_next(_ptr, _end)); \ + (_ptr)++) + +#define bkey_ptr_next(_p, _ptr) \ + __bkey_ptr_next(_ptr, (_p).end) + +#define bkey_for_each_ptr(_p, _ptr) \ + __bkey_for_each_ptr(&(_p).start->ptr, (_p).end, _ptr) + +#define __bkey_ptr_next_decode(_k, _end, _ptr, _entry) \ +({ \ + __label__ out; \ + \ + (_ptr).idx = 0; \ + (_ptr).has_ec = false; \ + \ + __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ + switch (extent_entry_type(_entry)) { \ + case BCH_EXTENT_ENTRY_ptr: \ + (_ptr).ptr = _entry->ptr; \ + goto out; \ + case BCH_EXTENT_ENTRY_crc32: \ + case BCH_EXTENT_ENTRY_crc64: \ + case BCH_EXTENT_ENTRY_crc128: \ + (_ptr).crc = bch2_extent_crc_unpack(_k, \ + entry_to_crc(_entry)); \ + break; \ + case BCH_EXTENT_ENTRY_stripe_ptr: \ + (_ptr).ec = _entry->stripe_ptr; \ + (_ptr).has_ec = true; \ + break; \ + default: \ + /* nothing */ \ + break; \ + } \ +out: \ + _entry < (_end); \ +}) + +#define __bkey_for_each_ptr_decode(_k, _start, _end, _ptr, _entry) \ + for ((_ptr).crc = bch2_extent_crc_unpack(_k, NULL), \ + (_entry) = _start; \ + __bkey_ptr_next_decode(_k, _end, _ptr, _entry); \ + (_entry) = extent_entry_next(_entry)) + +#define bkey_for_each_ptr_decode(_k, _p, _ptr, _entry) \ + __bkey_for_each_ptr_decode(_k, (_p).start, (_p).end, \ + _ptr, _entry) + +#define bkey_crc_next(_k, _start, _end, _crc, _iter) \ +({ \ + __bkey_extent_entry_for_each_from(_iter, _end, _iter) \ + if (extent_entry_is_crc(_iter)) { \ + (_crc) = bch2_extent_crc_unpack(_k, \ + entry_to_crc(_iter)); \ + break; \ + } \ + \ + (_iter) < (_end); \ +}) + +#define __bkey_for_each_crc(_k, _start, _end, _crc, _iter) \ + for ((_crc) = bch2_extent_crc_unpack(_k, NULL), \ + (_iter) = (_start); \ + bkey_crc_next(_k, _start, _end, _crc, _iter); \ + (_iter) = extent_entry_next(_iter)) + +#define bkey_for_each_crc(_k, _p, _crc, _iter) \ + __bkey_for_each_crc(_k, (_p).start, (_p).end, _crc, _iter) + +/* Iterate over pointers in KEY_TYPE_extent: */ + +#define extent_for_each_entry_from(_e, _entry, _start) \ + __bkey_extent_entry_for_each_from(_start, \ + extent_entry_last(_e), _entry) + +#define extent_for_each_entry(_e, _entry) \ + extent_for_each_entry_from(_e, _entry, (_e).v->start) + +#define extent_ptr_next(_e, _ptr) \ + __bkey_ptr_next(_ptr, extent_entry_last(_e)) + +#define extent_for_each_ptr(_e, _ptr) \ + __bkey_for_each_ptr(&(_e).v->start->ptr, extent_entry_last(_e), _ptr) + +#define extent_for_each_ptr_decode(_e, _ptr, _entry) \ + __bkey_for_each_ptr_decode((_e).k, (_e).v->start, \ + extent_entry_last(_e), _ptr, _entry) + +/* utility code common to all keys with pointers: */ + +void bch2_mark_io_failure(struct bch_io_failures *, + struct extent_ptr_decoded *); +int bch2_bkey_pick_read_device(struct bch_fs *, struct bkey_s_c, + struct bch_io_failures *, + struct extent_ptr_decoded *); + +/* KEY_TYPE_btree_ptr: */ + +int bch2_btree_ptr_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_btree_ptr_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); + +int bch2_btree_ptr_v2_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void 
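
The iteration macros above all reduce to the same walk: start at the first entry, advance by the current entry's size (a function of its type), and act only on the entries of interest. Below is a hedged user-space sketch of that walk over a packed buffer of variable-size, type-tagged records; the demo_* structs and their layout are invented and far simpler than real extent entries.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

enum demo_type { DEMO_ptr, DEMO_crc32 };

struct demo_ptr   { uint8_t tag; uint8_t dev; uint64_t offset; };
struct demo_crc32 { uint8_t tag; uint32_t csum; };

/* size of an entry is derived from its type tag, as with extent_entry_bytes() */
static size_t demo_entry_bytes(const uint8_t *e)
{
    switch (__builtin_ctz(*e)) {
    case DEMO_ptr:   return sizeof(struct demo_ptr);
    case DEMO_crc32: return sizeof(struct demo_crc32);
    default:         return 0;
    }
}

int main(void)
{
    uint8_t buf[64], *end = buf;
    struct demo_crc32 c = { .tag = 1 << DEMO_crc32, .csum = 0xdeadbeef };
    struct demo_ptr   p = { .tag = 1 << DEMO_ptr, .dev = 2, .offset = 4096 };

    memcpy(end, &c, sizeof(c)); end += sizeof(c);
    memcpy(end, &p, sizeof(p)); end += sizeof(p);

    /* walk the packed entries, advancing by each entry's own size,
     * and act only on the "ptr" entries - the shape of bkey_for_each_ptr */
    for (uint8_t *e = buf; e < end; e += demo_entry_bytes(e))
        if (__builtin_ctz(*e) == DEMO_ptr) {
            struct demo_ptr ptr;

            memcpy(&ptr, e, sizeof(ptr));
            printf("ptr: dev %u offset %llu\n", ptr.dev,
                   (unsigned long long)ptr.offset);
        }
    return 0;
}
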
bch2_btree_ptr_v2_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned, + int, struct bkey_s); + +#define bch2_bkey_ops_btree_ptr ((struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_invalid, \ + .val_to_text = bch2_btree_ptr_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ +}) + +#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_btree_ptr_v2_invalid, \ + .val_to_text = bch2_btree_ptr_v2_to_text, \ + .swab = bch2_ptr_swab, \ + .compat = bch2_btree_ptr_v2_compat, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ + .min_val_size = 40, \ +}) + +/* KEY_TYPE_extent: */ + +bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +#define bch2_bkey_ops_extent ((struct bkey_ops) { \ + .key_invalid = bch2_bkey_ptrs_invalid, \ + .val_to_text = bch2_bkey_ptrs_to_text, \ + .swab = bch2_ptr_swab, \ + .key_normalize = bch2_extent_normalize, \ + .key_merge = bch2_extent_merge, \ + .trans_trigger = bch2_trans_mark_extent, \ + .atomic_trigger = bch2_mark_extent, \ +}) + +/* KEY_TYPE_reservation: */ + +int bch2_reservation_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_reservation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +#define bch2_bkey_ops_reservation ((struct bkey_ops) { \ + .key_invalid = bch2_reservation_invalid, \ + .val_to_text = bch2_reservation_to_text, \ + .key_merge = bch2_reservation_merge, \ + .trans_trigger = bch2_trans_mark_reservation, \ + .atomic_trigger = bch2_mark_reservation, \ + .min_val_size = 8, \ +}) + +/* Extent checksum entries: */ + +bool bch2_can_narrow_extent_crcs(struct bkey_s_c, + struct bch_extent_crc_unpacked); +bool bch2_bkey_narrow_crcs(struct bkey_i *, struct bch_extent_crc_unpacked); +void bch2_extent_crc_append(struct bkey_i *, + struct bch_extent_crc_unpacked); + +/* Generic code for keys with pointers: */ + +static inline bool bkey_is_btree_ptr(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return true; + default: + return false; + } +} + +static inline bool bkey_extent_is_direct_data(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return true; + default: + return false; + } +} + +static inline bool bkey_extent_is_inline_data(const struct bkey *k) +{ + return k->type == KEY_TYPE_inline_data || + k->type == KEY_TYPE_indirect_inline_data; +} + +static inline unsigned bkey_inline_data_offset(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_inline_data: + return sizeof(struct bch_inline_data); + case KEY_TYPE_indirect_inline_data: + return sizeof(struct bch_indirect_inline_data); + default: + BUG(); + } +} + +static inline unsigned bkey_inline_data_bytes(const struct bkey *k) +{ + return bkey_val_bytes(k) - bkey_inline_data_offset(k); +} + +#define bkey_inline_data_p(_k) (((void *) (_k).v) + bkey_inline_data_offset((_k).k)) + +static inline bool bkey_extent_is_data(const struct bkey *k) +{ + return bkey_extent_is_direct_data(k) || + bkey_extent_is_inline_data(k) || + k->type == KEY_TYPE_reflink_p; +} + +/* + * Should extent be counted under inode->i_sectors? 
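
Each bch2_bkey_ops_* definition above is an instance of one pattern: a per-key-type table of callbacks (validate, print, swab, triggers) that generic code dispatches through, so adding a key type only means filling in a struct. A small user-space sketch of the pattern follows, with invented demo_* types and just two callbacks.

#include <stdio.h>

struct demo_key { int type; long val; };

struct demo_key_ops {
    int  (*key_invalid)(const struct demo_key *);
    void (*val_to_text)(const struct demo_key *);
};

enum { DEMO_KEY_extent, DEMO_KEY_reservation, DEMO_KEY_NR };

static int  extent_invalid(const struct demo_key *k) { return k->val < 0; }
static void extent_to_text(const struct demo_key *k) { printf("extent %ld\n", k->val); }
static void resv_to_text(const struct demo_key *k)   { printf("reservation %ld\n", k->val); }

/* the dispatch table: one ops struct per key type, missing callbacks stay NULL */
static const struct demo_key_ops demo_ops[DEMO_KEY_NR] = {
    [DEMO_KEY_extent]      = { .key_invalid = extent_invalid, .val_to_text = extent_to_text },
    [DEMO_KEY_reservation] = { .val_to_text = resv_to_text },
};

int main(void)
{
    struct demo_key k = { .type = DEMO_KEY_extent, .val = 42 };
    const struct demo_key_ops *ops = &demo_ops[k.type];

    if (ops->key_invalid && ops->key_invalid(&k))
        printf("invalid key\n");
    else if (ops->val_to_text)
        ops->val_to_text(&k);
    return 0;
}
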
+ */ +static inline bool bkey_extent_is_allocation(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_extent: + case KEY_TYPE_reservation: + case KEY_TYPE_reflink_p: + case KEY_TYPE_reflink_v: + case KEY_TYPE_inline_data: + case KEY_TYPE_indirect_inline_data: + return true; + default: + return false; + } +} + +static inline bool bkey_extent_is_unwritten(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(ptrs, ptr) + if (ptr->unwritten) + return true; + return false; +} + +static inline bool bkey_extent_is_reservation(struct bkey_s_c k) +{ + return k.k->type == KEY_TYPE_reservation || + bkey_extent_is_unwritten(k); +} + +static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(p, ptr) + ret.devs[ret.nr++] = ptr->dev; + + return ret; +} + +static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(p, ptr) + if (!ptr->cached) + ret.devs[ret.nr++] = ptr->dev; + + return ret; +} + +static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k) +{ + struct bch_devs_list ret = (struct bch_devs_list) { 0 }; + struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + + bkey_for_each_ptr(p, ptr) + if (ptr->cached) + ret.devs[ret.nr++] = ptr->dev; + + return ret; +} + +static inline unsigned bch2_bkey_ptr_data_type(struct bkey_s_c k, const struct bch_extent_ptr *ptr) +{ + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + return BCH_DATA_btree; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + return BCH_DATA_user; + case KEY_TYPE_stripe: { + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + BUG_ON(ptr < s.v->ptrs || + ptr >= s.v->ptrs + s.v->nr_blocks); + + return ptr >= s.v->ptrs + s.v->nr_blocks - s.v->nr_redundant + ? 
BCH_DATA_parity + : BCH_DATA_user; + } + default: + BUG(); + } +} + +unsigned bch2_bkey_nr_ptrs(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_allocated(struct bkey_s_c); +unsigned bch2_bkey_nr_ptrs_fully_allocated(struct bkey_s_c); +bool bch2_bkey_is_incompressible(struct bkey_s_c); +unsigned bch2_bkey_sectors_compressed(struct bkey_s_c); + +unsigned bch2_bkey_replicas(struct bch_fs *, struct bkey_s_c); +unsigned bch2_extent_ptr_desired_durability(struct bch_fs *, struct extent_ptr_decoded *); +unsigned bch2_extent_ptr_durability(struct bch_fs *, struct extent_ptr_decoded *); +unsigned bch2_bkey_durability(struct bch_fs *, struct bkey_s_c); + +void bch2_bkey_drop_device(struct bkey_s, unsigned); +void bch2_bkey_drop_device_noerror(struct bkey_s, unsigned); + +const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c, unsigned); + +static inline struct bch_extent_ptr *bch2_bkey_has_device(struct bkey_s k, unsigned dev) +{ + return (void *) bch2_bkey_has_device_c(k.s_c, dev); +} + +bool bch2_bkey_has_target(struct bch_fs *, struct bkey_s_c, unsigned); + +void bch2_bkey_extent_entry_drop(struct bkey_i *, union bch_extent_entry *); + +static inline void bch2_bkey_append_ptr(struct bkey_i *k, struct bch_extent_ptr ptr) +{ + EBUG_ON(bch2_bkey_has_device(bkey_i_to_s(k), ptr.dev)); + + switch (k->k.type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + case KEY_TYPE_extent: + EBUG_ON(bkey_val_u64s(&k->k) >= BKEY_EXTENT_VAL_U64s_MAX); + + ptr.type = 1 << BCH_EXTENT_ENTRY_ptr; + + memcpy((void *) &k->v + bkey_val_bytes(&k->k), + &ptr, + sizeof(ptr)); + k->k.u64s++; + break; + default: + BUG(); + } +} + +void bch2_extent_ptr_decoded_append(struct bkey_i *, + struct extent_ptr_decoded *); +union bch_extent_entry *bch2_bkey_drop_ptr_noerror(struct bkey_s, + struct bch_extent_ptr *); +union bch_extent_entry *bch2_bkey_drop_ptr(struct bkey_s, + struct bch_extent_ptr *); + +#define bch2_bkey_drop_ptrs(_k, _ptr, _cond) \ +do { \ + struct bkey_ptrs _ptrs = bch2_bkey_ptrs(_k); \ + \ + _ptr = &_ptrs.start->ptr; \ + \ + while ((_ptr = bkey_ptr_next(_ptrs, _ptr))) { \ + if (_cond) { \ + _ptr = (void *) bch2_bkey_drop_ptr(_k, _ptr); \ + _ptrs = bch2_bkey_ptrs(_k); \ + continue; \ + } \ + \ + (_ptr)++; \ + } \ +} while (0) + +bool bch2_bkey_matches_ptr(struct bch_fs *, struct bkey_s_c, + struct bch_extent_ptr, u64); +bool bch2_extents_match(struct bkey_s_c, struct bkey_s_c); +struct bch_extent_ptr * +bch2_extent_has_ptr(struct bkey_s_c, struct extent_ptr_decoded, struct bkey_s); + +void bch2_extent_ptr_set_cached(struct bkey_s, struct bch_extent_ptr *); + +bool bch2_extent_normalize(struct bch_fs *, struct bkey_s); +void bch2_bkey_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +int bch2_bkey_ptrs_invalid(const struct bch_fs *, struct bkey_s_c, + unsigned, struct printbuf *); + +void bch2_ptr_swab(struct bkey_s); + +/* Generic extent code: */ + +enum bch_extent_overlap { + BCH_EXTENT_OVERLAP_ALL = 0, + BCH_EXTENT_OVERLAP_BACK = 1, + BCH_EXTENT_OVERLAP_FRONT = 2, + BCH_EXTENT_OVERLAP_MIDDLE = 3, +}; + +/* Returns how k overlaps with m */ +static inline enum bch_extent_overlap bch2_extent_overlap(const struct bkey *k, + const struct bkey *m) +{ + int cmp1 = bkey_lt(k->p, m->p); + int cmp2 = bkey_gt(bkey_start_pos(k), bkey_start_pos(m)); + + return (cmp1 << 1) + cmp2; +} + +int bch2_cut_front_s(struct bpos, struct bkey_s); +int bch2_cut_back_s(struct bpos, struct bkey_s); + +static inline void bch2_cut_front(struct bpos where, struct bkey_i *k) +{ + bch2_cut_front_s(where, 
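
bch2_extent_overlap() near the end of this header packs two comparisons into a two-bit code, which is exactly the BCH_EXTENT_OVERLAP_* enum: bit 1 says whether k ends before m, bit 0 whether k starts after m. A user-space sketch with plain [start, end) sector ranges (the real code compares bkey positions) makes the mapping visible; all names here are invented for the example.

#include <stdio.h>

enum demo_overlap { OV_ALL = 0, OV_BACK = 1, OV_FRONT = 2, OV_MIDDLE = 3 };

static const char * const demo_overlap_names[] = { "ALL", "BACK", "FRONT", "MIDDLE" };

struct demo_extent { unsigned long long start, end; };

/* how does k overlap with m? */
static enum demo_overlap demo_overlap(struct demo_extent k, struct demo_extent m)
{
    int cmp1 = k.end < m.end;      /* k ends before m ends   -> bit 1 */
    int cmp2 = k.start > m.start;  /* k starts after m starts -> bit 0 */

    return (cmp1 << 1) + cmp2;
}

int main(void)
{
    struct demo_extent m = { 10, 20 };

    printf("%s\n", demo_overlap_names[demo_overlap((struct demo_extent){  0, 30 }, m)]); /* ALL */
    printf("%s\n", demo_overlap_names[demo_overlap((struct demo_extent){ 15, 30 }, m)]); /* BACK */
    printf("%s\n", demo_overlap_names[demo_overlap((struct demo_extent){  0, 15 }, m)]); /* FRONT */
    printf("%s\n", demo_overlap_names[demo_overlap((struct demo_extent){ 12, 18 }, m)]); /* MIDDLE */
    return 0;
}
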
bkey_i_to_s(k)); +} + +static inline void bch2_cut_back(struct bpos where, struct bkey_i *k) +{ + bch2_cut_back_s(where, bkey_i_to_s(k)); +} + +/** + * bch_key_resize - adjust size of @k + * + * bkey_start_offset(k) will be preserved, modifies where the extent ends + */ +static inline void bch2_key_resize(struct bkey *k, unsigned new_size) +{ + k->p.offset -= k->size; + k->p.offset += new_size; + k->size = new_size; +} + +/* + * In extent_sort_fix_overlapping(), insert_fixup_extent(), + * extent_merge_inline() - we're modifying keys in place that are packed. To do + * that we have to unpack the key, modify the unpacked key - then this + * copies/repacks the unpacked to the original as necessary. + */ +static inline void extent_save(struct btree *b, struct bkey_packed *dst, + struct bkey *src) +{ + struct bkey_format *f = &b->format; + struct bkey_i *dst_unpacked; + + if ((dst_unpacked = packed_to_bkey(dst))) + dst_unpacked->k = *src; + else + BUG_ON(!bch2_bkey_pack_key(dst, src, f)); +} + +#endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h new file mode 100644 index 000000000..43d6c341e --- /dev/null +++ b/fs/bcachefs/extents_types.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_EXTENTS_TYPES_H +#define _BCACHEFS_EXTENTS_TYPES_H + +#include "bcachefs_format.h" + +struct bch_extent_crc_unpacked { + u32 compressed_size; + u32 uncompressed_size; + u32 live_size; + + u8 csum_type; + u8 compression_type; + + u16 offset; + + u16 nonce; + + struct bch_csum csum; +}; + +struct extent_ptr_decoded { + unsigned idx; + bool has_ec; + struct bch_extent_crc_unpacked crc; + struct bch_extent_ptr ptr; + struct bch_extent_stripe_ptr ec; +}; + +struct bch_io_failures { + u8 nr; + struct bch_dev_io_failures { + u8 dev; + u8 idx; + u8 nr_failed; + u8 nr_retries; + } devs[BCH_REPLICAS_MAX]; +}; + +#endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h new file mode 100644 index 000000000..05429c963 --- /dev/null +++ b/fs/bcachefs/eytzinger.h @@ -0,0 +1,281 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _EYTZINGER_H +#define _EYTZINGER_H + +#include +#include + +#include "util.h" + +/* + * Traversal for trees in eytzinger layout - a full binary tree layed out in an + * array + */ + +/* + * One based indexing version: + * + * With one based indexing each level of the tree starts at a power of two - + * good for cacheline alignment: + */ + +static inline unsigned eytzinger1_child(unsigned i, unsigned child) +{ + EBUG_ON(child > 1); + + return (i << 1) + child; +} + +static inline unsigned eytzinger1_left_child(unsigned i) +{ + return eytzinger1_child(i, 0); +} + +static inline unsigned eytzinger1_right_child(unsigned i) +{ + return eytzinger1_child(i, 1); +} + +static inline unsigned eytzinger1_first(unsigned size) +{ + return rounddown_pow_of_two(size); +} + +static inline unsigned eytzinger1_last(unsigned size) +{ + return rounddown_pow_of_two(size + 1) - 1; +} + +/* + * eytzinger1_next() and eytzinger1_prev() have the nice properties that + * + * eytzinger1_next(0) == eytzinger1_first()) + * eytzinger1_prev(0) == eytzinger1_last()) + * + * eytzinger1_prev(eytzinger1_first()) == 0 + * eytzinger1_next(eytzinger1_last()) == 0 + */ + +static inline unsigned eytzinger1_next(unsigned i, unsigned size) +{ + EBUG_ON(i > size); + + if (eytzinger1_right_child(i) <= size) { + i = eytzinger1_right_child(i); + + i <<= __fls(size + 1) - __fls(i); + i >>= i > size; + } else { + i >>= ffz(i) + 
1; + } + + return i; +} + +static inline unsigned eytzinger1_prev(unsigned i, unsigned size) +{ + EBUG_ON(i > size); + + if (eytzinger1_left_child(i) <= size) { + i = eytzinger1_left_child(i) + 1; + + i <<= __fls(size + 1) - __fls(i); + i -= 1; + i >>= i > size; + } else { + i >>= __ffs(i) + 1; + } + + return i; +} + +static inline unsigned eytzinger1_extra(unsigned size) +{ + return (size + 1 - rounddown_pow_of_two(size)) << 1; +} + +static inline unsigned __eytzinger1_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ + unsigned b = __fls(i); + unsigned shift = __fls(size) - b; + int s; + + EBUG_ON(!i || i > size); + + i ^= 1U << b; + i <<= 1; + i |= 1; + i <<= shift; + + /* + * sign bit trick: + * + * if (i > extra) + * i -= (i - extra) >> 1; + */ + s = extra - i; + i += (s >> 1) & (s >> 31); + + return i; +} + +static inline unsigned __inorder_to_eytzinger1(unsigned i, unsigned size, + unsigned extra) +{ + unsigned shift; + int s; + + EBUG_ON(!i || i > size); + + /* + * sign bit trick: + * + * if (i > extra) + * i += i - extra; + */ + s = extra - i; + i -= s & (s >> 31); + + shift = __ffs(i); + + i >>= shift + 1; + i |= 1U << (__fls(size) - shift); + + return i; +} + +static inline unsigned eytzinger1_to_inorder(unsigned i, unsigned size) +{ + return __eytzinger1_to_inorder(i, size, eytzinger1_extra(size)); +} + +static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size) +{ + return __inorder_to_eytzinger1(i, size, eytzinger1_extra(size)); +} + +#define eytzinger1_for_each(_i, _size) \ + for ((_i) = eytzinger1_first((_size)); \ + (_i) != 0; \ + (_i) = eytzinger1_next((_i), (_size))) + +/* Zero based indexing version: */ + +static inline unsigned eytzinger0_child(unsigned i, unsigned child) +{ + EBUG_ON(child > 1); + + return (i << 1) + 1 + child; +} + +static inline unsigned eytzinger0_left_child(unsigned i) +{ + return eytzinger0_child(i, 0); +} + +static inline unsigned eytzinger0_right_child(unsigned i) +{ + return eytzinger0_child(i, 1); +} + +static inline unsigned eytzinger0_first(unsigned size) +{ + return eytzinger1_first(size) - 1; +} + +static inline unsigned eytzinger0_last(unsigned size) +{ + return eytzinger1_last(size) - 1; +} + +static inline unsigned eytzinger0_next(unsigned i, unsigned size) +{ + return eytzinger1_next(i + 1, size) - 1; +} + +static inline unsigned eytzinger0_prev(unsigned i, unsigned size) +{ + return eytzinger1_prev(i + 1, size) - 1; +} + +static inline unsigned eytzinger0_extra(unsigned size) +{ + return eytzinger1_extra(size); +} + +static inline unsigned __eytzinger0_to_inorder(unsigned i, unsigned size, + unsigned extra) +{ + return __eytzinger1_to_inorder(i + 1, size, extra) - 1; +} + +static inline unsigned __inorder_to_eytzinger0(unsigned i, unsigned size, + unsigned extra) +{ + return __inorder_to_eytzinger1(i + 1, size, extra) - 1; +} + +static inline unsigned eytzinger0_to_inorder(unsigned i, unsigned size) +{ + return __eytzinger0_to_inorder(i, size, eytzinger0_extra(size)); +} + +static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) +{ + return __inorder_to_eytzinger0(i, size, eytzinger0_extra(size)); +} + +#define eytzinger0_for_each(_i, _size) \ + for ((_i) = eytzinger0_first((_size)); \ + (_i) != -1; \ + (_i) = eytzinger0_next((_i), (_size))) + +typedef int (*eytzinger_cmp_fn)(const void *l, const void *r, size_t size); + +/* return greatest node <= @search, or -1 if not found */ +static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size, + eytzinger_cmp_fn cmp, const void 
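
The eytzinger1_* helpers above traverse a binary search tree stored implicitly in an array, one-based so that each level starts at a power of two (good for cacheline alignment), and eytzinger1_first()/eytzinger1_next() visit the nodes in sorted order with no stack and no parent pointers. Below is a user-space sketch under stated assumptions: invented names, 32-bit indices, and fls32()/ffz32() standing in for the kernel's __fls()/ffz(); it builds an eytzinger array from sorted input and checks that the traversal comes back out sorted.

#include <assert.h>
#include <stdio.h>

static unsigned fls32(unsigned x) { return 31 - (unsigned)__builtin_clz(x); }
static unsigned ffz32(unsigned x) { return (unsigned)__builtin_ctz(~x); }

static unsigned eytz1_first(unsigned size) { return 1U << fls32(size); }

static unsigned eytz1_next(unsigned i, unsigned size)
{
    if (2 * i + 1 <= size) {            /* right child exists: go right, then far left */
        i = 2 * i + 1;
        i <<= fls32(size + 1) - fls32(i);
        i >>= i > size;
    } else {                            /* climb until we arrive from a left child */
        i >>= ffz32(i) + 1;
    }
    return i;
}

/* fill eytz[1..n] from sorted[] by an in-order walk of the implicit tree */
static void eytz_build(const int *sorted, int *eytz, unsigned i, unsigned n, unsigned *pos)
{
    if (i > n)
        return;
    eytz_build(sorted, eytz, 2 * i, n, pos);
    eytz[i] = sorted[(*pos)++];
    eytz_build(sorted, eytz, 2 * i + 1, n, pos);
}

int main(void)
{
    int sorted[] = { 1, 3, 5, 7, 9, 11 };
    unsigned n = 6, pos = 0, prev = 0;
    int eytz[8];

    eytz_build(sorted, eytz, 1, n, &pos);

    /* the stackless traversal visits the keys in ascending order */
    for (unsigned i = eytz1_first(n); i != 0; i = eytz1_next(i, n)) {
        printf("%d ", eytz[i]);
        assert((unsigned)eytz[i] > prev);
        prev = eytz[i];
    }
    printf("\n");
    return 0;
}
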
*search) +{ + unsigned i, n = 0; + + if (!nr) + return -1; + + do { + i = n; + n = eytzinger0_child(i, cmp(search, base + i * size, size) >= 0); + } while (n < nr); + + if (n & 1) { + /* @i was greater than @search, return previous node: */ + + if (i == eytzinger0_first(nr)) + return -1; + + return eytzinger0_prev(i, nr); + } else { + return i; + } +} + +#define eytzinger0_find(base, nr, size, _cmp, search) \ +({ \ + void *_base = (base); \ + void *_search = (search); \ + size_t _nr = (nr); \ + size_t _size = (size); \ + size_t _i = 0; \ + int _res; \ + \ + while (_i < _nr && \ + (_res = _cmp(_search, _base + _i * _size, _size))) \ + _i = eytzinger0_child(_i, _res > 0); \ + _i; \ +}) + +void eytzinger0_sort(void *, size_t, size_t, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)); + +#endif /* _EYTZINGER_H */ diff --git a/fs/bcachefs/fifo.h b/fs/bcachefs/fifo.h new file mode 100644 index 000000000..66b945be1 --- /dev/null +++ b/fs/bcachefs/fifo.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FIFO_H +#define _BCACHEFS_FIFO_H + +#include "util.h" + +#define FIFO(type) \ +struct { \ + size_t front, back, size, mask; \ + type *data; \ +} + +#define DECLARE_FIFO(type, name) FIFO(type) name + +#define fifo_buf_size(fifo) \ + ((fifo)->size \ + ? roundup_pow_of_two((fifo)->size) * sizeof((fifo)->data[0]) \ + : 0) + +#define init_fifo(fifo, _size, _gfp) \ +({ \ + (fifo)->front = (fifo)->back = 0; \ + (fifo)->size = (_size); \ + (fifo)->mask = (fifo)->size \ + ? roundup_pow_of_two((fifo)->size) - 1 \ + : 0; \ + (fifo)->data = kvpmalloc(fifo_buf_size(fifo), (_gfp)); \ +}) + +#define free_fifo(fifo) \ +do { \ + kvpfree((fifo)->data, fifo_buf_size(fifo)); \ + (fifo)->data = NULL; \ +} while (0) + +#define fifo_swap(l, r) \ +do { \ + swap((l)->front, (r)->front); \ + swap((l)->back, (r)->back); \ + swap((l)->size, (r)->size); \ + swap((l)->mask, (r)->mask); \ + swap((l)->data, (r)->data); \ +} while (0) + +#define fifo_move(dest, src) \ +do { \ + typeof(*((dest)->data)) _t; \ + while (!fifo_full(dest) && \ + fifo_pop(src, _t)) \ + fifo_push(dest, _t); \ +} while (0) + +#define fifo_used(fifo) (((fifo)->back - (fifo)->front)) +#define fifo_free(fifo) ((fifo)->size - fifo_used(fifo)) + +#define fifo_empty(fifo) ((fifo)->front == (fifo)->back) +#define fifo_full(fifo) (fifo_used(fifo) == (fifo)->size) + +#define fifo_peek_front(fifo) ((fifo)->data[(fifo)->front & (fifo)->mask]) +#define fifo_peek_back(fifo) ((fifo)->data[((fifo)->back - 1) & (fifo)->mask]) + +#define fifo_entry_idx_abs(fifo, p) \ + ((((p) >= &fifo_peek_front(fifo) \ + ? (fifo)->front : (fifo)->back) & ~(fifo)->mask) + \ + (((p) - (fifo)->data))) + +#define fifo_entry_idx(fifo, p) (((p) - &fifo_peek_front(fifo)) & (fifo)->mask) +#define fifo_idx_entry(fifo, i) ((fifo)->data[((fifo)->front + (i)) & (fifo)->mask]) + +#define fifo_push_back_ref(f) \ + (fifo_full((f)) ? NULL : &(f)->data[(f)->back++ & (f)->mask]) + +#define fifo_push_front_ref(f) \ + (fifo_full((f)) ? 
NULL : &(f)->data[--(f)->front & (f)->mask]) + +#define fifo_push_back(fifo, new) \ +({ \ + typeof((fifo)->data) _r = fifo_push_back_ref(fifo); \ + if (_r) \ + *_r = (new); \ + _r != NULL; \ +}) + +#define fifo_push_front(fifo, new) \ +({ \ + typeof((fifo)->data) _r = fifo_push_front_ref(fifo); \ + if (_r) \ + *_r = (new); \ + _r != NULL; \ +}) + +#define fifo_pop_front(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) \ + (i) = (fifo)->data[(fifo)->front++ & (fifo)->mask]; \ + _r; \ +}) + +#define fifo_pop_back(fifo, i) \ +({ \ + bool _r = !fifo_empty((fifo)); \ + if (_r) \ + (i) = (fifo)->data[--(fifo)->back & (fifo)->mask]; \ + _r; \ +}) + +#define fifo_push_ref(fifo) fifo_push_back_ref(fifo) +#define fifo_push(fifo, i) fifo_push_back(fifo, (i)) +#define fifo_pop(fifo, i) fifo_pop_front(fifo, (i)) +#define fifo_peek(fifo) fifo_peek_front(fifo) + +#define fifo_for_each_entry(_entry, _fifo, _iter) \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ + ((_iter != (_fifo)->back) && \ + (_entry = (_fifo)->data[(_iter) & (_fifo)->mask], true)); \ + (_iter)++) + +#define fifo_for_each_entry_ptr(_ptr, _fifo, _iter) \ + for (typecheck(typeof((_fifo)->front), _iter), \ + (_iter) = (_fifo)->front; \ + ((_iter != (_fifo)->back) && \ + (_ptr = &(_fifo)->data[(_iter) & (_fifo)->mask], true)); \ + (_iter)++) + +#endif /* _BCACHEFS_FIFO_H */ diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c new file mode 100644 index 000000000..bb5305441 --- /dev/null +++ b/fs/bcachefs/fs-common.c @@ -0,0 +1,501 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "acl.h" +#include "btree_update.h" +#include "dirent.h" +#include "fs-common.h" +#include "inode.h" +#include "subvolume.h" +#include "xattr.h" + +#include + +static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) +{ + return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; +} + +int bch2_create_trans(struct btree_trans *trans, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *new_inode, + const struct qstr *name, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct posix_acl *default_acl, + struct posix_acl *acl, + subvol_inum snapshot_src, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; + subvol_inum new_inum = dir; + u64 now = bch2_current_time(c); + u64 cpu = raw_smp_processor_id(); + u64 dir_target; + u32 snapshot; + unsigned dir_type = mode_to_type(mode); + int ret; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &snapshot); + if (ret) + goto err; + + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + if (ret) + goto err; + + if (!(flags & BCH_CREATE_SNAPSHOT)) { + /* Normal create path - allocate a new inode: */ + bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + + if (flags & BCH_CREATE_TMPFILE) + new_inode->bi_flags |= BCH_INODE_UNLINKED; + + ret = bch2_inode_create(trans, &inode_iter, new_inode, snapshot, cpu); + if (ret) + goto err; + + snapshot_src = (subvol_inum) { 0 }; + } else { + /* + * Creating a snapshot - we're not allocating a new inode, but + * we do have to lookup the root inode of the subvolume we're + * snapshotting and update it (in the new snapshot): + */ + + if (!snapshot_src.inum) { + /* Inode wasn't specified, just snapshot: */ + struct bch_subvolume s; + + ret = bch2_subvolume_get(trans, snapshot_src.subvol, true, + BTREE_ITER_CACHED, &s); + if (ret) + goto err; + 
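
The FIFO macros in fifo.h above rely on free-running front/back counters with a power-of-two mask: the counters never wrap to the buffer bounds themselves, the mask handles that at indexing time, and "used" is simply back - front. A minimal fixed-size user-space sketch of the same scheme follows (demo_* names invented, no dynamic allocation); rounding the size to a power of two is what makes the mask trick valid.

#include <assert.h>
#include <stdio.h>

#define DEMO_FIFO_SIZE 8        /* must be a power of two for the mask trick */

struct demo_fifo {
    size_t front, back, mask;
    int data[DEMO_FIFO_SIZE];
};

static size_t demo_fifo_used(const struct demo_fifo *f)  { return f->back - f->front; }
static int    demo_fifo_full(const struct demo_fifo *f)  { return demo_fifo_used(f) == DEMO_FIFO_SIZE; }
static int    demo_fifo_empty(const struct demo_fifo *f) { return f->front == f->back; }

static int demo_fifo_push(struct demo_fifo *f, int v)
{
    if (demo_fifo_full(f))
        return 0;
    f->data[f->back++ & f->mask] = v;   /* counter keeps running, mask does the wrap */
    return 1;
}

static int demo_fifo_pop(struct demo_fifo *f, int *v)
{
    if (demo_fifo_empty(f))
        return 0;
    *v = f->data[f->front++ & f->mask];
    return 1;
}

int main(void)
{
    struct demo_fifo f = { .mask = DEMO_FIFO_SIZE - 1 };
    int v;

    for (int i = 0; i < 20; i++) {      /* counters run well past the buffer size */
        demo_fifo_push(&f, i);
        assert(demo_fifo_pop(&f, &v) && v == i);
    }
    printf("used after drain: %zu\n", demo_fifo_used(&f));
    return 0;
}
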
+ snapshot_src.inum = le64_to_cpu(s.inode); + } + + ret = bch2_inode_peek(trans, &inode_iter, new_inode, snapshot_src, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (new_inode->bi_subvol != snapshot_src.subvol) { + /* Not a subvolume root: */ + ret = -EINVAL; + goto err; + } + + /* + * If we're not root, we have to own the subvolume being + * snapshotted: + */ + if (uid && new_inode->bi_uid != uid) { + ret = -EPERM; + goto err; + } + + flags |= BCH_CREATE_SUBVOL; + } + + new_inum.inum = new_inode->bi_inum; + dir_target = new_inode->bi_inum; + + if (flags & BCH_CREATE_SUBVOL) { + u32 new_subvol, dir_snapshot; + + ret = bch2_subvolume_create(trans, new_inode->bi_inum, + snapshot_src.subvol, + &new_subvol, &snapshot, + (flags & BCH_CREATE_SNAPSHOT_RO) != 0); + if (ret) + goto err; + + new_inode->bi_parent_subvol = dir.subvol; + new_inode->bi_subvol = new_subvol; + new_inum.subvol = new_subvol; + dir_target = new_subvol; + dir_type = DT_SUBVOL; + + ret = bch2_subvolume_get_snapshot(trans, dir.subvol, &dir_snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(&dir_iter); + if (ret) + goto err; + } + + if (!(flags & BCH_CREATE_SNAPSHOT)) { + if (default_acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + default_acl, ACL_TYPE_DEFAULT); + if (ret) + goto err; + } + + if (acl) { + ret = bch2_set_acl_trans(trans, new_inum, new_inode, + acl, ACL_TYPE_ACCESS); + if (ret) + goto err; + } + } + + if (!(flags & BCH_CREATE_TMPFILE)) { + struct bch_hash_info dir_hash = bch2_hash_info_init(c, dir_u); + u64 dir_offset; + + if (is_subdir_for_nlink(new_inode)) + dir_u->bi_nlink++; + dir_u->bi_mtime = dir_u->bi_ctime = now; + + ret = bch2_inode_write(trans, &dir_iter, dir_u); + if (ret) + goto err; + + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, + &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + new_inode->bi_dir = dir_u->bi_inum; + new_inode->bi_dir_offset = dir_offset; + } + } + + inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS; + bch2_btree_iter_set_snapshot(&inode_iter, snapshot); + + ret = bch2_btree_iter_traverse(&inode_iter) ?: + bch2_inode_write(trans, &inode_iter, new_inode); +err: + bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(trans, &dir_iter); + return ret; +} + +int bch2_link_trans(struct btree_trans *trans, + subvol_inum dir, struct bch_inode_unpacked *dir_u, + subvol_inum inum, struct bch_inode_unpacked *inode_u, + const struct qstr *name) +{ + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; + struct bch_hash_info dir_hash; + u64 now = bch2_current_time(c); + u64 dir_offset = 0; + int ret; + + if (dir.subvol != inum.subvol) + return -EXDEV; + + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + goto err; + + inode_u->bi_ctime = now; + ret = bch2_inode_nlink_inc(inode_u); + if (ret) + return ret; + + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + if (ret) + goto err; + + if (bch2_reinherit_attrs(inode_u, dir_u)) { + ret = -EXDEV; + goto err; + } + + dir_u->bi_mtime = dir_u->bi_ctime = now; + + dir_hash = bch2_hash_info_init(c, dir_u); + + ret = bch2_dirent_create(trans, dir, &dir_hash, + mode_to_type(inode_u->bi_mode), + name, inum.inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + goto err; + + if (c->sb.version >= 
bcachefs_metadata_version_inode_backpointers) { + inode_u->bi_dir = dir.inum; + inode_u->bi_dir_offset = dir_offset; + } + + ret = bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); +err: + bch2_trans_iter_exit(trans, &dir_iter); + bch2_trans_iter_exit(trans, &inode_iter); + return ret; +} + +int bch2_unlink_trans(struct btree_trans *trans, + subvol_inum dir, + struct bch_inode_unpacked *dir_u, + struct bch_inode_unpacked *inode_u, + const struct qstr *name, + bool deleting_snapshot) +{ + struct bch_fs *c = trans->c; + struct btree_iter dir_iter = { NULL }; + struct btree_iter dirent_iter = { NULL }; + struct btree_iter inode_iter = { NULL }; + struct bch_hash_info dir_hash; + subvol_inum inum; + u64 now = bch2_current_time(c); + struct bkey_s_c k; + int ret; + + ret = bch2_inode_peek(trans, &dir_iter, dir_u, dir, BTREE_ITER_INTENT); + if (ret) + goto err; + + dir_hash = bch2_hash_info_init(c, dir_u); + + ret = __bch2_dirent_lookup_trans(trans, &dirent_iter, dir, &dir_hash, + name, &inum, BTREE_ITER_INTENT); + if (ret) + goto err; + + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (!deleting_snapshot && S_ISDIR(inode_u->bi_mode)) { + ret = bch2_empty_dir_trans(trans, inum); + if (ret) + goto err; + } + + if (deleting_snapshot && !inode_u->bi_subvol) { + ret = -BCH_ERR_ENOENT_not_subvol; + goto err; + } + + if (deleting_snapshot || inode_u->bi_subvol) { + ret = bch2_subvolume_unlink(trans, inode_u->bi_subvol); + if (ret) + goto err; + + k = bch2_btree_iter_peek_slot(&dirent_iter); + ret = bkey_err(k); + if (ret) + goto err; + + /* + * If we're deleting a subvolume, we need to really delete the + * dirent, not just emit a whiteout in the current snapshot: + */ + bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(&dirent_iter); + if (ret) + goto err; + } else { + bch2_inode_nlink_dec(trans, inode_u); + } + + if (inode_u->bi_dir == dirent_iter.pos.inode && + inode_u->bi_dir_offset == dirent_iter.pos.offset) { + inode_u->bi_dir = 0; + inode_u->bi_dir_offset = 0; + } + + dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; + dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash, &dirent_iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_inode_write(trans, &dir_iter, dir_u) ?: + bch2_inode_write(trans, &inode_iter, inode_u); +err: + bch2_trans_iter_exit(trans, &inode_iter); + bch2_trans_iter_exit(trans, &dirent_iter); + bch2_trans_iter_exit(trans, &dir_iter); + return ret; +} + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, + struct bch_inode_unpacked *src_u) +{ + u64 src, dst; + unsigned id; + bool ret = false; + + for (id = 0; id < Inode_opt_nr; id++) { + /* Skip attributes that were explicitly set on this inode */ + if (dst_u->bi_fields_set & (1 << id)) + continue; + + src = bch2_inode_opt_get(src_u, id); + dst = bch2_inode_opt_get(dst_u, id); + + if (src == dst) + continue; + + bch2_inode_opt_set(dst_u, id, src); + ret = true; + } + + return ret; +} + +int bch2_rename_trans(struct btree_trans *trans, + subvol_inum src_dir, struct bch_inode_unpacked *src_dir_u, + subvol_inum dst_dir, struct bch_inode_unpacked *dst_dir_u, + struct bch_inode_unpacked *src_inode_u, + struct bch_inode_unpacked *dst_inode_u, + const struct qstr *src_name, + const struct qstr *dst_name, + enum bch_rename_mode mode) +{ + struct bch_fs *c = trans->c; + struct btree_iter 
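
The transaction helpers here chain their final updates with the GNU "?:" extension (for example bch2_inode_write(...) ?: bch2_inode_write(...)): each step returns 0 on success or a negative errno, and the chain stops at the first nonzero value without evaluating the remaining steps. A hedged sketch of just that idiom, with invented step_*() functions, compiled as GNU C:

#include <errno.h>
#include <stdio.h>

static int step_ok(void)    { printf("step_ok\n");    return 0; }
static int step_fail(void)  { printf("step_fail\n");  return -EINVAL; }
static int step_never(void) { printf("step_never\n"); return 0; }

int main(void)
{
    /* "a ?: b" yields a if a is nonzero, otherwise evaluates and yields b */
    int ret = step_ok() ?: step_fail() ?: step_never();

    printf("ret = %d\n", ret);  /* -EINVAL; step_never() was never called */
    return 0;
}
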
src_dir_iter = { NULL }; + struct btree_iter dst_dir_iter = { NULL }; + struct btree_iter src_inode_iter = { NULL }; + struct btree_iter dst_inode_iter = { NULL }; + struct bch_hash_info src_hash, dst_hash; + subvol_inum src_inum, dst_inum; + u64 src_offset, dst_offset; + u64 now = bch2_current_time(c); + int ret; + + ret = bch2_inode_peek(trans, &src_dir_iter, src_dir_u, src_dir, + BTREE_ITER_INTENT); + if (ret) + goto err; + + src_hash = bch2_hash_info_init(c, src_dir_u); + + if (dst_dir.inum != src_dir.inum || + dst_dir.subvol != src_dir.subvol) { + ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, + BTREE_ITER_INTENT); + if (ret) + goto err; + + dst_hash = bch2_hash_info_init(c, dst_dir_u); + } else { + dst_dir_u = src_dir_u; + dst_hash = src_hash; + } + + ret = bch2_dirent_rename(trans, + src_dir, &src_hash, + dst_dir, &dst_hash, + src_name, &src_inum, &src_offset, + dst_name, &dst_inum, &dst_offset, + mode); + if (ret) + goto err; + + ret = bch2_inode_peek(trans, &src_inode_iter, src_inode_u, src_inum, + BTREE_ITER_INTENT); + if (ret) + goto err; + + if (dst_inum.inum) { + ret = bch2_inode_peek(trans, &dst_inode_iter, dst_inode_u, dst_inum, + BTREE_ITER_INTENT); + if (ret) + goto err; + } + + if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) { + src_inode_u->bi_dir = dst_dir_u->bi_inum; + src_inode_u->bi_dir_offset = dst_offset; + + if (mode == BCH_RENAME_EXCHANGE) { + dst_inode_u->bi_dir = src_dir_u->bi_inum; + dst_inode_u->bi_dir_offset = src_offset; + } + + if (mode == BCH_RENAME_OVERWRITE && + dst_inode_u->bi_dir == dst_dir_u->bi_inum && + dst_inode_u->bi_dir_offset == src_offset) { + dst_inode_u->bi_dir = 0; + dst_inode_u->bi_dir_offset = 0; + } + } + + if (mode == BCH_RENAME_OVERWRITE) { + if (S_ISDIR(src_inode_u->bi_mode) != + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -ENOTDIR; + goto err; + } + + if (S_ISDIR(dst_inode_u->bi_mode) && + bch2_empty_dir_trans(trans, dst_inum)) { + ret = -ENOTEMPTY; + goto err; + } + } + + if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && + S_ISDIR(src_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } + + if (is_subdir_for_nlink(src_inode_u)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } + + if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { + dst_dir_u->bi_nlink--; + src_dir_u->bi_nlink += mode == BCH_RENAME_EXCHANGE; + } + + if (mode == BCH_RENAME_OVERWRITE) + bch2_inode_nlink_dec(trans, dst_inode_u); + + src_dir_u->bi_mtime = now; + src_dir_u->bi_ctime = now; + + if (src_dir.inum != dst_dir.inum) { + dst_dir_u->bi_mtime = now; + dst_dir_u->bi_ctime = now; + } + + src_inode_u->bi_ctime = now; + + if (dst_inum.inum) + dst_inode_u->bi_ctime = now; + + ret = bch2_inode_write(trans, &src_dir_iter, src_dir_u) ?: + (src_dir.inum != dst_dir.inum + ? bch2_inode_write(trans, &dst_dir_iter, dst_dir_u) + : 0) ?: + bch2_inode_write(trans, &src_inode_iter, src_inode_u) ?: + (dst_inum.inum + ? 
bch2_inode_write(trans, &dst_inode_iter, dst_inode_u) + : 0); +err: + bch2_trans_iter_exit(trans, &dst_inode_iter); + bch2_trans_iter_exit(trans, &src_inode_iter); + bch2_trans_iter_exit(trans, &dst_dir_iter); + bch2_trans_iter_exit(trans, &src_dir_iter); + return ret; +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/fs-common.h new file mode 100644 index 000000000..dde237859 --- /dev/null +++ b/fs/bcachefs/fs-common.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_COMMON_H +#define _BCACHEFS_FS_COMMON_H + +struct posix_acl; + +#define BCH_CREATE_TMPFILE (1U << 0) +#define BCH_CREATE_SUBVOL (1U << 1) +#define BCH_CREATE_SNAPSHOT (1U << 2) +#define BCH_CREATE_SNAPSHOT_RO (1U << 3) + +int bch2_create_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + uid_t, gid_t, umode_t, dev_t, + struct posix_acl *, + struct posix_acl *, + subvol_inum, unsigned); + +int bch2_link_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, + const struct qstr *); + +int bch2_unlink_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, bool); + +int bch2_rename_trans(struct btree_trans *, + subvol_inum, struct bch_inode_unpacked *, + subvol_inum, struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + struct bch_inode_unpacked *, + const struct qstr *, + const struct qstr *, + enum bch_rename_mode); + +bool bch2_reinherit_attrs(struct bch_inode_unpacked *, + struct bch_inode_unpacked *); + +#endif /* _BCACHEFS_FS_COMMON_H */ diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c new file mode 100644 index 000000000..6b691b2b5 --- /dev/null +++ b/fs/bcachefs/fs-io.c @@ -0,0 +1,3982 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "clock.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" +#include "fs.h" +#include "fs-io.h" +#include "fsck.h" +#include "inode.h" +#include "journal.h" +#include "io.h" +#include "keylist.h" +#include "quota.h" +#include "reflink.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void bch2_clamp_data_hole(struct inode *, u64 *, u64 *, unsigned); + +struct folio_vec { + struct folio *fv_folio; + size_t fv_offset; + size_t fv_len; +}; + +static inline struct folio_vec biovec_to_foliovec(struct bio_vec bv) +{ + + struct folio *folio = page_folio(bv.bv_page); + size_t offset = (folio_page_idx(folio, bv.bv_page) << PAGE_SHIFT) + + bv.bv_offset; + size_t len = min_t(size_t, folio_size(folio) - offset, bv.bv_len); + + return (struct folio_vec) { + .fv_folio = folio, + .fv_offset = offset, + .fv_len = len, + }; +} + +static inline struct folio_vec bio_iter_iovec_folio(struct bio *bio, + struct bvec_iter iter) +{ + return biovec_to_foliovec(bio_iter_iovec(bio, iter)); +} + +#define __bio_for_each_folio(bvl, bio, iter, start) \ + for (iter = (start); \ + (iter).bi_size && \ + ((bvl = bio_iter_iovec_folio((bio), (iter))), 1); \ + bio_advance_iter_single((bio), &(iter), (bvl).fv_len)) + +/** + * bio_for_each_folio - iterate over folios within a bio + * + * Like other non-_all versions, this iterates over what bio->bi_iter currently + * points to. 
This version is for drivers, where the bio may have previously + * been split or cloned. + */ +#define bio_for_each_folio(bvl, bio, iter) \ + __bio_for_each_folio(bvl, bio, iter, (bio)->bi_iter) + +/* + * Use u64 for the end pos and sector helpers because if the folio covers the + * max supported range of the mapping, the start offset of the next folio + * overflows loff_t. This breaks much of the range based processing in the + * buffered write path. + */ +static inline u64 folio_end_pos(struct folio *folio) +{ + return folio_pos(folio) + folio_size(folio); +} + +static inline size_t folio_sectors(struct folio *folio) +{ + return PAGE_SECTORS << folio_order(folio); +} + +static inline loff_t folio_sector(struct folio *folio) +{ + return folio_pos(folio) >> 9; +} + +static inline u64 folio_end_sector(struct folio *folio) +{ + return folio_end_pos(folio) >> 9; +} + +typedef DARRAY(struct folio *) folios; + +static int filemap_get_contig_folios_d(struct address_space *mapping, + loff_t start, u64 end, + int fgp_flags, gfp_t gfp, + folios *folios) +{ + struct folio *f; + u64 pos = start; + int ret = 0; + + while (pos < end) { + if ((u64) pos >= (u64) start + (1ULL << 20)) + fgp_flags &= ~FGP_CREAT; + + ret = darray_make_room_gfp(folios, 1, gfp & GFP_KERNEL); + if (ret) + break; + + f = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp_flags, gfp); + if (IS_ERR_OR_NULL(f)) + break; + + BUG_ON(folios->nr && folio_pos(f) != pos); + + pos = folio_end_pos(f); + darray_push(folios, f); + } + + if (!folios->nr && !ret && (fgp_flags & FGP_CREAT)) + ret = -ENOMEM; + + return folios->nr ? 0 : ret; +} + +struct nocow_flush { + struct closure *cl; + struct bch_dev *ca; + struct bio bio; +}; + +static void nocow_flush_endio(struct bio *_bio) +{ + + struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); + + closure_put(bio->cl); + percpu_ref_put(&bio->ca->io_ref); + bio_put(&bio->bio); +} + +static void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, + struct bch_inode_info *inode, + struct closure *cl) +{ + struct nocow_flush *bio; + struct bch_dev *ca; + struct bch_devs_mask devs; + unsigned dev; + + dev = find_first_bit(inode->ei_devs_need_flush.d, BCH_SB_MEMBERS_MAX); + if (dev == BCH_SB_MEMBERS_MAX) + return; + + devs = inode->ei_devs_need_flush; + memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); + + for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { + rcu_read_lock(); + ca = rcu_dereference(c->devs[dev]); + if (ca && !percpu_ref_tryget(&ca->io_ref)) + ca = NULL; + rcu_read_unlock(); + + if (!ca) + continue; + + bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0, + REQ_OP_FLUSH, + GFP_KERNEL, + &c->nocow_flush_bioset), + struct nocow_flush, bio); + bio->cl = cl; + bio->ca = ca; + bio->bio.bi_end_io = nocow_flush_endio; + closure_bio_submit(&bio->bio, cl); + } +} + +static int bch2_inode_flush_nocow_writes(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct closure cl; + + closure_init_stack(&cl); + bch2_inode_flush_nocow_writes_async(c, inode, &cl); + closure_sync(&cl); + + return 0; +} + +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + +static inline struct address_space *faults_disabled_mapping(void) +{ + return (void *) (((unsigned long) current->faults_disabled_mapping) & ~1UL); +} + +static inline void set_fdm_dropped_locks(void) +{ + current->faults_disabled_mapping = + (void *) 
(((unsigned long) current->faults_disabled_mapping)|1); +} + +static inline bool fdm_dropped_locks(void) +{ + return ((unsigned long) current->faults_disabled_mapping) & 1; +} + +struct quota_res { + u64 sectors; +}; + +struct bch_writepage_io { + struct bch_inode_info *inode; + + /* must be last: */ + struct bch_write_op op; +}; + +struct dio_write { + struct kiocb *req; + struct address_space *mapping; + struct bch_inode_info *inode; + struct mm_struct *mm; + unsigned loop:1, + extending:1, + sync:1, + flush:1, + free_iov:1; + struct quota_res quota_res; + u64 written; + + struct iov_iter iter; + struct iovec inline_vecs[2]; + + /* must be last: */ + struct bch_write_op op; +}; + +struct dio_read { + struct closure cl; + struct kiocb *req; + long ret; + bool should_dirty; + struct bch_read_bio rbio; +}; + +/* pagecache_block must be held */ +static noinline int write_invalidate_inode_pages_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + int ret; + + /* + * XXX: the way this is currently implemented, we can spin if a process + * is continually redirtying a specific page + */ + do { + if (!mapping->nrpages) + return 0; + + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + break; + + if (!mapping->nrpages) + return 0; + + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, + end >> PAGE_SHIFT); + } while (ret == -EBUSY); + + return ret; +} + +/* quotas */ + +#ifdef CONFIG_BCACHEFS_QUOTA + +static void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + BUG_ON(res->sectors > inode->ei_quota_reserved); + + bch2_quota_acct(c, inode->ei_qid, Q_SPC, + -((s64) res->sectors), KEY_TYPE_QUOTA_PREALLOC); + inode->ei_quota_reserved -= res->sectors; + res->sectors = 0; +} + +static void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) +{ + if (res->sectors) { + mutex_lock(&inode->ei_quota_lock); + __bch2_quota_reservation_put(c, inode, res); + mutex_unlock(&inode->ei_quota_lock); + } +} + +static int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + u64 sectors, + bool check_enospc) +{ + int ret; + + if (test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags)) + return 0; + + mutex_lock(&inode->ei_quota_lock); + ret = bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, + check_enospc ? 
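
faults_disabled_mapping()/set_fdm_dropped_locks() above carry a "locks were dropped" flag in bit 0 of the saved address_space pointer, which works because the pointed-to structure is more than 2-byte aligned, so bit 0 of a valid pointer is always free. A user-space sketch of this pointer-tagging trick with an invented demo_mapping type:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct demo_mapping { int dummy; };

static void *demo_tag(void *p)    { return (void *)((uintptr_t)p | 1); }
static int   demo_tagged(void *p) { return (uintptr_t)p & 1; }
static void *demo_untag(void *p)  { return (void *)((uintptr_t)p & ~(uintptr_t)1); }

int main(void)
{
    struct demo_mapping m;
    void *p = &m;

    assert(!demo_tagged(p));
    p = demo_tag(p);            /* record the flag without any extra storage */
    assert(demo_tagged(p) && demo_untag(p) == (void *)&m);
    printf("flag carried in bit 0 of %p\n", demo_untag(p));
    return 0;
}
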
KEY_TYPE_QUOTA_PREALLOC : KEY_TYPE_QUOTA_NOCHECK); + if (likely(!ret)) { + inode->ei_quota_reserved += sectors; + res->sectors += sectors; + } + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} + +#else + +static void __bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} + +static void bch2_quota_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res) {} + +static int bch2_quota_reservation_add(struct bch_fs *c, + struct bch_inode_info *inode, + struct quota_res *res, + unsigned sectors, + bool check_enospc) +{ + return 0; +} + +#endif + +/* i_size updates: */ + +struct inode_new_size { + loff_t new_size; + u64 now; + unsigned fields; +}; + +static int inode_set_size(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct inode_new_size *s = p; + + bi->bi_size = s->new_size; + if (s->fields & ATTR_ATIME) + bi->bi_atime = s->now; + if (s->fields & ATTR_MTIME) + bi->bi_mtime = s->now; + if (s->fields & ATTR_CTIME) + bi->bi_ctime = s->now; + + return 0; +} + +int __must_check bch2_write_inode_size(struct bch_fs *c, + struct bch_inode_info *inode, + loff_t new_size, unsigned fields) +{ + struct inode_new_size s = { + .new_size = new_size, + .now = bch2_current_time(c), + .fields = fields, + }; + + return bch2_write_inode(c, inode, inode_set_size, &s, fields); +} + +static void __i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) +{ + bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, + "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); + inode->v.i_blocks += sectors; + +#ifdef CONFIG_BCACHEFS_QUOTA + if (quota_res && + !test_bit(EI_INODE_SNAPSHOT, &inode->ei_flags) && + sectors > 0) { + BUG_ON(sectors > quota_res->sectors); + BUG_ON(sectors > inode->ei_quota_reserved); + + quota_res->sectors -= sectors; + inode->ei_quota_reserved -= sectors; + } else { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, sectors, KEY_TYPE_QUOTA_WARN); + } +#endif +} + +static void i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, + struct quota_res *quota_res, s64 sectors) +{ + if (sectors) { + mutex_lock(&inode->ei_quota_lock); + __i_sectors_acct(c, inode, quota_res, sectors); + mutex_unlock(&inode->ei_quota_lock); + } +} + +/* page state: */ + +/* stored in page->private: */ + +#define BCH_FOLIO_SECTOR_STATE() \ + x(unallocated) \ + x(reserved) \ + x(dirty) \ + x(dirty_reserved) \ + x(allocated) + +enum bch_folio_sector_state { +#define x(n) SECTOR_##n, + BCH_FOLIO_SECTOR_STATE() +#undef x +}; + +static const char * const bch2_folio_sector_states[] = { +#define x(n) #n, + BCH_FOLIO_SECTOR_STATE() +#undef x + NULL +}; + +static inline enum bch_folio_sector_state +folio_sector_dirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_dirty; + case SECTOR_reserved: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_undirty(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_dirty: + return SECTOR_unallocated; + case SECTOR_dirty_reserved: + return SECTOR_reserved; + default: + return state; + } +} + +static inline enum bch_folio_sector_state +folio_sector_reserve(enum bch_folio_sector_state state) +{ + switch (state) { + case SECTOR_unallocated: + return SECTOR_reserved; + case 
SECTOR_dirty: + return SECTOR_dirty_reserved; + default: + return state; + } +} + +struct bch_folio_sector { + /* Uncompressed, fully allocated replicas (or on disk reservation): */ + unsigned nr_replicas:4; + + /* Owns PAGE_SECTORS * replicas_reserved sized in memory reservation: */ + unsigned replicas_reserved:4; + + /* i_sectors: */ + enum bch_folio_sector_state state:8; +}; + +struct bch_folio { + spinlock_t lock; + atomic_t write_count; + /* + * Is the sector state up to date with the btree? + * (Not the data itself) + */ + bool uptodate; + struct bch_folio_sector s[]; +}; + +static inline void folio_sector_set(struct folio *folio, + struct bch_folio *s, + unsigned i, unsigned n) +{ + s->s[i].state = n; +} + +/* file offset (to folio offset) to bch_folio_sector index */ +static inline int folio_pos_to_s(struct folio *folio, loff_t pos) +{ + u64 f_offset = pos - folio_pos(folio); + BUG_ON(pos < folio_pos(folio) || pos >= folio_end_pos(folio)); + return f_offset >> SECTOR_SHIFT; +} + +static inline struct bch_folio *__bch2_folio(struct folio *folio) +{ + return folio_has_private(folio) + ? (struct bch_folio *) folio_get_private(folio) + : NULL; +} + +static inline struct bch_folio *bch2_folio(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + + return __bch2_folio(folio); +} + +/* for newly allocated folios: */ +static void __bch2_folio_release(struct folio *folio) +{ + kfree(folio_detach_private(folio)); +} + +static void bch2_folio_release(struct folio *folio) +{ + EBUG_ON(!folio_test_locked(folio)); + __bch2_folio_release(folio); +} + +/* for newly allocated folios: */ +static struct bch_folio *__bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + struct bch_folio *s; + + s = kzalloc(sizeof(*s) + + sizeof(struct bch_folio_sector) * + folio_sectors(folio), gfp); + if (!s) + return NULL; + + spin_lock_init(&s->lock); + folio_attach_private(folio, s); + return s; +} + +static struct bch_folio *bch2_folio_create(struct folio *folio, gfp_t gfp) +{ + return bch2_folio(folio) ?: __bch2_folio_create(folio, gfp); +} + +static unsigned bkey_to_sector_state(struct bkey_s_c k) +{ + if (bkey_extent_is_reservation(k)) + return SECTOR_reserved; + if (bkey_extent_is_allocation(k.k)) + return SECTOR_allocated; + return SECTOR_unallocated; +} + +static void __bch2_folio_set(struct folio *folio, + unsigned pg_offset, unsigned pg_len, + unsigned nr_ptrs, unsigned state) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + BUG_ON(pg_offset >= sectors); + BUG_ON(pg_offset + pg_len > sectors); + + spin_lock(&s->lock); + + for (i = pg_offset; i < pg_offset + pg_len; i++) { + s->s[i].nr_replicas = nr_ptrs; + folio_sector_set(folio, s, i, state); + } + + if (i == sectors) + s->uptodate = true; + + spin_unlock(&s->lock); +} + +/* + * Initialize bch_folio state (allocated/unallocated, nr_replicas) from the + * extents btree: + */ +static int bch2_folio_set(struct bch_fs *c, subvol_inum inum, + struct folio **folios, unsigned nr_folios) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_folio *s; + u64 offset = folio_sector(folios[0]); + unsigned folio_idx; + u32 snapshot; + bool need_set = false; + int ret; + + for (folio_idx = 0; folio_idx < nr_folios; folio_idx++) { + s = bch2_folio_create(folios[folio_idx], GFP_KERNEL); + if (!s) + return -ENOMEM; + + need_set |= !s->uptodate; + } + + if (!need_set) + return 0; + + folio_idx = 0; + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = 
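
BCH_FOLIO_SECTOR_STATE() above is an x-macro: one list is expanded twice, once with x(n) producing the enum values and once producing the matching name strings, so the two definitions cannot drift apart. A standalone sketch of the same pattern with an invented demo list:

#include <stdio.h>

#define DEMO_SECTOR_STATE()     \
    x(unallocated)              \
    x(reserved)                 \
    x(dirty)                    \
    x(dirty_reserved)           \
    x(allocated)

enum demo_sector_state {
#define x(n)    DEMO_SECTOR_##n,
    DEMO_SECTOR_STATE()
#undef x
};

static const char * const demo_sector_states[] = {
#define x(n)    #n,
    DEMO_SECTOR_STATE()
#undef x
    NULL
};

int main(void)
{
    for (const char * const *s = demo_sector_states; *s; s++)
        printf("%s\n", *s);
    printf("dirty_reserved = %d\n", DEMO_SECTOR_dirty_reserved);
    return 0;
}
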
bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, ret) { + unsigned nr_ptrs = bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + while (folio_idx < nr_folios) { + struct folio *folio = folios[folio_idx]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(bkey_start_offset(k.k), folio_start) - folio_start; + unsigned folio_len = min(k.k->p.offset, folio_end) - folio_offset - folio_start; + + BUG_ON(k.k->p.offset < folio_start); + BUG_ON(bkey_start_offset(k.k) > folio_end); + + if (!bch2_folio(folio)->uptodate) + __bch2_folio_set(folio, folio_offset, folio_len, nr_ptrs, state); + + if (k.k->p.offset < folio_end) + break; + folio_idx++; + } + + if (folio_idx == nr_folios) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); + + return ret; +} + +static void bch2_bio_page_state_set(struct bio *bio, struct bkey_s_c k) +{ + struct bvec_iter iter; + struct folio_vec fv; + unsigned nr_ptrs = k.k->type == KEY_TYPE_reflink_v + ? 0 : bch2_bkey_nr_ptrs_fully_allocated(k); + unsigned state = bkey_to_sector_state(k); + + bio_for_each_folio(fv, bio, iter) + __bch2_folio_set(fv.fv_folio, + fv.fv_offset >> 9, + fv.fv_len >> 9, + nr_ptrs, state); +} + +static void mark_pagecache_unallocated(struct bch_inode_info *inode, + u64 start, u64 end) +{ + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + folio_len; j++) + s->s[j].nr_replicas = 0; + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } +} + +static void mark_pagecache_reserved(struct bch_inode_info *inode, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + pgoff_t index = start >> PAGE_SECTORS_SHIFT; + pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT; + struct folio_batch fbatch; + s64 i_sectors_delta = 0; + unsigned i, j; + + if (end <= start) + return; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(inode->v.i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + u64 folio_start = folio_sector(folio); + u64 folio_end = folio_end_sector(folio); + unsigned folio_offset = max(start, folio_start) - folio_start; + unsigned folio_len = min(end, folio_end) - folio_offset - folio_start; + struct bch_folio *s; + + BUG_ON(end <= folio_start); + + folio_lock(folio); + s = bch2_folio(folio); + + if (s) { + spin_lock(&s->lock); + for (j = folio_offset; j < folio_offset + 
folio_len; j++) { + i_sectors_delta -= s->s[j].state == SECTOR_dirty; + folio_sector_set(folio, s, j, folio_sector_reserve(s->s[j].state)); + } + spin_unlock(&s->lock); + } + + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + i_sectors_acct(c, inode, NULL, i_sectors_delta); +} + +static inline unsigned inode_nr_replicas(struct bch_fs *c, struct bch_inode_info *inode) +{ + /* XXX: this should not be open coded */ + return inode->ei_inode.bi_data_replicas + ? inode->ei_inode.bi_data_replicas - 1 + : c->opts.data_replicas; +} + +static inline unsigned sectors_to_reserve(struct bch_folio_sector *s, + unsigned nr_replicas) +{ + return max(0, (int) nr_replicas - + s->nr_replicas - + s->replicas_reserved); +} + +static int bch2_get_folio_disk_reservation(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, bool check_enospc) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned nr_replicas = inode_nr_replicas(c, inode); + struct disk_reservation disk_res = { 0 }; + unsigned i, sectors = folio_sectors(folio), disk_res_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + for (i = 0; i < sectors; i++) + disk_res_sectors += sectors_to_reserve(&s->s[i], nr_replicas); + + if (!disk_res_sectors) + return 0; + + ret = bch2_disk_reservation_get(c, &disk_res, + disk_res_sectors, 1, + !check_enospc + ? BCH_DISK_RESERVATION_NOFAIL + : 0); + if (unlikely(ret)) + return ret; + + for (i = 0; i < sectors; i++) + s->s[i].replicas_reserved += + sectors_to_reserve(&s->s[i], nr_replicas); + + return 0; +} + +struct bch2_folio_reservation { + struct disk_reservation disk; + struct quota_res quota; +}; + +static void bch2_folio_reservation_init(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + memset(res, 0, sizeof(*res)); + + res->disk.nr_replicas = inode_nr_replicas(c, inode); +} + +static void bch2_folio_reservation_put(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch2_folio_reservation *res) +{ + bch2_disk_reservation_put(c, &res->disk); + bch2_quota_reservation_put(c, inode, &res->quota); +} + +static int bch2_folio_reservation_get(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio_create(folio, 0); + unsigned i, disk_sectors = 0, quota_sectors = 0; + int ret; + + if (!s) + return -ENOMEM; + + BUG_ON(!s->uptodate); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + disk_sectors += sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + quota_sectors += s->s[i].state == SECTOR_unallocated; + } + + if (disk_sectors) { + ret = bch2_disk_reservation_add(c, &res->disk, disk_sectors, 0); + if (unlikely(ret)) + return ret; + } + + if (quota_sectors) { + ret = bch2_quota_reservation_add(c, inode, &res->quota, + quota_sectors, true); + if (unlikely(ret)) { + struct disk_reservation tmp = { + .sectors = disk_sectors + }; + + bch2_disk_reservation_put(c, &tmp); + res->disk.sectors -= disk_sectors; + return ret; + } + } + + return 0; +} + +static void bch2_clear_folio_bits(struct folio *folio) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_folio *s = bch2_folio(folio); + struct disk_reservation disk_res = { 0 }; + int i, sectors = folio_sectors(folio), dirty_sectors = 0; + + if (!s) + return; + + 
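
sectors_to_reserve() above computes, per sector, how many additional replicas still need a disk reservation: the desired replica count minus what is already allocated and what is already reserved, clamped at zero; bch2_get_folio_disk_reservation() then sums that over the folio. A small user-space sketch of just that arithmetic, with invented demo_* types:

#include <stdio.h>

struct demo_sector { unsigned nr_replicas, replicas_reserved; };

static unsigned demo_sectors_to_reserve(const struct demo_sector *s, unsigned want)
{
    int need = (int)want - (int)s->nr_replicas - (int)s->replicas_reserved;

    return need > 0 ? (unsigned)need : 0;   /* never reserve a negative amount */
}

int main(void)
{
    struct demo_sector sectors[] = {
        { .nr_replicas = 0, .replicas_reserved = 0 },   /* brand new: reserve everything */
        { .nr_replicas = 1, .replicas_reserved = 1 },   /* already covered by alloc + reservation */
        { .nr_replicas = 3, .replicas_reserved = 0 },   /* already replicated beyond the target */
    };
    unsigned want = 2, total = 0;

    for (unsigned i = 0; i < 3; i++)
        total += demo_sectors_to_reserve(&sectors[i], want);

    printf("sectors to reserve: %u\n", total);  /* 2 + 0 + 0 = 2 */
    return 0;
}
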
EBUG_ON(!folio_test_locked(folio)); + EBUG_ON(folio_test_writeback(folio)); + + for (i = 0; i < sectors; i++) { + disk_res.sectors += s->s[i].replicas_reserved; + s->s[i].replicas_reserved = 0; + + dirty_sectors -= s->s[i].state == SECTOR_dirty; + folio_sector_set(folio, s, i, folio_sector_undirty(s->s[i].state)); + } + + bch2_disk_reservation_put(c, &disk_res); + + i_sectors_acct(c, inode, NULL, dirty_sectors); + + bch2_folio_release(folio); +} + +static void bch2_set_folio_dirty(struct bch_fs *c, + struct bch_inode_info *inode, + struct folio *folio, + struct bch2_folio_reservation *res, + unsigned offset, unsigned len) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, dirty_sectors = 0; + + WARN_ON((u64) folio_pos(folio) + offset + len > + round_up((u64) i_size_read(&inode->v), block_bytes(c))); + + BUG_ON(!s->uptodate); + + spin_lock(&s->lock); + + for (i = round_down(offset, block_bytes(c)) >> 9; + i < round_up(offset + len, block_bytes(c)) >> 9; + i++) { + unsigned sectors = sectors_to_reserve(&s->s[i], + res->disk.nr_replicas); + + /* + * This can happen if we race with the error path in + * bch2_writepage_io_done(): + */ + sectors = min_t(unsigned, sectors, res->disk.sectors); + + s->s[i].replicas_reserved += sectors; + res->disk.sectors -= sectors; + + dirty_sectors += s->s[i].state == SECTOR_unallocated; + + folio_sector_set(folio, s, i, folio_sector_dirty(s->s[i].state)); + } + + spin_unlock(&s->lock); + + i_sectors_acct(c, inode, &res->quota, dirty_sectors); + + if (!folio_test_dirty(folio)) + filemap_dirty_folio(inode->v.i_mapping, folio); +} + +vm_fault_t bch2_page_fault(struct vm_fault *vmf) +{ + struct file *file = vmf->vma->vm_file; + struct address_space *mapping = file->f_mapping; + struct address_space *fdm = faults_disabled_mapping(); + struct bch_inode_info *inode = file_bch_inode(file); + vm_fault_t ret; + + if (fdm == mapping) + return VM_FAULT_SIGBUS; + + /* Lock ordering: */ + if (fdm > mapping) { + struct bch_inode_info *fdm_host = to_bch_ei(fdm->host); + + if (bch2_pagecache_add_tryget(inode)) + goto got_lock; + + bch2_pagecache_block_put(fdm_host); + + bch2_pagecache_add_get(inode); + bch2_pagecache_add_put(inode); + + bch2_pagecache_block_get(fdm_host); + + /* Signal that lock has been dropped: */ + set_fdm_dropped_locks(); + return VM_FAULT_SIGBUS; + } + + bch2_pagecache_add_get(inode); +got_lock: + ret = filemap_fault(vmf); + bch2_pagecache_add_put(inode); + + return ret; +} + +vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) +{ + struct folio *folio = page_folio(vmf->page); + struct file *file = vmf->vma->vm_file; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + unsigned len; + loff_t isize; + vm_fault_t ret; + + bch2_folio_reservation_init(c, inode, &res); + + sb_start_pagefault(inode->v.i_sb); + file_update_time(file); + + /* + * Not strictly necessary, but helps avoid dio writes livelocking in + * write_invalidate_inode_pages_range() - can drop this if/when we get + * a write_invalidate_inode_pages_range() that works without dropping + * page lock before invalidating page + */ + bch2_pagecache_add_get(inode); + + folio_lock(folio); + isize = i_size_read(&inode->v); + + if (folio->mapping != mapping || folio_pos(folio) >= isize) { + folio_unlock(folio); + ret = VM_FAULT_NOPAGE; + goto out; + } + + len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); + + if (bch2_folio_set(c, 
inode_inum(inode), &folio, 1) ?: + bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + folio_unlock(folio); + ret = VM_FAULT_SIGBUS; + goto out; + } + + bch2_set_folio_dirty(c, inode, folio, &res, 0, len); + bch2_folio_reservation_put(c, inode, &res); + + folio_wait_stable(folio); + ret = VM_FAULT_LOCKED; +out: + bch2_pagecache_add_put(inode); + sb_end_pagefault(inode->v.i_sb); + + return ret; +} + +void bch2_invalidate_folio(struct folio *folio, size_t offset, size_t length) +{ + if (offset || length < folio_size(folio)) + return; + + bch2_clear_folio_bits(folio); +} + +bool bch2_release_folio(struct folio *folio, gfp_t gfp_mask) +{ + if (folio_test_dirty(folio) || folio_test_writeback(folio)) + return false; + + bch2_clear_folio_bits(folio); + return true; +} + +/* readpage(s): */ + +static void bch2_readpages_end_io(struct bio *bio) +{ + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) { + if (!bio->bi_status) { + folio_mark_uptodate(fi.folio); + } else { + folio_clear_uptodate(fi.folio); + folio_set_error(fi.folio); + } + folio_unlock(fi.folio); + } + + bio_put(bio); +} + +struct readpages_iter { + struct address_space *mapping; + unsigned idx; + folios folios; +}; + +static int readpages_iter_init(struct readpages_iter *iter, + struct readahead_control *ractl) +{ + struct folio **fi; + int ret; + + memset(iter, 0, sizeof(*iter)); + + iter->mapping = ractl->mapping; + + ret = filemap_get_contig_folios_d(iter->mapping, + ractl->_index << PAGE_SHIFT, + (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT, + 0, mapping_gfp_mask(iter->mapping), + &iter->folios); + if (ret) + return ret; + + darray_for_each(iter->folios, fi) { + ractl->_nr_pages -= 1U << folio_order(*fi); + __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL); + folio_put(*fi); + folio_put(*fi); + } + + return 0; +} + +static inline struct folio *readpage_iter_peek(struct readpages_iter *iter) +{ + if (iter->idx >= iter->folios.nr) + return NULL; + return iter->folios.data[iter->idx]; +} + +static inline void readpage_iter_advance(struct readpages_iter *iter) +{ + iter->idx++; +} + +static bool extent_partial_reads_expensive(struct bkey_s_c k) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct bch_extent_crc_unpacked crc; + const union bch_extent_entry *i; + + bkey_for_each_crc(k.k, ptrs, crc, i) + if (crc.csum_type || crc.compression_type) + return true; + return false; +} + +static int readpage_bio_extend(struct btree_trans *trans, + struct readpages_iter *iter, + struct bio *bio, + unsigned sectors_this_extent, + bool get_more) +{ + /* Don't hold btree locks while allocating memory: */ + bch2_trans_unlock(trans); + + while (bio_sectors(bio) < sectors_this_extent && + bio->bi_vcnt < bio->bi_max_vecs) { + struct folio *folio = readpage_iter_peek(iter); + int ret; + + if (folio) { + readpage_iter_advance(iter); + } else { + pgoff_t folio_offset = bio_end_sector(bio) >> PAGE_SECTORS_SHIFT; + + if (!get_more) + break; + + folio = xa_load(&iter->mapping->i_pages, folio_offset); + if (folio && !xa_is_value(folio)) + break; + + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + if (!folio) + break; + + if (!__bch2_folio_create(folio, GFP_KERNEL)) { + folio_put(folio); + break; + } + + ret = filemap_add_folio(iter->mapping, folio, folio_offset, GFP_KERNEL); + if (ret) { + __bch2_folio_release(folio); + folio_put(folio); + break; + } + + folio_put(folio); + } + + BUG_ON(folio_sector(folio) != bio_end_sector(bio)); + + BUG_ON(!bio_add_folio(bio, folio, folio_size(folio), 0)); + } + + return 
bch2_trans_relock(trans); +} + +static void bchfs_read(struct btree_trans *trans, + struct bch_read_bio *rbio, + subvol_inum inum, + struct readpages_iter *readpages_iter) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_buf sk; + int flags = BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE; + u32 snapshot; + int ret = 0; + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + bch2_bkey_buf_init(&sk); +retry: + bch2_trans_begin(trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, rbio->bio.bi_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bkey_s_c k; + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, rbio->bio.bi_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + sectors = min(sectors, k.k->size - offset_into_extent); + + if (readpages_iter) { + ret = readpage_bio_extend(trans, readpages_iter, &rbio->bio, sectors, + extent_partial_reads_expensive(k)); + if (ret) + break; + } + + bytes = min(sectors, bio_sectors(&rbio->bio)) << 9; + swap(rbio->bio.bi_iter.bi_size, bytes); + + if (rbio->bio.bi_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + bch2_bio_page_state_set(&rbio->bio, k); + + bch2_read_extent(trans, rbio, iter.pos, + data_btree, k, offset_into_extent, flags); + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(rbio->bio.bi_iter.bi_size, bytes); + bio_advance(&rbio->bio, bytes); + + ret = btree_trans_too_many_iters(trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + iter.pos.inode, + iter.pos.offset << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bio_endio(&rbio->bio); + } + + bch2_bkey_buf_exit(&sk, c); +} + +void bch2_readahead(struct readahead_control *ractl) +{ + struct bch_inode_info *inode = to_bch_ei(ractl->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct btree_trans trans; + struct folio *folio; + struct readpages_iter readpages_iter; + int ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + ret = readpages_iter_init(&readpages_iter, ractl); + BUG_ON(ret); + + bch2_trans_init(&trans, c, 0, 0); + + bch2_pagecache_add_get(inode); + + while ((folio = readpage_iter_peek(&readpages_iter))) { + unsigned n = min_t(unsigned, + readpages_iter.folios.nr - + readpages_iter.idx, + BIO_MAX_VECS); + struct bch_read_bio *rbio = + rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, + GFP_KERNEL, &c->bio_read), + opts); + + readpage_iter_advance(&readpages_iter); + + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + rbio->bio.bi_end_io = bch2_readpages_end_io; + 
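+		/*
+		 * Descriptive note (editorial, inferred from the code below):
+		 * the bio starts with this folio; bchfs_read() may pull further
+		 * folios from readpages_iter via readpage_bio_extend() to cover
+		 * the rest of the extent being read.
+		 */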
BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bchfs_read(&trans, rbio, inode_inum(inode), + &readpages_iter); + bch2_trans_unlock(&trans); + } + + bch2_pagecache_add_put(inode); + + bch2_trans_exit(&trans); + darray_exit(&readpages_iter.folios); +} + +static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum, struct folio *folio) +{ + struct btree_trans trans; + + bch2_folio_create(folio, __GFP_NOFAIL); + + rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; + rbio->bio.bi_iter.bi_sector = folio_sector(folio); + BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); + + bch2_trans_init(&trans, c, 0, 0); + bchfs_read(&trans, rbio, inum, NULL); + bch2_trans_exit(&trans); +} + +static void bch2_read_single_folio_end_io(struct bio *bio) +{ + complete(bio->bi_private); +} + +static int bch2_read_single_folio(struct folio *folio, + struct address_space *mapping) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_read_bio *rbio; + struct bch_io_opts opts; + int ret; + DECLARE_COMPLETION_ONSTACK(done); + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), + opts); + rbio->bio.bi_private = &done; + rbio->bio.bi_end_io = bch2_read_single_folio_end_io; + + __bchfs_readfolio(c, rbio, inode_inum(inode), folio); + wait_for_completion(&done); + + ret = blk_status_to_errno(rbio->bio.bi_status); + bio_put(&rbio->bio); + + if (ret < 0) + return ret; + + folio_mark_uptodate(folio); + return 0; +} + +int bch2_read_folio(struct file *file, struct folio *folio) +{ + int ret; + + ret = bch2_read_single_folio(folio, folio->mapping); + folio_unlock(folio); + return bch2_err_class(ret); +} + +/* writepages: */ + +struct bch_writepage_state { + struct bch_writepage_io *io; + struct bch_io_opts opts; + struct bch_folio_sector *tmp; + unsigned tmp_sectors; +}; + +static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct bch_writepage_state ret = { 0 }; + + bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); + return ret; +} + +static void bch2_writepage_io_done(struct bch_write_op *op) +{ + struct bch_writepage_io *io = + container_of(op, struct bch_writepage_io, op); + struct bch_fs *c = io->op.c; + struct bio *bio = &io->op.wbio.bio; + struct folio_iter fi; + unsigned i; + + if (io->op.error) { + set_bit(EI_INODE_ERROR, &io->inode->ei_flags); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + folio_set_error(fi.folio); + mapping_set_error(fi.folio->mapping, -EIO); + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s; + + s = __bch2_folio(fi.folio); + spin_lock(&s->lock); + for (i = 0; i < folio_sectors(fi.folio); i++) + s->s[i].nr_replicas = 0; + spin_unlock(&s->lock); + } + } + + /* + * racing with fallocate can cause us to add fewer sectors than + * expected - but we shouldn't add more sectors than expected: + */ + WARN_ON_ONCE(io->op.i_sectors_delta > 0); + + /* + * (error (due to going RO) halfway through a page can screw that up + * slightly) + * XXX wtf? 
+ BUG_ON(io->op.op.i_sectors_delta >= PAGE_SECTORS); + */ + + /* + * PageWriteback is effectively our ref on the inode - fixup i_blocks + * before calling end_page_writeback: + */ + i_sectors_acct(c, io->inode, NULL, io->op.i_sectors_delta); + + bio_for_each_folio_all(fi, bio) { + struct bch_folio *s = __bch2_folio(fi.folio); + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(fi.folio); + } + + bio_put(&io->op.wbio.bio); +} + +static void bch2_writepage_do_io(struct bch_writepage_state *w) +{ + struct bch_writepage_io *io = w->io; + + w->io = NULL; + closure_call(&io->op.cl, bch2_write, NULL, NULL); +} + +/* + * Get a bch_writepage_io and add @page to it - appending to an existing one if + * possible, else allocating a new one: + */ +static void bch2_writepage_io_alloc(struct bch_fs *c, + struct writeback_control *wbc, + struct bch_writepage_state *w, + struct bch_inode_info *inode, + u64 sector, + unsigned nr_replicas) +{ + struct bch_write_op *op; + + w->io = container_of(bio_alloc_bioset(NULL, BIO_MAX_VECS, + REQ_OP_WRITE, + GFP_KERNEL, + &c->writepage_bioset), + struct bch_writepage_io, op.wbio.bio); + + w->io->inode = inode; + op = &w->io->op; + bch2_write_op_init(op, c, w->opts); + op->target = w->opts.foreground_target; + op->nr_replicas = nr_replicas; + op->res.nr_replicas = nr_replicas; + op->write_point = writepoint_hashed(inode->ei_last_dirtied); + op->subvol = inode->ei_subvol; + op->pos = POS(inode->v.i_ino, sector); + op->end_io = bch2_writepage_io_done; + op->devs_need_flush = &inode->ei_devs_need_flush; + op->wbio.bio.bi_iter.bi_sector = sector; + op->wbio.bio.bi_opf = wbc_to_write_flags(wbc); +} + +static int __bch2_writepage(struct folio *folio, + struct writeback_control *wbc, + void *data) +{ + struct bch_inode_info *inode = to_bch_ei(folio->mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_writepage_state *w = data; + struct bch_folio *s; + unsigned i, offset, f_sectors, nr_replicas_this_write = U32_MAX; + loff_t i_size = i_size_read(&inode->v); + int ret; + + EBUG_ON(!folio_test_uptodate(folio)); + + /* Is the folio fully inside i_size? */ + if (folio_end_pos(folio) <= i_size) + goto do_io; + + /* Is the folio fully outside i_size? (truncate in progress) */ + if (folio_pos(folio) >= i_size) { + folio_unlock(folio); + return 0; + } + + /* + * The folio straddles i_size. It must be zeroed out on each and every + * writepage invocation because it may be mmapped. "A file is mapped + * in multiples of the folio size. For a file that is not a multiple of + * the folio size, the remaining memory is zeroed when mapped, and + * writes to that region are not written out to the file." 
+ */ + folio_zero_segment(folio, + i_size - folio_pos(folio), + folio_size(folio)); +do_io: + f_sectors = folio_sectors(folio); + s = bch2_folio(folio); + + if (f_sectors > w->tmp_sectors) { + kfree(w->tmp); + w->tmp = kzalloc(sizeof(struct bch_folio_sector) * + f_sectors, __GFP_NOFAIL); + w->tmp_sectors = f_sectors; + } + + /* + * Things get really hairy with errors during writeback: + */ + ret = bch2_get_folio_disk_reservation(c, inode, folio, false); + BUG_ON(ret); + + /* Before unlocking the page, get copy of reservations: */ + spin_lock(&s->lock); + memcpy(w->tmp, s->s, sizeof(struct bch_folio_sector) * f_sectors); + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + nr_replicas_this_write = + min_t(unsigned, nr_replicas_this_write, + s->s[i].nr_replicas + + s->s[i].replicas_reserved); + } + + for (i = 0; i < f_sectors; i++) { + if (s->s[i].state < SECTOR_dirty) + continue; + + s->s[i].nr_replicas = w->opts.compression + ? 0 : nr_replicas_this_write; + + s->s[i].replicas_reserved = 0; + folio_sector_set(folio, s, i, SECTOR_allocated); + } + spin_unlock(&s->lock); + + BUG_ON(atomic_read(&s->write_count)); + atomic_set(&s->write_count, 1); + + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + + folio_unlock(folio); + + offset = 0; + while (1) { + unsigned sectors = 0, dirty_sectors = 0, reserved_sectors = 0; + u64 sector; + + while (offset < f_sectors && + w->tmp[offset].state < SECTOR_dirty) + offset++; + + if (offset == f_sectors) + break; + + while (offset + sectors < f_sectors && + w->tmp[offset + sectors].state >= SECTOR_dirty) { + reserved_sectors += w->tmp[offset + sectors].replicas_reserved; + dirty_sectors += w->tmp[offset + sectors].state == SECTOR_dirty; + sectors++; + } + BUG_ON(!sectors); + + sector = folio_sector(folio) + offset; + + if (w->io && + (w->io->op.res.nr_replicas != nr_replicas_this_write || + bio_full(&w->io->op.wbio.bio, sectors << 9) || + w->io->op.wbio.bio.bi_iter.bi_size + (sectors << 9) >= + (BIO_MAX_VECS * PAGE_SIZE) || + bio_end_sector(&w->io->op.wbio.bio) != sector)) + bch2_writepage_do_io(w); + + if (!w->io) + bch2_writepage_io_alloc(c, wbc, w, inode, sector, + nr_replicas_this_write); + + atomic_inc(&s->write_count); + + BUG_ON(inode != w->io->inode); + BUG_ON(!bio_add_folio(&w->io->op.wbio.bio, folio, + sectors << 9, offset << 9)); + + /* Check for writing past i_size: */ + WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) > + round_up(i_size, block_bytes(c)) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags), + "writing past i_size: %llu > %llu (unrounded %llu)\n", + bio_end_sector(&w->io->op.wbio.bio) << 9, + round_up(i_size, block_bytes(c)), + i_size); + + w->io->op.res.sectors += reserved_sectors; + w->io->op.i_sectors_delta -= dirty_sectors; + w->io->op.new_i_size = i_size; + + offset += sectors; + } + + if (atomic_dec_and_test(&s->write_count)) + folio_end_writeback(folio); + + return 0; +} + +int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + struct bch_fs *c = mapping->host->i_sb->s_fs_info; + struct bch_writepage_state w = + bch_writepage_state_init(c, to_bch_ei(mapping->host)); + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); + if (w.io) + bch2_writepage_do_io(&w); + blk_finish_plug(&plug); + kfree(w.tmp); + return bch2_err_class(ret); +} + +/* buffered writes: */ + +int bch2_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, + struct 
page **pagep, void **fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res; + struct folio *folio; + unsigned offset; + int ret = -ENOMEM; + + res = kmalloc(sizeof(*res), GFP_KERNEL); + if (!res) + return -ENOMEM; + + bch2_folio_reservation_init(c, inode, res); + *fsdata = res; + + bch2_pagecache_add_get(inode); + + folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, + FGP_LOCK|FGP_WRITE|FGP_CREAT|FGP_STABLE, + mapping_gfp_mask(mapping)); + if (IS_ERR_OR_NULL(folio)) + goto err_unlock; + + if (folio_test_uptodate(folio)) + goto out; + + offset = pos - folio_pos(folio); + len = min_t(size_t, len, folio_end_pos(folio) - pos); + + /* If we're writing entire folio, don't need to read it in first: */ + if (!offset && len == folio_size(folio)) + goto out; + + if (!offset && pos + len >= inode->v.i_size) { + folio_zero_segment(folio, len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } + + if (folio_pos(folio) >= inode->v.i_size) { + folio_zero_segments(folio, 0, offset, offset + len, folio_size(folio)); + flush_dcache_folio(folio); + goto out; + } +readpage: + ret = bch2_read_single_folio(folio, mapping); + if (ret) + goto err; +out: + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto err; + + ret = bch2_folio_reservation_get(c, inode, folio, res, offset, len); + if (ret) { + if (!folio_test_uptodate(folio)) { + /* + * If the folio hasn't been read in, we won't know if we + * actually need a reservation - we don't actually need + * to read here, we just need to check if the folio is + * fully backed by uncompressed data: + */ + goto readpage; + } + + goto err; + } + + *pagep = &folio->page; + return 0; +err: + folio_unlock(folio); + folio_put(folio); + *pagep = NULL; +err_unlock: + bch2_pagecache_add_put(inode); + kfree(res); + *fsdata = NULL; + return bch2_err_class(ret); +} + +int bch2_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct bch_inode_info *inode = to_bch_ei(mapping->host); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation *res = fsdata; + struct folio *folio = page_folio(page); + unsigned offset = pos - folio_pos(folio); + + lockdep_assert_held(&inode->v.i_rwsem); + BUG_ON(offset + copied > folio_size(folio)); + + if (unlikely(copied < len && !folio_test_uptodate(folio))) { + /* + * The folio needs to be read in, but that would destroy + * our partial write - simplest thing is to just force + * userspace to redo the write: + */ + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + copied = 0; + } + + spin_lock(&inode->v.i_lock); + if (pos + copied > inode->v.i_size) + i_size_write(&inode->v, pos + copied); + spin_unlock(&inode->v.i_lock); + + if (copied) { + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + + bch2_set_folio_dirty(c, inode, folio, res, offset, copied); + + inode->ei_last_dirtied = (unsigned long) current; + } + + folio_unlock(folio); + folio_put(folio); + bch2_pagecache_add_put(inode); + + bch2_folio_reservation_put(c, inode, res); + kfree(res); + + return copied; +} + +static noinline void folios_trunc(folios *folios, struct folio **fi) +{ + while (folios->data + folios->nr > fi) { + struct folio *f = darray_pop(folios); + + folio_unlock(f); + folio_put(f); + } +} + +static int __bch2_buffered_write(struct bch_inode_info *inode, + struct address_space 
*mapping, + struct iov_iter *iter, + loff_t pos, unsigned len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch2_folio_reservation res; + folios folios; + struct folio **fi, *f; + unsigned copied = 0, f_offset; + u64 end = pos + len, f_pos; + loff_t last_folio_pos = inode->v.i_size; + int ret = 0; + + BUG_ON(!len); + + bch2_folio_reservation_init(c, inode, &res); + darray_init(&folios); + + ret = filemap_get_contig_folios_d(mapping, pos, end, + FGP_LOCK|FGP_WRITE|FGP_STABLE|FGP_CREAT, + mapping_gfp_mask(mapping), + &folios); + if (ret) + goto out; + + BUG_ON(!folios.nr); + + f = darray_first(folios); + if (pos != folio_pos(f) && !folio_test_uptodate(f)) { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + + f = darray_last(folios); + end = min(end, folio_end_pos(f)); + last_folio_pos = folio_pos(f); + if (end != folio_end_pos(f) && !folio_test_uptodate(f)) { + if (end >= inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + } else { + ret = bch2_read_single_folio(f, mapping); + if (ret) + goto out; + } + } + + ret = bch2_folio_set(c, inode_inum(inode), folios.data, folios.nr); + if (ret) + goto out; + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + + /* + * XXX: per POSIX and fstests generic/275, on -ENOSPC we're + * supposed to write as much as we have disk space for. + * + * On failure here we should still write out a partial page if + * we aren't completely out of disk space - we don't do that + * yet: + */ + ret = bch2_folio_reservation_get(c, inode, f, &res, f_offset, f_len); + if (unlikely(ret)) { + folios_trunc(&folios, fi); + if (!folios.nr) + goto out; + + end = min(end, folio_end_pos(darray_last(folios))); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (mapping_writably_mapped(mapping)) + darray_for_each(folios, fi) + flush_dcache_folio(*fi); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + unsigned f_copied = copy_page_from_iter_atomic(&f->page, f_offset, f_len, iter); + + if (!f_copied) { + folios_trunc(&folios, fi); + break; + } + + if (!folio_test_uptodate(f) && + f_copied != folio_size(f) && + pos + copied + f_copied < inode->v.i_size) { + folio_zero_range(f, 0, folio_size(f)); + folios_trunc(&folios, fi); + break; + } + + flush_dcache_folio(f); + copied += f_copied; + + if (f_copied != f_len) { + folios_trunc(&folios, fi + 1); + break; + } + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + if (!copied) + goto out; + + end = pos + copied; + + spin_lock(&inode->v.i_lock); + if (end > inode->v.i_size) + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); + + f_pos = pos; + f_offset = pos - folio_pos(darray_first(folios)); + darray_for_each(folios, fi) { + struct folio *f = *fi; + u64 f_len = min(end, folio_end_pos(f)) - f_pos; + + if (!folio_test_uptodate(f)) + folio_mark_uptodate(f); + + bch2_set_folio_dirty(c, inode, f, &res, f_offset, f_len); + + f_pos = folio_end_pos(f); + f_offset = 0; + } + + inode->ei_last_dirtied = (unsigned long) current; +out: + darray_for_each(folios, fi) { + folio_unlock(*fi); + folio_put(*fi); + } + + /* + * If the last folio added to the mapping starts beyond current EOF, we + * performed a short write but left around at least one post-EOF folio. + * Clean up the mapping before we return. 
+ */ + if (last_folio_pos >= inode->v.i_size) + truncate_pagecache(&inode->v, inode->v.i_size); + + darray_exit(&folios); + bch2_folio_reservation_put(c, inode, &res); + + return copied ?: ret; +} + +static ssize_t bch2_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + loff_t pos = iocb->ki_pos; + ssize_t written = 0; + int ret = 0; + + bch2_pagecache_add_get(inode); + + do { + unsigned offset = pos & (PAGE_SIZE - 1); + unsigned bytes = iov_iter_count(iter); +again: + /* + * Bring in the user page that we will copy from _first_. + * Otherwise there's a nasty deadlock on copying from the + * same page as we're writing to, without it being marked + * up-to-date. + * + * Not only is this an optimisation, but it is also required + * to check that the address is actually valid, when atomic + * usercopies are used, below. + */ + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + bytes = min_t(unsigned long, iov_iter_count(iter), + PAGE_SIZE - offset); + + if (unlikely(fault_in_iov_iter_readable(iter, bytes))) { + ret = -EFAULT; + break; + } + } + + if (unlikely(fatal_signal_pending(current))) { + ret = -EINTR; + break; + } + + ret = __bch2_buffered_write(inode, mapping, iter, pos, bytes); + if (unlikely(ret < 0)) + break; + + cond_resched(); + + if (unlikely(ret == 0)) { + /* + * If we were unable to copy any data at all, we must + * fall back to a single segment length write. + * + * If we didn't fallback here, we could livelock + * because not all segments in the iov can be copied at + * once without a pagefault. + */ + bytes = min_t(unsigned long, PAGE_SIZE - offset, + iov_iter_single_seg_count(iter)); + goto again; + } + pos += ret; + written += ret; + ret = 0; + + balance_dirty_pages_ratelimited(mapping); + } while (iov_iter_count(iter)); + + bch2_pagecache_add_put(inode); + + return written ? 
written : ret; +} + +/* O_DIRECT reads */ + +static void bio_check_or_release(struct bio *bio, bool check_dirty) +{ + if (check_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static void bch2_dio_read_complete(struct closure *cl) +{ + struct dio_read *dio = container_of(cl, struct dio_read, cl); + + dio->req->ki_complete(dio->req, dio->ret); + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); +} + +static void bch2_direct_IO_read_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + + if (bio->bi_status) + dio->ret = blk_status_to_errno(bio->bi_status); + + closure_put(&dio->cl); +} + +static void bch2_direct_IO_read_split_endio(struct bio *bio) +{ + struct dio_read *dio = bio->bi_private; + bool should_dirty = dio->should_dirty; + + bch2_direct_IO_read_endio(bio); + bio_check_or_release(bio, should_dirty); +} + +static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_io_opts opts; + struct dio_read *dio; + struct bio *bio; + loff_t offset = req->ki_pos; + bool sync = is_sync_kiocb(req); + size_t shorten; + ssize_t ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + if ((offset|iter->count) & (block_bytes(c) - 1)) + return -EINVAL; + + ret = min_t(loff_t, iter->count, + max_t(loff_t, 0, i_size_read(&inode->v) - offset)); + + if (!ret) + return ret; + + shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c)); + iter->count -= shorten; + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->dio_read_bioset); + + bio->bi_end_io = bch2_direct_IO_read_endio; + + dio = container_of(bio, struct dio_read, rbio.bio); + closure_init(&dio->cl, NULL); + + /* + * this is a _really_ horrible hack just to avoid an atomic sub at the + * end: + */ + if (!sync) { + set_closure_fn(&dio->cl, bch2_dio_read_complete, NULL); + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER - + CLOSURE_RUNNING + + CLOSURE_DESTRUCTOR); + } else { + atomic_set(&dio->cl.remaining, + CLOSURE_REMAINING_INITIALIZER + 1); + } + + dio->req = req; + dio->ret = ret; + /* + * This is one of the sketchier things I've encountered: we have to skip + * the dirtying of requests that are internal from the kernel (i.e. from + * loopback), because we'll deadlock on page_lock. 
+ */ + dio->should_dirty = iter_is_iovec(iter); + + goto start; + while (iter->count) { + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_READ, + GFP_KERNEL, + &c->bio_read); + bio->bi_end_io = bch2_direct_IO_read_split_endio; +start: + bio->bi_opf = REQ_OP_READ|REQ_SYNC; + bio->bi_iter.bi_sector = offset >> 9; + bio->bi_private = dio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (ret < 0) { + /* XXX: fault inject this path */ + bio->bi_status = BLK_STS_RESOURCE; + bio_endio(bio); + break; + } + + offset += bio->bi_iter.bi_size; + + if (dio->should_dirty) + bio_set_pages_dirty(bio); + + if (iter->count) + closure_get(&dio->cl); + + bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + } + + iter->count += shorten; + + if (sync) { + closure_sync(&dio->cl); + closure_debug_destroy(&dio->cl); + ret = dio->ret; + bio_check_or_release(&dio->rbio.bio, dio->should_dirty); + return ret; + } else { + return -EIOCBQUEUED; + } +} + +ssize_t bch2_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + struct address_space *mapping = file->f_mapping; + size_t count = iov_iter_count(iter); + ssize_t ret; + + if (!count) + return 0; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + struct blk_plug plug; + + if (unlikely(mapping->nrpages)) { + ret = filemap_write_and_wait_range(mapping, + iocb->ki_pos, + iocb->ki_pos + count - 1); + if (ret < 0) + goto out; + } + + file_accessed(file); + + blk_start_plug(&plug); + ret = bch2_direct_IO_read(iocb, iter); + blk_finish_plug(&plug); + + if (ret >= 0) + iocb->ki_pos += ret; + } else { + bch2_pagecache_add_get(inode); + ret = generic_file_read_iter(iocb, iter); + bch2_pagecache_add_put(inode); + } +out: + return bch2_err_class(ret); +} + +/* O_DIRECT writes */ + +static bool bch2_check_range_allocated(struct bch_fs *c, subvol_inum inum, + u64 offset, u64 size, + unsigned nr_replicas, bool compressed) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u64 end = offset + size; + u32 snapshot; + bool ret = true; + int err; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + err = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (err) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inum.inum, offset, snapshot), + BTREE_ITER_SLOTS, k, err) { + if (bkey_ge(bkey_start_pos(k.k), POS(inum.inum, end))) + break; + + if (k.k->p.snapshot != snapshot || + nr_replicas > bch2_bkey_replicas(c, k) || + (!compressed && bch2_bkey_sectors_compressed(k))) { + ret = false; + break; + } + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(err, BCH_ERR_transaction_restart)) + goto retry; + bch2_trans_exit(&trans); + + return err ? 
false : ret; +} + +static noinline bool bch2_dio_write_check_allocated(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + return bch2_check_range_allocated(c, inode_inum(inode), + dio->op.pos.offset, bio_sectors(bio), + dio->op.opts.data_replicas, + dio->op.opts.compression != 0); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *); +static __always_inline long bch2_dio_write_done(struct dio_write *dio); + +/* + * We're going to return -EIOCBQUEUED, but we haven't finished consuming the + * iov_iter yet, so we need to stash a copy of the iovec: it might be on the + * caller's stack, we're not guaranteed that it will live for the duration of + * the IO: + */ +static noinline int bch2_dio_write_copy_iov(struct dio_write *dio) +{ + struct iovec *iov = dio->inline_vecs; + + /* + * iov_iter has a single embedded iovec - nothing to do: + */ + if (iter_is_ubuf(&dio->iter)) + return 0; + + /* + * We don't currently handle non-iovec iov_iters here - return an error, + * and we'll fall back to doing the IO synchronously: + */ + if (!iter_is_iovec(&dio->iter)) + return -1; + + if (dio->iter.nr_segs > ARRAY_SIZE(dio->inline_vecs)) { + iov = kmalloc_array(dio->iter.nr_segs, sizeof(*iov), + GFP_KERNEL); + if (unlikely(!iov)) + return -ENOMEM; + + dio->free_iov = true; + } + + memcpy(iov, dio->iter.__iov, dio->iter.nr_segs * sizeof(*iov)); + dio->iter.__iov = iov; + return 0; +} + +static void bch2_dio_write_flush_done(struct closure *cl) +{ + struct dio_write *dio = container_of(cl, struct dio_write, op.cl); + struct bch_fs *c = dio->op.c; + + closure_debug_destroy(cl); + + dio->op.error = bch2_journal_error(&c->journal); + + bch2_dio_write_done(dio); +} + +static noinline void bch2_dio_write_flush(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct bch_inode_unpacked inode; + int ret; + + dio->flush = 0; + + closure_init(&dio->op.cl, NULL); + + if (!dio->op.error) { + ret = bch2_inode_find_by_inum(c, inode_inum(dio->inode), &inode); + if (ret) { + dio->op.error = ret; + } else { + bch2_journal_flush_seq_async(&c->journal, inode.bi_journal_seq, &dio->op.cl); + bch2_inode_flush_nocow_writes_async(c, dio->inode, &dio->op.cl); + } + } + + if (dio->sync) { + closure_sync(&dio->op.cl); + closure_debug_destroy(&dio->op.cl); + } else { + continue_at(&dio->op.cl, bch2_dio_write_flush_done, NULL); + } +} + +static __always_inline long bch2_dio_write_done(struct dio_write *dio) +{ + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + bool sync = dio->sync; + long ret; + + if (unlikely(dio->flush)) { + bch2_dio_write_flush(dio); + if (!sync) + return -EIOCBQUEUED; + } + + bch2_pagecache_block_put(inode); + + if (dio->free_iov) + kfree(dio->iter.__iov); + + ret = dio->op.error ?: ((long) dio->written << 9); + bio_put(&dio->op.wbio.bio); + + /* inode->i_dio_count is our ref on inode and thus bch_fs */ + inode_dio_end(&inode->v); + + if (ret < 0) + ret = bch2_err_class(ret); + + if (!sync) { + req->ki_complete(req, ret); + ret = -EIOCBQUEUED; + } + return ret; +} + +static __always_inline void bch2_dio_write_end(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct bch_inode_info *inode = dio->inode; + struct bio *bio = &dio->op.wbio.bio; + + req->ki_pos += (u64) dio->op.written << 9; + dio->written += dio->op.written; + + if (dio->extending) { + spin_lock(&inode->v.i_lock); + if (req->ki_pos > inode->v.i_size) + 
i_size_write(&inode->v, req->ki_pos); + spin_unlock(&inode->v.i_lock); + } + + if (dio->op.i_sectors_delta || dio->quota_res.sectors) { + mutex_lock(&inode->ei_quota_lock); + __i_sectors_acct(c, inode, &dio->quota_res, dio->op.i_sectors_delta); + __bch2_quota_reservation_put(c, inode, &dio->quota_res); + mutex_unlock(&inode->ei_quota_lock); + } + + bio_release_pages(bio, false); + + if (unlikely(dio->op.error)) + set_bit(EI_INODE_ERROR, &inode->ei_flags); +} + +static __always_inline long bch2_dio_write_loop(struct dio_write *dio) +{ + struct bch_fs *c = dio->op.c; + struct kiocb *req = dio->req; + struct address_space *mapping = dio->mapping; + struct bch_inode_info *inode = dio->inode; + struct bch_io_opts opts; + struct bio *bio = &dio->op.wbio.bio; + unsigned unaligned, iter_count; + bool sync = dio->sync, dropped_locks; + long ret; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + + while (1) { + iter_count = dio->iter.count; + + EBUG_ON(current->faults_disabled_mapping); + current->faults_disabled_mapping = mapping; + + ret = bio_iov_iter_get_pages(bio, &dio->iter); + + dropped_locks = fdm_dropped_locks(); + + current->faults_disabled_mapping = NULL; + + /* + * If the fault handler returned an error but also signalled + * that it dropped & retook ei_pagecache_lock, we just need to + * re-shoot down the page cache and retry: + */ + if (dropped_locks && ret) + ret = 0; + + if (unlikely(ret < 0)) + goto err; + + if (unlikely(dropped_locks)) { + ret = write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter_count - 1); + if (unlikely(ret)) + goto err; + + if (!bio->bi_iter.bi_size) + continue; + } + + unaligned = bio->bi_iter.bi_size & (block_bytes(c) - 1); + bio->bi_iter.bi_size -= unaligned; + iov_iter_revert(&dio->iter, unaligned); + + if (!bio->bi_iter.bi_size) { + /* + * bio_iov_iter_get_pages was only able to get < + * blocksize worth of pages: + */ + ret = -EFAULT; + goto err; + } + + bch2_write_op_init(&dio->op, c, opts); + dio->op.end_io = sync + ? 
NULL + : bch2_dio_write_loop_async; + dio->op.target = dio->op.opts.foreground_target; + dio->op.write_point = writepoint_hashed((unsigned long) current); + dio->op.nr_replicas = dio->op.opts.data_replicas; + dio->op.subvol = inode->ei_subvol; + dio->op.pos = POS(inode->v.i_ino, (u64) req->ki_pos >> 9); + dio->op.devs_need_flush = &inode->ei_devs_need_flush; + + if (sync) + dio->op.flags |= BCH_WRITE_SYNC; + dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + + ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, + bio_sectors(bio), true); + if (unlikely(ret)) + goto err; + + ret = bch2_disk_reservation_get(c, &dio->op.res, bio_sectors(bio), + dio->op.opts.data_replicas, 0); + if (unlikely(ret) && + !bch2_dio_write_check_allocated(dio)) + goto err; + + task_io_account_write(bio->bi_iter.bi_size); + + if (unlikely(dio->iter.count) && + !dio->sync && + !dio->loop && + bch2_dio_write_copy_iov(dio)) + dio->sync = sync = true; + + dio->loop = true; + closure_call(&dio->op.cl, bch2_write, NULL, NULL); + + if (!sync) + return -EIOCBQUEUED; + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + break; + + bio_reset(bio, NULL, REQ_OP_WRITE); + } +out: + return bch2_dio_write_done(dio); +err: + dio->op.error = ret; + + bio_release_pages(bio, false); + + bch2_quota_reservation_put(c, inode, &dio->quota_res); + goto out; +} + +static noinline __cold void bch2_dio_write_continue(struct dio_write *dio) +{ + struct mm_struct *mm = dio->mm; + + bio_reset(&dio->op.wbio.bio, NULL, REQ_OP_WRITE); + + if (mm) + kthread_use_mm(mm); + bch2_dio_write_loop(dio); + if (mm) + kthread_unuse_mm(mm); +} + +static void bch2_dio_write_loop_async(struct bch_write_op *op) +{ + struct dio_write *dio = container_of(op, struct dio_write, op); + + bch2_dio_write_end(dio); + + if (likely(!dio->iter.count) || dio->op.error) + bch2_dio_write_done(dio); + else + bch2_dio_write_continue(dio); +} + +static noinline +ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) +{ + struct file *file = req->ki_filp; + struct address_space *mapping = file->f_mapping; + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct dio_write *dio; + struct bio *bio; + bool locked = true, extending; + ssize_t ret; + + prefetch(&c->opts); + prefetch((void *) &c->opts + 64); + prefetch(&inode->ei_inode); + prefetch((void *) &inode->ei_inode + 64); + + inode_lock(&inode->v); + + ret = generic_write_checks(req, iter); + if (unlikely(ret <= 0)) + goto err; + + ret = file_remove_privs(file); + if (unlikely(ret)) + goto err; + + ret = file_update_time(file); + if (unlikely(ret)) + goto err; + + if (unlikely((req->ki_pos|iter->count) & (block_bytes(c) - 1))) + goto err; + + inode_dio_begin(&inode->v); + bch2_pagecache_block_get(inode); + + extending = req->ki_pos + iter->count > inode->v.i_size; + if (!extending) { + inode_unlock(&inode->v); + locked = false; + } + + bio = bio_alloc_bioset(NULL, + bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), + REQ_OP_WRITE, + GFP_KERNEL, + &c->dio_write_bioset); + dio = container_of(bio, struct dio_write, op.wbio.bio); + dio->req = req; + dio->mapping = mapping; + dio->inode = inode; + dio->mm = current->mm; + dio->loop = false; + dio->extending = extending; + dio->sync = is_sync_kiocb(req) || extending; + dio->flush = iocb_is_dsync(req) && !c->opts.journal_flush_disabled; + dio->free_iov = false; + dio->quota_res.sectors = 0; + dio->written = 0; + dio->iter = *iter; + dio->op.c = c; + + if (unlikely(mapping->nrpages)) { + ret = 
write_invalidate_inode_pages_range(mapping, + req->ki_pos, + req->ki_pos + iter->count - 1); + if (unlikely(ret)) + goto err_put_bio; + } + + ret = bch2_dio_write_loop(dio); +err: + if (locked) + inode_unlock(&inode->v); + return ret; +err_put_bio: + bch2_pagecache_block_put(inode); + bio_put(bio); + inode_dio_end(&inode->v); + goto err; +} + +ssize_t bch2_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct bch_inode_info *inode = file_bch_inode(file); + ssize_t ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = bch2_direct_write(iocb, from); + goto out; + } + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = inode_to_bdi(&inode->v); + inode_lock(&inode->v); + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + goto unlock; + + ret = file_remove_privs(file); + if (ret) + goto unlock; + + ret = file_update_time(file); + if (ret) + goto unlock; + + ret = bch2_buffered_write(iocb, from); + if (likely(ret > 0)) + iocb->ki_pos += ret; +unlock: + inode_unlock(&inode->v); + current->backing_dev_info = NULL; + + if (ret > 0) + ret = generic_write_sync(iocb, ret); +out: + return bch2_err_class(ret); +} + +/* fsync: */ + +/* + * inode->ei_inode.bi_journal_seq won't be up to date since it's set in an + * insert trigger: look up the btree inode instead + */ +static int bch2_flush_inode(struct bch_fs *c, + struct bch_inode_info *inode) +{ + struct bch_inode_unpacked u; + int ret; + + if (c->opts.journal_flush_disabled) + return 0; + + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &u); + if (ret) + return ret; + + return bch2_journal_flush_seq(&c->journal, u.bi_journal_seq) ?: + bch2_inode_flush_nocow_writes(c, inode); +} + +int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret, ret2, ret3; + + ret = file_write_and_wait_range(file, start, end); + ret2 = sync_inode_metadata(&inode->v, 1); + ret3 = bch2_flush_inode(c, inode); + + return bch2_err_class(ret ?: ret2 ?: ret3); +} + +/* truncate: */ + +static inline int range_has_data(struct bch_fs *c, u32 subvol, + struct bpos start, + struct bpos end) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, subvol, &start.snapshot); + if (ret) + goto err; + + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, start, end, 0, k, ret) + if (bkey_extent_is_data(k.k) && !bkey_extent_is_unwritten(k)) { + ret = 1; + break; + } + start = iter.pos; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + return ret; +} + +static int __bch2_truncate_folio(struct bch_inode_info *inode, + pgoff_t index, loff_t start, loff_t end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + struct bch_folio *s; + unsigned start_offset = start & (PAGE_SIZE - 1); + unsigned end_offset = ((end - 1) & (PAGE_SIZE - 1)) + 1; + unsigned i; + struct folio *folio; + s64 i_sectors_delta = 0; + int ret = 0; + u64 end_pos; + + folio = filemap_lock_folio(mapping, index); + if (IS_ERR_OR_NULL(folio)) { + /* + * XXX: we're doing two index lookups when we end up reading the + * folio + */ + ret = range_has_data(c, inode->ei_subvol, + POS(inode->v.i_ino, 
(index << PAGE_SECTORS_SHIFT)), + POS(inode->v.i_ino, (index << PAGE_SECTORS_SHIFT) + PAGE_SECTORS)); + if (ret <= 0) + return ret; + + folio = __filemap_get_folio(mapping, index, + FGP_LOCK|FGP_CREAT, GFP_KERNEL); + if (unlikely(IS_ERR_OR_NULL(folio))) { + ret = -ENOMEM; + goto out; + } + } + + BUG_ON(start >= folio_end_pos(folio)); + BUG_ON(end <= folio_pos(folio)); + + start_offset = max(start, folio_pos(folio)) - folio_pos(folio); + end_offset = min_t(u64, end, folio_end_pos(folio)) - folio_pos(folio); + + /* Folio boundary? Nothing to do */ + if (start_offset == 0 && + end_offset == folio_size(folio)) { + ret = 0; + goto unlock; + } + + s = bch2_folio_create(folio, 0); + if (!s) { + ret = -ENOMEM; + goto unlock; + } + + if (!folio_test_uptodate(folio)) { + ret = bch2_read_single_folio(folio, mapping); + if (ret) + goto unlock; + } + + ret = bch2_folio_set(c, inode_inum(inode), &folio, 1); + if (ret) + goto unlock; + + for (i = round_up(start_offset, block_bytes(c)) >> 9; + i < round_down(end_offset, block_bytes(c)) >> 9; + i++) { + s->s[i].nr_replicas = 0; + + i_sectors_delta -= s->s[i].state == SECTOR_dirty; + folio_sector_set(folio, s, i, SECTOR_unallocated); + } + + i_sectors_acct(c, inode, NULL, i_sectors_delta); + + /* + * Caller needs to know whether this folio will be written out by + * writeback - doing an i_size update if necessary - or whether it will + * be responsible for the i_size update. + * + * Note that we shouldn't ever see a folio beyond EOF, but check and + * warn if so. This has been observed by failure to clean up folios + * after a short write and there's still a chance reclaim will fix + * things up. + */ + WARN_ON_ONCE(folio_pos(folio) >= inode->v.i_size); + end_pos = folio_end_pos(folio); + if (inode->v.i_size > folio_pos(folio)) + end_pos = min_t(u64, inode->v.i_size, end_pos); + ret = s->s[folio_pos_to_s(folio, end_pos - 1)].state >= SECTOR_dirty; + + folio_zero_segment(folio, start_offset, end_offset); + + /* + * Bit of a hack - we don't want truncate to fail due to -ENOSPC. + * + * XXX: because we aren't currently tracking whether the folio has actual + * data in it (vs. just 0s, or only partially written) this wrong. ick. 
+ */ + BUG_ON(bch2_get_folio_disk_reservation(c, inode, folio, false)); + + /* + * This removes any writeable userspace mappings; we need to force + * .page_mkwrite to be called again before any mmapped writes, to + * redirty the full page: + */ + folio_mkclean(folio); + filemap_dirty_folio(mapping, folio); +unlock: + folio_unlock(folio); + folio_put(folio); +out: + return ret; +} + +static int bch2_truncate_folio(struct bch_inode_info *inode, loff_t from) +{ + return __bch2_truncate_folio(inode, from >> PAGE_SHIFT, + from, ANYSINT_MAX(loff_t)); +} + +static int bch2_truncate_folios(struct bch_inode_info *inode, + loff_t start, loff_t end) +{ + int ret = __bch2_truncate_folio(inode, start >> PAGE_SHIFT, + start, end); + + if (ret >= 0 && + start >> PAGE_SHIFT != end >> PAGE_SHIFT) + ret = __bch2_truncate_folio(inode, + (end - 1) >> PAGE_SHIFT, + start, end); + return ret; +} + +static int bch2_extend(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct bch_inode_unpacked *inode_u, + struct iattr *iattr) +{ + struct address_space *mapping = inode->v.i_mapping; + int ret; + + /* + * sync appends: + * + * this has to be done _before_ extending i_size: + */ + ret = filemap_write_and_wait_range(mapping, inode_u->bi_size, S64_MAX); + if (ret) + return ret; + + truncate_setsize(&inode->v, iattr->ia_size); + + return bch2_setattr_nonsize(idmap, inode, iattr); +} + +static int bch2_truncate_finish_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + bi->bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; + return 0; +} + +static int bch2_truncate_start_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + u64 *new_i_size = p; + + bi->bi_flags |= BCH_INODE_I_SIZE_DIRTY; + bi->bi_size = *new_i_size; + return 0; +} + +int bch2_truncate(struct mnt_idmap *idmap, + struct bch_inode_info *inode, struct iattr *iattr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + struct bch_inode_unpacked inode_u; + u64 new_i_size = iattr->ia_size; + s64 i_sectors_delta = 0; + int ret = 0; + + /* + * If the truncate call with change the size of the file, the + * cmtimes should be updated. If the size will not change, we + * do not need to update the cmtimes. + */ + if (iattr->ia_size != inode->v.i_size) { + if (!(iattr->ia_valid & ATTR_MTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_mtime); + if (!(iattr->ia_valid & ATTR_CTIME)) + ktime_get_coarse_real_ts64(&iattr->ia_ctime); + iattr->ia_valid |= ATTR_MTIME|ATTR_CTIME; + } + + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(inode); + + ret = bch2_inode_find_by_inum(c, inode_inum(inode), &inode_u); + if (ret) + goto err; + + /* + * check this before next assertion; on filesystem error our normal + * invariants are a bit broken (truncate has to truncate the page cache + * before the inode). 
+ */ + ret = bch2_journal_error(&c->journal); + if (ret) + goto err; + + WARN_ONCE(!test_bit(EI_INODE_ERROR, &inode->ei_flags) && + inode->v.i_size < inode_u.bi_size, + "truncate spotted in mem i_size < btree i_size: %llu < %llu\n", + (u64) inode->v.i_size, inode_u.bi_size); + + if (iattr->ia_size > inode->v.i_size) { + ret = bch2_extend(idmap, inode, &inode_u, iattr); + goto err; + } + + iattr->ia_valid &= ~ATTR_SIZE; + + ret = bch2_truncate_folio(inode, iattr->ia_size); + if (unlikely(ret < 0)) + goto err; + + /* + * When extending, we're going to write the new i_size to disk + * immediately so we need to flush anything above the current on disk + * i_size first: + * + * Also, when extending we need to flush the page that i_size currently + * straddles - if it's mapped to userspace, we need to ensure that + * userspace has to redirty it and call .mkwrite -> set_page_dirty + * again to allocate the part of the page that was extended. + */ + if (iattr->ia_size > inode_u.bi_size) + ret = filemap_write_and_wait_range(mapping, + inode_u.bi_size, + iattr->ia_size - 1); + else if (iattr->ia_size & (PAGE_SIZE - 1)) + ret = filemap_write_and_wait_range(mapping, + round_down(iattr->ia_size, PAGE_SIZE), + iattr->ia_size - 1); + if (ret) + goto err; + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_truncate_start_fn, + &new_i_size, 0); + mutex_unlock(&inode->ei_update_lock); + + if (unlikely(ret)) + goto err; + + truncate_setsize(&inode->v, iattr->ia_size); + + ret = bch2_fpunch(c, inode_inum(inode), + round_up(iattr->ia_size, block_bytes(c)) >> 9, + U64_MAX, &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + + bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal), c, + "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, + inode->ei_inode.bi_sectors); + if (unlikely(ret)) + goto err; + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_truncate_finish_fn, NULL, 0); + mutex_unlock(&inode->ei_update_lock); + + ret = bch2_setattr_nonsize(idmap, inode, iattr); +err: + bch2_pagecache_block_put(inode); + return bch2_err_class(ret); +} + +/* fallocate: */ + +static int inode_update_times_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_mtime = bi->bi_ctime = bch2_current_time(c); + return 0; +} + +static long bchfs_fpunch(struct bch_inode_info *inode, loff_t offset, loff_t len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + u64 end = offset + len; + u64 block_start = round_up(offset, block_bytes(c)); + u64 block_end = round_down(end, block_bytes(c)); + bool truncated_last_page; + int ret = 0; + + ret = bch2_truncate_folios(inode, offset, end); + if (unlikely(ret < 0)) + goto err; + + truncated_last_page = ret; + + truncate_pagecache_range(&inode->v, offset, end - 1); + + if (block_start < block_end) { + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode_inum(inode), + block_start >> 9, block_end >> 9, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + } + + mutex_lock(&inode->ei_update_lock); + if (end >= inode->v.i_size && !truncated_last_page) { + ret = bch2_write_inode_size(c, inode, inode->v.i_size, + ATTR_MTIME|ATTR_CTIME); + } else { + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + } + mutex_unlock(&inode->ei_update_lock); +err: + return ret; +} + +static long 
bchfs_fcollapse_finsert(struct bch_inode_info *inode, + loff_t offset, loff_t len, + bool insert) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct address_space *mapping = inode->v.i_mapping; + struct bkey_buf copy; + struct btree_trans trans; + struct btree_iter src, dst, del; + loff_t shift, new_size; + u64 src_start; + int ret = 0; + + if ((offset | len) & (block_bytes(c) - 1)) + return -EINVAL; + + if (insert) { + if (inode->v.i_sb->s_maxbytes - inode->v.i_size < len) + return -EFBIG; + + if (offset >= inode->v.i_size) + return -EINVAL; + + src_start = U64_MAX; + shift = len; + } else { + if (offset + len >= inode->v.i_size) + return -EINVAL; + + src_start = offset + len; + shift = -len; + } + + new_size = inode->v.i_size + shift; + + ret = write_invalidate_inode_pages_range(mapping, offset, LLONG_MAX); + if (ret) + return ret; + + if (insert) { + i_size_write(&inode->v, new_size); + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + } else { + s64 i_sectors_delta = 0; + + ret = bch2_fpunch(c, inode_inum(inode), + offset >> 9, (offset + len) >> 9, + &i_sectors_delta); + i_sectors_acct(c, inode, NULL, i_sectors_delta); + + if (ret) + return ret; + } + + bch2_bkey_buf_init(©); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + bch2_trans_iter_init(&trans, &src, BTREE_ID_extents, + POS(inode->v.i_ino, src_start >> 9), + BTREE_ITER_INTENT); + bch2_trans_copy_iter(&dst, &src); + bch2_trans_copy_iter(&del, &src); + + while (ret == 0 || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + struct bkey_s_c k; + struct bpos next_pos; + struct bpos move_pos = POS(inode->v.i_ino, offset >> 9); + struct bpos atomic_end; + unsigned trigger_flags = 0; + u32 snapshot; + + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src, snapshot); + bch2_btree_iter_set_snapshot(&dst, snapshot); + bch2_btree_iter_set_snapshot(&del, snapshot); + + bch2_trans_begin(&trans); + + k = insert + ? bch2_btree_iter_peek_prev(&src) + : bch2_btree_iter_peek_upto(&src, POS(inode->v.i_ino, U64_MAX)); + if ((ret = bkey_err(k))) + continue; + + if (!k.k || k.k->p.inode != inode->v.i_ino) + break; + + if (insert && + bkey_le(k.k->p, POS(inode->v.i_ino, offset >> 9))) + break; +reassemble: + bch2_bkey_buf_reassemble(©, c, k); + + if (insert && + bkey_lt(bkey_start_pos(k.k), move_pos)) + bch2_cut_front(move_pos, copy.k); + + copy.k->k.p.offset += shift >> 9; + bch2_btree_iter_set_pos(&dst, bkey_start_pos(©.k->k)); + + ret = bch2_extent_atomic_end(&trans, &dst, copy.k, &atomic_end); + if (ret) + continue; + + if (!bkey_eq(atomic_end, copy.k->k.p)) { + if (insert) { + move_pos = atomic_end; + move_pos.offset -= shift >> 9; + goto reassemble; + } else { + bch2_cut_back(atomic_end, copy.k); + } + } + + bkey_init(&delete.k); + delete.k.p = copy.k->k.p; + delete.k.size = copy.k->k.size; + delete.k.p.offset -= shift >> 9; + bch2_btree_iter_set_pos(&del, bkey_start_pos(&delete.k)); + + next_pos = insert ? 
bkey_start_pos(&delete.k) : delete.k.p; + + if (copy.k->k.size != k.k->size) { + /* We might end up splitting compressed extents: */ + unsigned nr_ptrs = + bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(copy.k)); + + ret = bch2_disk_reservation_get(c, &disk_res, + copy.k->k.size, nr_ptrs, + BCH_DISK_RESERVATION_NOFAIL); + BUG_ON(ret); + } + + ret = bch2_btree_iter_traverse(&del) ?: + bch2_trans_update(&trans, &del, &delete, trigger_flags) ?: + bch2_trans_update(&trans, &dst, copy.k, trigger_flags) ?: + bch2_trans_commit(&trans, &disk_res, NULL, + BTREE_INSERT_NOFAIL); + bch2_disk_reservation_put(c, &disk_res); + + if (!ret) + bch2_btree_iter_set_pos(&src, next_pos); + } + bch2_trans_iter_exit(&trans, &del); + bch2_trans_iter_exit(&trans, &dst); + bch2_trans_iter_exit(&trans, &src); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(©, c); + + if (ret) + return ret; + + mutex_lock(&inode->ei_update_lock); + if (!insert) { + i_size_write(&inode->v, new_size); + ret = bch2_write_inode_size(c, inode, new_size, + ATTR_MTIME|ATTR_CTIME); + } else { + /* We need an inode update to update bi_journal_seq for fsync: */ + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_MTIME|ATTR_CTIME); + } + mutex_unlock(&inode->ei_update_lock); + return ret; +} + +static int __bchfs_fallocate(struct bch_inode_info *inode, int mode, + u64 start_sector, u64 end_sector) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter; + struct bpos end_pos = POS(inode->v.i_ino, end_sector); + struct bch_io_opts opts; + int ret = 0; + + bch2_inode_opts_get(&opts, c, &inode->ei_inode); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 512); + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inode->v.i_ino, start_sector), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + while (!ret && bkey_lt(iter.pos, end_pos)) { + s64 i_sectors_delta = 0; + struct quota_res quota_res = { 0 }; + struct bkey_s_c k; + unsigned sectors; + bool is_allocation; + u64 hole_start, hole_end; + u32 snapshot; + + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, + inode->ei_subvol, &snapshot); + if (ret) + goto bkey_err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_slot(&iter); + if ((ret = bkey_err(k))) + goto bkey_err; + + hole_start = iter.pos.offset; + hole_end = bpos_min(k.k->p, end_pos).offset; + is_allocation = bkey_extent_is_allocation(k.k); + + /* already reserved */ + if (bkey_extent_is_reservation(k) && + bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { + bch2_btree_iter_advance(&iter); + continue; + } + + if (bkey_extent_is_data(k.k) && + !(mode & FALLOC_FL_ZERO_RANGE)) { + bch2_btree_iter_advance(&iter); + continue; + } + + if (!(mode & FALLOC_FL_ZERO_RANGE)) { + ret = drop_locks_do(&trans, + (bch2_clamp_data_hole(&inode->v, + &hole_start, + &hole_end, + opts.data_replicas), 0)); + bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + + if (ret) + goto bkey_err; + + if (hole_start == hole_end) + continue; + } + + sectors = hole_end - hole_start; + + if (!is_allocation) { + ret = bch2_quota_reservation_add(c, inode, + "a_res, sectors, true); + if (unlikely(ret)) + goto bkey_err; + } + + ret = bch2_extent_fallocate(&trans, inode_inum(inode), &iter, + sectors, opts, &i_sectors_delta, + writepoint_hashed((unsigned long) current)); + if (ret) + goto bkey_err; + + i_sectors_acct(c, inode, "a_res, i_sectors_delta); + + drop_locks_do(&trans, + (mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 
0)); +bkey_err: + bch2_quota_reservation_put(c, inode, "a_res); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } + + if (bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE)) { + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + + bch2_fpunch_at(&trans, &iter, inode_inum(inode), + end_sector, &i_sectors_delta); + i_sectors_acct(c, inode, "a_res, i_sectors_delta); + bch2_quota_reservation_put(c, inode, "a_res); + } + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static long bchfs_fallocate(struct bch_inode_info *inode, int mode, + loff_t offset, loff_t len) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + u64 end = offset + len; + u64 block_start = round_down(offset, block_bytes(c)); + u64 block_end = round_up(end, block_bytes(c)); + bool truncated_last_page = false; + int ret, ret2 = 0; + + if (!(mode & FALLOC_FL_KEEP_SIZE) && end > inode->v.i_size) { + ret = inode_newsize_ok(&inode->v, end); + if (ret) + return ret; + } + + if (mode & FALLOC_FL_ZERO_RANGE) { + ret = bch2_truncate_folios(inode, offset, end); + if (unlikely(ret < 0)) + return ret; + + truncated_last_page = ret; + + truncate_pagecache_range(&inode->v, offset, end - 1); + + block_start = round_up(offset, block_bytes(c)); + block_end = round_down(end, block_bytes(c)); + } + + ret = __bchfs_fallocate(inode, mode, block_start >> 9, block_end >> 9); + + /* + * On -ENOSPC in ZERO_RANGE mode, we still want to do the inode update, + * so that the VFS cache i_size is consistent with the btree i_size: + */ + if (ret && + !(bch2_err_matches(ret, ENOSPC) && (mode & FALLOC_FL_ZERO_RANGE))) + return ret; + + if (mode & FALLOC_FL_KEEP_SIZE && end > inode->v.i_size) + end = inode->v.i_size; + + if (end >= inode->v.i_size && + (((mode & FALLOC_FL_ZERO_RANGE) && !truncated_last_page) || + !(mode & FALLOC_FL_KEEP_SIZE))) { + spin_lock(&inode->v.i_lock); + i_size_write(&inode->v, end); + spin_unlock(&inode->v.i_lock); + + mutex_lock(&inode->ei_update_lock); + ret2 = bch2_write_inode_size(c, inode, end, 0); + mutex_unlock(&inode->ei_update_lock); + } + + return ret ?: ret2; +} + +long bch2_fallocate_dispatch(struct file *file, int mode, + loff_t offset, loff_t len) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) + return -EROFS; + + inode_lock(&inode->v); + inode_dio_wait(&inode->v); + bch2_pagecache_block_get(inode); + + ret = file_modified(file); + if (ret) + goto err; + + if (!(mode & ~(FALLOC_FL_KEEP_SIZE|FALLOC_FL_ZERO_RANGE))) + ret = bchfs_fallocate(inode, mode, offset, len); + else if (mode == (FALLOC_FL_PUNCH_HOLE|FALLOC_FL_KEEP_SIZE)) + ret = bchfs_fpunch(inode, offset, len); + else if (mode == FALLOC_FL_INSERT_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, true); + else if (mode == FALLOC_FL_COLLAPSE_RANGE) + ret = bchfs_fcollapse_finsert(inode, offset, len, false); + else + ret = -EOPNOTSUPP; +err: + bch2_pagecache_block_put(inode); + inode_unlock(&inode->v); + bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); + + return bch2_err_class(ret); +} + +/* + * Take a quota reservation for unallocated blocks in a given file range + * Does not check pagecache + */ +static int quota_reserve_range(struct bch_inode_info *inode, + struct quota_res *res, + u64 start, u64 end) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + u32 snapshot; + u64 
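+	/*
+	 * Start by assuming the whole range needs quota, then walk the extents
+	 * below and subtract whatever is already allocated, so only genuinely
+	 * new blocks are charged.  E.g. (illustrative numbers) reserving for a
+	 * 128 sector range that already contains a 32 sector extent ends up
+	 * reserving 96 sectors.
+	 */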
sectors = end - start; + u64 pos = start; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, pos, snapshot), 0); + + while (!(ret = btree_trans_too_many_iters(&trans)) && + (k = bch2_btree_iter_peek_upto(&iter, POS(inode->v.i_ino, end - 1))).k && + !(ret = bkey_err(k))) { + if (bkey_extent_is_allocation(k.k)) { + u64 s = min(end, k.k->p.offset) - + max(start, bkey_start_offset(k.k)); + BUG_ON(s > sectors); + sectors -= s; + } + bch2_btree_iter_advance(&iter); + } + pos = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + + if (ret) + return ret; + + return bch2_quota_reservation_add(c, inode, res, sectors, true); +} + +loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src, + struct file *file_dst, loff_t pos_dst, + loff_t len, unsigned remap_flags) +{ + struct bch_inode_info *src = file_bch_inode(file_src); + struct bch_inode_info *dst = file_bch_inode(file_dst); + struct bch_fs *c = src->v.i_sb->s_fs_info; + struct quota_res quota_res = { 0 }; + s64 i_sectors_delta = 0; + u64 aligned_len; + loff_t ret = 0; + + if (remap_flags & ~(REMAP_FILE_DEDUP|REMAP_FILE_ADVISORY)) + return -EINVAL; + + if (remap_flags & REMAP_FILE_DEDUP) + return -EOPNOTSUPP; + + if ((pos_src & (block_bytes(c) - 1)) || + (pos_dst & (block_bytes(c) - 1))) + return -EINVAL; + + if (src == dst && + abs(pos_src - pos_dst) < len) + return -EINVAL; + + bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + inode_dio_wait(&src->v); + inode_dio_wait(&dst->v); + + ret = generic_remap_file_range_prep(file_src, pos_src, + file_dst, pos_dst, + &len, remap_flags); + if (ret < 0 || len == 0) + goto err; + + aligned_len = round_up((u64) len, block_bytes(c)); + + ret = write_invalidate_inode_pages_range(dst->v.i_mapping, + pos_dst, pos_dst + len - 1); + if (ret) + goto err; + + ret = quota_reserve_range(dst, "a_res, pos_dst >> 9, + (pos_dst + aligned_len) >> 9); + if (ret) + goto err; + + file_update_time(file_dst); + + mark_pagecache_unallocated(src, pos_src >> 9, + (pos_src + aligned_len) >> 9); + + ret = bch2_remap_range(c, + inode_inum(dst), pos_dst >> 9, + inode_inum(src), pos_src >> 9, + aligned_len >> 9, + pos_dst + len, &i_sectors_delta); + if (ret < 0) + goto err; + + /* + * due to alignment, we might have remapped slightly more than requsted + */ + ret = min((u64) ret << 9, (u64) len); + + i_sectors_acct(c, dst, "a_res, i_sectors_delta); + + spin_lock(&dst->v.i_lock); + if (pos_dst + ret > dst->v.i_size) + i_size_write(&dst->v, pos_dst + ret); + spin_unlock(&dst->v.i_lock); + + if ((file_dst->f_flags & (__O_SYNC | O_DSYNC)) || + IS_SYNC(file_inode(file_dst))) + ret = bch2_flush_inode(c, dst); +err: + bch2_quota_reservation_put(c, dst, "a_res); + bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst); + + return bch2_err_class(ret); +} + +/* fseek: */ + +static int folio_data_offset(struct folio *folio, loff_t pos, + unsigned min_replicas) +{ + struct bch_folio *s = bch2_folio(folio); + unsigned i, sectors = folio_sectors(folio); + + if (s) + for (i = folio_pos_to_s(folio, pos); i < sectors; i++) + if (s->s[i].state >= SECTOR_dirty && + s->s[i].nr_replicas + s->s[i].replicas_reserved >= min_replicas) + return i << SECTOR_SHIFT; + + return -1; +} + +static loff_t 
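+/*
+ * SEEK_DATA cannot rely on the extents btree alone: freshly written data may
+ * still be dirty in the pagecache with no extent backing it yet.  This scans
+ * folios in [start_offset, end_offset) for the first sector at or above
+ * SECTOR_dirty (with at least min_replicas replicas) and returns end_offset if
+ * none is found; bch2_seek_data() below uses it to check whether cached data
+ * starts before the next on-disk extent.
+ */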
bch2_seek_pagecache_data(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas) +{ + struct folio_batch fbatch; + pgoff_t start_index = start_offset >> PAGE_SHIFT; + pgoff_t end_index = end_offset >> PAGE_SHIFT; + pgoff_t index = start_index; + unsigned i; + loff_t ret; + int offset; + + folio_batch_init(&fbatch); + + while (filemap_get_folios(vinode->i_mapping, + &index, end_index, &fbatch)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + struct folio *folio = fbatch.folios[i]; + + folio_lock(folio); + offset = folio_data_offset(folio, + max(folio_pos(folio), start_offset), + min_replicas); + if (offset >= 0) { + ret = clamp(folio_pos(folio) + offset, + start_offset, end_offset); + folio_unlock(folio); + folio_batch_release(&fbatch); + return ret; + } + folio_unlock(folio); + } + folio_batch_release(&fbatch); + cond_resched(); + } + + return end_offset; +} + +static loff_t bch2_seek_data(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); + u64 isize, next_data = MAX_LFS_FILESIZE; + u32 snapshot; + int ret; + + isize = i_size_read(&inode->v); + if (offset >= isize) + return -ENXIO; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), + POS(inode->v.i_ino, U64_MAX), + 0, k, ret) { + if (bkey_extent_is_data(k.k)) { + next_data = max(offset, bkey_start_offset(k.k) << 9); + break; + } else if (k.k->p.offset >> 9 > isize) + break; + } + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + if (ret) + return ret; + + if (next_data > offset) + next_data = bch2_seek_pagecache_data(&inode->v, + offset, next_data, 0); + + if (next_data >= isize) + return -ENXIO; + + return vfs_setpos(file, next_data, MAX_LFS_FILESIZE); +} + +static bool folio_hole_offset(struct address_space *mapping, loff_t *offset, + unsigned min_replicas) +{ + struct folio *folio; + struct bch_folio *s; + unsigned i, sectors; + bool ret = true; + + folio = filemap_lock_folio(mapping, *offset >> PAGE_SHIFT); + if (IS_ERR_OR_NULL(folio)) + return true; + + s = bch2_folio(folio); + if (!s) + goto unlock; + + sectors = folio_sectors(folio); + for (i = folio_pos_to_s(folio, *offset); i < sectors; i++) + if (s->s[i].state < SECTOR_dirty || + s->s[i].nr_replicas + s->s[i].replicas_reserved < min_replicas) { + *offset = max(*offset, + folio_pos(folio) + (i << SECTOR_SHIFT)); + goto unlock; + } + + *offset = folio_end_pos(folio); + ret = false; +unlock: + folio_unlock(folio); + return ret; +} + +static loff_t bch2_seek_pagecache_hole(struct inode *vinode, + loff_t start_offset, + loff_t end_offset, + unsigned min_replicas) +{ + struct address_space *mapping = vinode->i_mapping; + loff_t offset = start_offset; + + while (offset < end_offset && + !folio_hole_offset(mapping, &offset, min_replicas)) + ; + + return min(offset, end_offset); +} + +static void bch2_clamp_data_hole(struct inode *inode, + u64 *hole_start, + u64 *hole_end, + unsigned min_replicas) +{ + *hole_start = bch2_seek_pagecache_hole(inode, + *hole_start << 9, *hole_end << 9, min_replicas) >> 9; + + if (*hole_start == 
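+	/*
+	 * Working in sectors (bytes >> 9): hole_start was just advanced past
+	 * any data already present in the pagecache; if that consumed the
+	 * whole candidate range, this check bails out early.  Otherwise
+	 * hole_end is pulled back to the next cached data so that fallocate
+	 * only touches ranges with no data anywhere.
+	 */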
*hole_end) + return; + + *hole_end = bch2_seek_pagecache_data(inode, + *hole_start << 9, *hole_end << 9, min_replicas) >> 9; +} + +static loff_t bch2_seek_hole(struct file *file, u64 offset) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + subvol_inum inum = inode_inum(inode); + u64 isize, next_hole = MAX_LFS_FILESIZE; + u32 snapshot; + int ret; + + isize = i_size_read(&inode->v); + if (offset >= isize) + return -ENXIO; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_norestart(&trans, iter, BTREE_ID_extents, + SPOS(inode->v.i_ino, offset >> 9, snapshot), + BTREE_ITER_SLOTS, k, ret) { + if (k.k->p.inode != inode->v.i_ino) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + offset, MAX_LFS_FILESIZE, 0); + break; + } else if (!bkey_extent_is_data(k.k)) { + next_hole = bch2_seek_pagecache_hole(&inode->v, + max(offset, bkey_start_offset(k.k) << 9), + k.k->p.offset << 9, 0); + + if (next_hole < k.k->p.offset << 9) + break; + } else { + offset = max(offset, bkey_start_offset(k.k) << 9); + } + } + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + if (ret) + return ret; + + if (next_hole > isize) + next_hole = isize; + + return vfs_setpos(file, next_hole, MAX_LFS_FILESIZE); +} + +loff_t bch2_llseek(struct file *file, loff_t offset, int whence) +{ + loff_t ret; + + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + case SEEK_END: + ret = generic_file_llseek(file, offset, whence); + break; + case SEEK_DATA: + ret = bch2_seek_data(file, offset); + break; + case SEEK_HOLE: + ret = bch2_seek_hole(file, offset); + break; + default: + ret = -EINVAL; + break; + } + + return bch2_err_class(ret); +} + +void bch2_fs_fsio_exit(struct bch_fs *c) +{ + bioset_exit(&c->nocow_flush_bioset); + bioset_exit(&c->dio_write_bioset); + bioset_exit(&c->dio_read_bioset); + bioset_exit(&c->writepage_bioset); +} + +int bch2_fs_fsio_init(struct bch_fs *c) +{ + if (bioset_init(&c->writepage_bioset, + 4, offsetof(struct bch_writepage_io, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_writepage_bioset_init; + + if (bioset_init(&c->dio_read_bioset, + 4, offsetof(struct dio_read, rbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_read_bioset_init; + + if (bioset_init(&c->dio_write_bioset, + 4, offsetof(struct dio_write, op.wbio.bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_dio_write_bioset_init; + + if (bioset_init(&c->nocow_flush_bioset, + 1, offsetof(struct nocow_flush, bio), 0)) + return -BCH_ERR_ENOMEM_nocow_flush_bioset_init; + + return 0; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-io.h b/fs/bcachefs/fs-io.h new file mode 100644 index 000000000..af9053315 --- /dev/null +++ b/fs/bcachefs/fs-io.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IO_H +#define _BCACHEFS_FS_IO_H + +#ifndef NO_BCACHEFS_FS + +#include "buckets.h" +#include "io_types.h" + +#include + +struct quota_res; + +int __must_check bch2_write_inode_size(struct bch_fs *, + struct bch_inode_info *, + loff_t, unsigned); + +int bch2_read_folio(struct file *, struct folio *); + +int bch2_writepages(struct address_space *, struct writeback_control *); +void bch2_readahead(struct readahead_control *); + +int 
bch2_write_begin(struct file *, struct address_space *, loff_t, + unsigned, struct page **, void **); +int bch2_write_end(struct file *, struct address_space *, loff_t, + unsigned, unsigned, struct page *, void *); + +ssize_t bch2_read_iter(struct kiocb *, struct iov_iter *); +ssize_t bch2_write_iter(struct kiocb *, struct iov_iter *); + +int bch2_fsync(struct file *, loff_t, loff_t, int); + +int bch2_truncate(struct mnt_idmap *, + struct bch_inode_info *, struct iattr *); +long bch2_fallocate_dispatch(struct file *, int, loff_t, loff_t); + +loff_t bch2_remap_file_range(struct file *, loff_t, struct file *, + loff_t, loff_t, unsigned); + +loff_t bch2_llseek(struct file *, loff_t, int); + +vm_fault_t bch2_page_fault(struct vm_fault *); +vm_fault_t bch2_page_mkwrite(struct vm_fault *); +void bch2_invalidate_folio(struct folio *, size_t, size_t); +bool bch2_release_folio(struct folio *, gfp_t); + +void bch2_fs_fsio_exit(struct bch_fs *); +int bch2_fs_fsio_init(struct bch_fs *); +#else +static inline void bch2_fs_fsio_exit(struct bch_fs *c) {} +static inline int bch2_fs_fsio_init(struct bch_fs *c) { return 0; } +#endif + +#endif /* _BCACHEFS_FS_IO_H */ diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c new file mode 100644 index 000000000..dfa1bf73c --- /dev/null +++ b/fs/bcachefs/fs-ioctl.c @@ -0,0 +1,556 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "chardev.h" +#include "dirent.h" +#include "fs.h" +#include "fs-common.h" +#include "fs-ioctl.h" +#include "quota.h" + +#include +#include +#include +#include +#include +#include + +#define FS_IOC_GOINGDOWN _IOR('X', 125, __u32) +#define FSOP_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +struct flags_set { + unsigned mask; + unsigned flags; + + unsigned projid; + + bool set_projinherit; + bool projinherit; +}; + +static int bch2_inode_flags_set(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + /* + * We're relying on btree locking here for exclusion with other ioctl + * calls - use the flags in the btree (@bi), not inode->i_flags: + */ + struct flags_set *s = p; + unsigned newflags = s->flags; + unsigned oldflags = bi->bi_flags & s->mask; + + if (((newflags ^ oldflags) & (BCH_INODE_APPEND|BCH_INODE_IMMUTABLE)) && + !capable(CAP_LINUX_IMMUTABLE)) + return -EPERM; + + if (!S_ISREG(bi->bi_mode) && + !S_ISDIR(bi->bi_mode) && + (newflags & (BCH_INODE_NODUMP|BCH_INODE_NOATIME)) != newflags) + return -EINVAL; + + if (s->set_projinherit) { + bi->bi_fields_set &= ~(1 << Inode_opt_project); + bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); + } + + bi->bi_flags &= ~s->mask; + bi->bi_flags |= newflags; + + bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); + return 0; +} + +static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) +{ + unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); + + return put_user(flags, arg); +} + +static int bch2_ioc_setflags(struct bch_fs *c, + struct file *file, + struct bch_inode_info *inode, + void __user *arg) +{ + struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; + unsigned uflags; + int ret; + + if (get_user(uflags, (int __user *) arg)) + return -EFAULT; + + s.flags = map_flags_rev(bch_flags_to_uflags, uflags); + if (uflags) + return -EOPNOTSUPP; + + ret = 
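+	/*
+	 * map_flags_rev() cleared every FS_*_FL bit it understands from
+	 * uflags; anything still set is unsupported and was rejected with
+	 * -EOPNOTSUPP above.  A rough userspace sketch of this path
+	 * (illustrative only):
+	 *
+	 *	unsigned flags = FS_APPEND_FL;
+	 *	ioctl(fd, FS_IOC_SETFLAGS, &flags);
+	 *
+	 * Write access to the mount is taken below and dropped again at
+	 * setflags_out.
+	 */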
mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(&inode->v); + if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { + ret = -EACCES; + goto setflags_out; + } + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, bch2_inode_flags_set, &s, + ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + +setflags_out: + inode_unlock(&inode->v); + mnt_drop_write_file(file); + return ret; +} + +static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, + struct fsxattr __user *arg) +{ + struct fsxattr fa = { 0 }; + + fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); + + if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) + fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; + + fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; + + return copy_to_user(arg, &fa, sizeof(fa)); +} + +static int fssetxattr_inode_update_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct flags_set *s = p; + + if (s->projid != bi->bi_project) { + bi->bi_fields_set |= 1U << Inode_opt_project; + bi->bi_project = s->projid; + } + + return bch2_inode_flags_set(inode, bi, p); +} + +static int bch2_ioc_fssetxattr(struct bch_fs *c, + struct file *file, + struct bch_inode_info *inode, + struct fsxattr __user *arg) +{ + struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; + struct fsxattr fa; + int ret; + + if (copy_from_user(&fa, arg, sizeof(fa))) + return -EFAULT; + + s.set_projinherit = true; + s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; + fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; + + s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); + if (fa.fsx_xflags) + return -EOPNOTSUPP; + + if (fa.fsx_projid >= U32_MAX) + return -EINVAL; + + /* + * inode fields accessible via the xattr interface are stored with a +1 + * bias, so that 0 means unset: + */ + s.projid = fa.fsx_projid + 1; + + ret = mnt_want_write_file(file); + if (ret) + return ret; + + inode_lock(&inode->v); + if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { + ret = -EACCES; + goto err; + } + + mutex_lock(&inode->ei_update_lock); + ret = bch2_set_projid(c, inode, fa.fsx_projid); + if (ret) + goto err_unlock; + + ret = bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ATTR_CTIME); +err_unlock: + mutex_unlock(&inode->ei_update_lock); +err: + inode_unlock(&inode->v); + mnt_drop_write_file(file); + return ret; +} + +static int bch2_reinherit_attrs_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_inode_info *dir = p; + + return !bch2_reinherit_attrs(bi, &dir->ei_inode); +} + +static int bch2_ioc_reinherit_attrs(struct bch_fs *c, + struct file *file, + struct bch_inode_info *src, + const char __user *name) +{ + struct bch_hash_info hash = bch2_hash_info_init(c, &src->ei_inode); + struct bch_inode_info *dst; + struct inode *vinode = NULL; + char *kname = NULL; + struct qstr qstr; + int ret = 0; + subvol_inum inum; + + kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + if (!kname) + return -ENOMEM; + + ret = strncpy_from_user(kname, name, BCH_NAME_MAX); + if (unlikely(ret < 0)) + goto err1; + + qstr.len = ret; + qstr.name = kname; + + ret = bch2_dirent_lookup(c, inode_inum(src), &hash, &qstr, &inum); + if (ret) + goto err1; + + vinode = bch2_vfs_inode_get(c, inum); + ret = PTR_ERR_OR_ZERO(vinode); + if (ret) + goto err1; + + dst = to_bch_ei(vinode); + + ret = mnt_want_write_file(file); + if (ret) + goto err2; + + bch2_lock_inodes(INODE_UPDATE_LOCK, src, dst); + + if 
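+	/*
+	 * If the child's project id differs from the directory's, project
+	 * quota is transferred first with KEY_TYPE_QUOTA_PREALLOC so the
+	 * operation fails cleanly when the new project would go over quota;
+	 * only then are the inherited options copied via
+	 * bch2_reinherit_attrs_fn.
+	 */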
(inode_attr_changing(src, dst, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst, + src->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err3; + } + + ret = bch2_write_inode(c, dst, bch2_reinherit_attrs_fn, src, 0); +err3: + bch2_unlock_inodes(INODE_UPDATE_LOCK, src, dst); + + /* return true if we did work */ + if (ret >= 0) + ret = !ret; + + mnt_drop_write_file(file); +err2: + iput(vinode); +err1: + kfree(kname); + + return ret; +} + +static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) +{ + u32 flags; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, arg)) + return -EFAULT; + + bch_notice(c, "shutdown by ioctl type %u", flags); + + down_write(&c->vfs_sb->s_umount); + + switch (flags) { + case FSOP_GOING_FLAGS_DEFAULT: + ret = freeze_bdev(c->vfs_sb->s_bdev); + if (ret) + goto err; + + bch2_journal_flush(&c->journal); + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + thaw_bdev(c->vfs_sb->s_bdev); + break; + + case FSOP_GOING_FLAGS_LOGFLUSH: + bch2_journal_flush(&c->journal); + fallthrough; + + case FSOP_GOING_FLAGS_NOLOGFLUSH: + c->vfs_sb->s_flags |= SB_RDONLY; + bch2_fs_emergency_read_only(c); + break; + default: + ret = -EINVAL; + break; + } +err: + up_write(&c->vfs_sb->s_umount); + return ret; +} + +static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct inode *dir; + struct bch_inode_info *inode; + struct user_namespace *s_user_ns; + struct dentry *dst_dentry; + struct path src_path, dst_path; + int how = LOOKUP_FOLLOW; + int error; + subvol_inum snapshot_src = { 0 }; + unsigned lookup_flags = 0; + unsigned create_flags = BCH_CREATE_SUBVOL; + + if (arg.flags & ~(BCH_SUBVOL_SNAPSHOT_CREATE| + BCH_SUBVOL_SNAPSHOT_RO)) + return -EINVAL; + + if (!(arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + (arg.src_ptr || + (arg.flags & BCH_SUBVOL_SNAPSHOT_RO))) + return -EINVAL; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + create_flags |= BCH_CREATE_SNAPSHOT; + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO) + create_flags |= BCH_CREATE_SNAPSHOT_RO; + + /* why do we need this lock? 
*/ + down_read(&c->vfs_sb->s_umount); + + if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) + sync_inodes_sb(c->vfs_sb); +retry: + if (arg.src_ptr) { + error = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.src_ptr, + how, &src_path); + if (error) + goto err1; + + if (src_path.dentry->d_sb->s_fs_info != c) { + path_put(&src_path); + error = -EXDEV; + goto err1; + } + + snapshot_src = inode_inum(to_bch_ei(src_path.dentry->d_inode)); + } + + dst_dentry = user_path_create(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + &dst_path, lookup_flags); + error = PTR_ERR_OR_ZERO(dst_dentry); + if (error) + goto err2; + + if (dst_dentry->d_sb->s_fs_info != c) { + error = -EXDEV; + goto err3; + } + + if (dst_dentry->d_inode) { + error = -EEXIST; + goto err3; + } + + dir = dst_path.dentry->d_inode; + if (IS_DEADDIR(dir)) { + error = -BCH_ERR_ENOENT_directory_dead; + goto err3; + } + + s_user_ns = dir->i_sb->s_user_ns; + if (!kuid_has_mapping(s_user_ns, current_fsuid()) || + !kgid_has_mapping(s_user_ns, current_fsgid())) { + error = -EOVERFLOW; + goto err3; + } + + error = inode_permission(file_mnt_idmap(filp), + dir, MAY_WRITE | MAY_EXEC); + if (error) + goto err3; + + if (!IS_POSIXACL(dir)) + arg.mode &= ~current_umask(); + + error = security_path_mkdir(&dst_path, dst_dentry, arg.mode); + if (error) + goto err3; + + if ((arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) && + !arg.src_ptr) + snapshot_src.subvol = to_bch_ei(dir)->ei_inode.bi_subvol; + + inode = __bch2_create(file_mnt_idmap(filp), to_bch_ei(dir), + dst_dentry, arg.mode|S_IFDIR, + 0, snapshot_src, create_flags); + error = PTR_ERR_OR_ZERO(inode); + if (error) + goto err3; + + d_instantiate(dst_dentry, &inode->v); + fsnotify_mkdir(dir, dst_dentry); +err3: + done_path_create(&dst_path, dst_dentry); +err2: + if (arg.src_ptr) + path_put(&src_path); + + if (retry_estale(error, lookup_flags)) { + lookup_flags |= LOOKUP_REVAL; + goto retry; + } +err1: + up_read(&c->vfs_sb->s_umount); + + return error; +} + +static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, + struct bch_ioctl_subvolume arg) +{ + struct path path; + struct inode *dir; + int ret = 0; + + if (arg.flags) + return -EINVAL; + + ret = user_path_at(arg.dirfd, + (const char __user *)(unsigned long)arg.dst_ptr, + LOOKUP_FOLLOW, &path); + if (ret) + return ret; + + if (path.dentry->d_sb->s_fs_info != c) { + ret = -EXDEV; + goto err; + } + + dir = path.dentry->d_parent->d_inode; + + ret = __bch2_unlink(dir, path.dentry, true); + if (ret) + goto err; + + fsnotify_rmdir(dir, path.dentry); + d_delete(path.dentry); +err: + path_put(&path); + return ret; +} + +long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + long ret; + + switch (cmd) { + case FS_IOC_GETFLAGS: + ret = bch2_ioc_getflags(inode, (int __user *) arg); + break; + + case FS_IOC_SETFLAGS: + ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); + break; + + case FS_IOC_FSGETXATTR: + ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); + break; + + case FS_IOC_FSSETXATTR: + ret = bch2_ioc_fssetxattr(c, file, inode, + (void __user *) arg); + break; + + case BCHFS_IOC_REINHERIT_ATTRS: + ret = bch2_ioc_reinherit_attrs(c, file, inode, + (void __user *) arg); + break; + + case FS_IOC_GETVERSION: + ret = -ENOTTY; + break; + + case FS_IOC_SETVERSION: + ret = -ENOTTY; + break; + + case FS_IOC_GOINGDOWN: + ret = bch2_ioc_goingdown(c, (u32 __user *) arg); + break; + 
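+	/*
+	 * The subvolume ioctls below take a struct bch_ioctl_subvolume copied
+	 * in from userspace.  A rough sketch of creating a subvolume
+	 * (illustrative only; fd is any open file or directory on the
+	 * filesystem, dst_ptr points at the path to create):
+	 *
+	 *	struct bch_ioctl_subvolume arg = {
+	 *		.dirfd   = AT_FDCWD,
+	 *		.mode    = 0755,
+	 *		.dst_ptr = (__u64)(unsigned long) "my-subvol",
+	 *	};
+	 *	ioctl(fd, BCH_IOCTL_SUBVOLUME_CREATE, &arg);
+	 *
+	 * Setting BCH_SUBVOL_SNAPSHOT_CREATE and pointing src_ptr at an
+	 * existing subvolume creates a snapshot of it instead.
+	 */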
+ case BCH_IOCTL_SUBVOLUME_CREATE: { + struct bch_ioctl_subvolume i; + + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? -EFAULT + : bch2_ioctl_subvolume_create(c, file, i); + break; + } + + case BCH_IOCTL_SUBVOLUME_DESTROY: { + struct bch_ioctl_subvolume i; + + ret = copy_from_user(&i, (void __user *) arg, sizeof(i)) + ? -EFAULT + : bch2_ioctl_subvolume_destroy(c, file, i); + break; + } + + default: + ret = bch2_fs_ioctl(c, cmd, (void __user *) arg); + break; + } + + return bch2_err_class(ret); +} + +#ifdef CONFIG_COMPAT +long bch2_compat_fs_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + /* These are just misnamed, they actually get/put from/to user an int */ + switch (cmd) { + case FS_IOC_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return -ENOIOCTLCMD; + } + return bch2_fs_file_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); +} +#endif + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h new file mode 100644 index 000000000..f201980ef --- /dev/null +++ b/fs/bcachefs/fs-ioctl.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_IOCTL_H +#define _BCACHEFS_FS_IOCTL_H + +/* Inode flags: */ + +/* bcachefs inode flags -> vfs inode flags: */ +static const unsigned bch_flags_to_vfs[] = { + [__BCH_INODE_SYNC] = S_SYNC, + [__BCH_INODE_IMMUTABLE] = S_IMMUTABLE, + [__BCH_INODE_APPEND] = S_APPEND, + [__BCH_INODE_NOATIME] = S_NOATIME, +}; + +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_SYNC] = FS_SYNC_FL, + [__BCH_INODE_IMMUTABLE] = FS_IMMUTABLE_FL, + [__BCH_INODE_APPEND] = FS_APPEND_FL, + [__BCH_INODE_NODUMP] = FS_NODUMP_FL, + [__BCH_INODE_NOATIME] = FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_SYNC] = FS_XFLAG_SYNC, + [__BCH_INODE_IMMUTABLE] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_APPEND] = FS_XFLAG_APPEND, + [__BCH_INODE_NODUMP] = FS_XFLAG_NODUMP, + [__BCH_INODE_NOATIME] = FS_XFLAG_NOATIME, + //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; +}; + +#define set_flags(_map, _in, _out) \ +do { \ + unsigned _i; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & (1 << _i)) \ + (_out) |= _map[_i]; \ + else \ + (_out) &= ~_map[_i]; \ +} while (0) + +#define map_flags(_map, _in) \ +({ \ + unsigned _out = 0; \ + \ + set_flags(_map, _in, _out); \ + _out; \ +}) + +#define map_flags_rev(_map, _in) \ +({ \ + unsigned _i, _out = 0; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & _map[_i]) { \ + (_out) |= 1 << _i; \ + (_in) &= ~_map[_i]; \ + } \ + (_out); \ +}) + +#define map_defined(_map) \ +({ \ + unsigned _in = ~0; \ + \ + map_flags_rev(_map, _in); \ +}) + +/* Set VFS inode flags from bcachefs inode: */ +static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) +{ + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); +} + +long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); +long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); + +#endif /* _BCACHEFS_FS_IOCTL_H */ diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c new file mode 100644 index 000000000..8d2f388b4 --- /dev/null +++ b/fs/bcachefs/fs.c @@ -0,0 +1,1943 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef NO_BCACHEFS_FS + +#include "bcachefs.h" +#include "acl.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include 
"chardev.h" +#include "dirent.h" +#include "errcode.h" +#include "extents.h" +#include "fs.h" +#include "fs-common.h" +#include "fs-io.h" +#include "fs-ioctl.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "quota.h" +#include "super.h" +#include "xattr.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *bch2_inode_cache; + +static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, + struct bch_inode_info *, + struct bch_inode_unpacked *, + struct bch_subvolume *); + +void bch2_inode_update_after_write(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + unsigned fields) +{ + struct bch_fs *c = trans->c; + + BUG_ON(bi->bi_inum != inode->v.i_ino); + + bch2_assert_pos_locked(trans, BTREE_ID_inodes, + POS(0, bi->bi_inum), + c->opts.inodes_use_key_cache); + + set_nlink(&inode->v, bch2_inode_nlink_get(bi)); + i_uid_write(&inode->v, bi->bi_uid); + i_gid_write(&inode->v, bi->bi_gid); + inode->v.i_mode = bi->bi_mode; + + if (fields & ATTR_ATIME) + inode->v.i_atime = bch2_time_to_timespec(c, bi->bi_atime); + if (fields & ATTR_MTIME) + inode->v.i_mtime = bch2_time_to_timespec(c, bi->bi_mtime); + if (fields & ATTR_CTIME) + inode->v.i_ctime = bch2_time_to_timespec(c, bi->bi_ctime); + + inode->ei_inode = *bi; + + bch2_inode_flags_to_vfs(inode); +} + +int __must_check bch2_write_inode(struct bch_fs *c, + struct bch_inode_info *inode, + inode_set_fn set, + void *p, unsigned fields) +{ + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bch_inode_unpacked inode_u; + int ret; + + bch2_trans_init(&trans, c, 0, 512); +retry: + bch2_trans_begin(&trans); + + ret = bch2_inode_peek(&trans, &iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT) ?: + (set ? set(inode, &inode_u, p) : 0) ?: + bch2_inode_write(&trans, &iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, BTREE_INSERT_NOFAIL); + + /* + * the btree node lock protects inode->ei_inode, not ei_update_lock; + * this is important for inode updates via bchfs_write_index_update + */ + if (!ret) + bch2_inode_update_after_write(&trans, inode, &inode_u, fields); + + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, + "inode %u:%llu not found when updating", + inode_inum(inode).subvol, + inode_inum(inode).inum); + + bch2_trans_exit(&trans); + return ret < 0 ? 
ret : 0; +} + +int bch2_fs_quota_transfer(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_qid new_qid, + unsigned qtypes, + enum quota_acct_mode mode) +{ + unsigned i; + int ret; + + qtypes &= enabled_qtypes(c); + + for (i = 0; i < QTYP_NR; i++) + if (new_qid.q[i] == inode->ei_qid.q[i]) + qtypes &= ~(1U << i); + + if (!qtypes) + return 0; + + mutex_lock(&inode->ei_quota_lock); + + ret = bch2_quota_transfer(c, qtypes, new_qid, + inode->ei_qid, + inode->v.i_blocks + + inode->ei_quota_reserved, + mode); + if (!ret) + for (i = 0; i < QTYP_NR; i++) + if (qtypes & (1 << i)) + inode->ei_qid.q[i] = new_qid.q[i]; + + mutex_unlock(&inode->ei_quota_lock); + + return ret; +} + +static int bch2_iget5_test(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + return inode->ei_subvol == inum->subvol && + inode->ei_inode.bi_inum == inum->inum; +} + +static int bch2_iget5_set(struct inode *vinode, void *p) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + subvol_inum *inum = p; + + inode->v.i_ino = inum->inum; + inode->ei_subvol = inum->subvol; + inode->ei_inode.bi_inum = inum->inum; + return 0; +} + +static unsigned bch2_inode_hash(subvol_inum inum) +{ + return jhash_3words(inum.subvol, inum.inum >> 32, inum.inum, JHASH_INITVAL); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *c, subvol_inum inum) +{ + struct bch_inode_unpacked inode_u; + struct bch_inode_info *inode; + struct btree_trans trans; + struct bch_subvolume subvol; + int ret; + + inode = to_bch_ei(iget5_locked(c->vfs_sb, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + if (unlikely(!inode)) + return ERR_PTR(-ENOMEM); + if (!(inode->v.i_state & I_NEW)) + return &inode->v; + + bch2_trans_init(&trans, c, 8, 0); + ret = lockrestart_do(&trans, + bch2_subvolume_get(&trans, inum.subvol, true, 0, &subvol) ?: + bch2_inode_find_by_inum_trans(&trans, inum, &inode_u)); + + if (!ret) + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); + bch2_trans_exit(&trans); + + if (ret) { + iget_failed(&inode->v); + return ERR_PTR(ret); + } + + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + + unlock_new_inode(&inode->v); + + return &inode->v; +} + +struct bch_inode_info * +__bch2_create(struct mnt_idmap *idmap, + struct bch_inode_info *dir, struct dentry *dentry, + umode_t mode, dev_t rdev, subvol_inum snapshot_src, + unsigned flags) +{ + struct bch_fs *c = dir->v.i_sb->s_fs_info; + struct btree_trans trans; + struct bch_inode_unpacked dir_u; + struct bch_inode_info *inode, *old; + struct bch_inode_unpacked inode_u; + struct posix_acl *default_acl = NULL, *acl = NULL; + subvol_inum inum; + struct bch_subvolume subvol; + u64 journal_seq = 0; + int ret; + + /* + * preallocate acls + vfs inode before btree transaction, so that + * nothing can fail after the transaction succeeds: + */ +#ifdef CONFIG_BCACHEFS_POSIX_ACL + ret = posix_acl_create(&dir->v, &mode, &default_acl, &acl); + if (ret) + return ERR_PTR(ret); +#endif + inode = to_bch_ei(new_inode(c->vfs_sb)); + if (unlikely(!inode)) { + inode = ERR_PTR(-ENOMEM); + goto err; + } + + bch2_inode_init_early(c, &inode_u); + + if (!(flags & BCH_CREATE_TMPFILE)) + mutex_lock(&dir->ei_update_lock); + + bch2_trans_init(&trans, c, 8, + 2048 + (!(flags & BCH_CREATE_TMPFILE) + ? 
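+	/*
+	 * The extra transaction memory reserved here covers the dirent that
+	 * bch2_create_trans() will insert; tmpfiles create no dirent, so they
+	 * only get the base reservation.
+	 */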
dentry->d_name.len : 0)); +retry: + bch2_trans_begin(&trans); + + ret = bch2_create_trans(&trans, + inode_inum(dir), &dir_u, &inode_u, + !(flags & BCH_CREATE_TMPFILE) + ? &dentry->d_name : NULL, + from_kuid(i_user_ns(&dir->v), current_fsuid()), + from_kgid(i_user_ns(&dir->v), current_fsgid()), + mode, rdev, + default_acl, acl, snapshot_src, flags) ?: + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, 1, + KEY_TYPE_QUOTA_PREALLOC); + if (unlikely(ret)) + goto err_before_quota; + + inum.subvol = inode_u.bi_subvol ?: dir->ei_subvol; + inum.inum = inode_u.bi_inum; + + ret = bch2_subvolume_get(&trans, inum.subvol, true, + BTREE_ITER_WITH_UPDATES, &subvol) ?: + bch2_trans_commit(&trans, NULL, &journal_seq, 0); + if (unlikely(ret)) { + bch2_quota_acct(c, bch_qid(&inode_u), Q_INO, -1, + KEY_TYPE_QUOTA_WARN); +err_before_quota: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + goto err_trans; + } + + if (!(flags & BCH_CREATE_TMPFILE)) { + bch2_inode_update_after_write(&trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&dir->ei_update_lock); + } + + bch2_iget5_set(&inode->v, &inum); + bch2_vfs_inode_init(&trans, inum, inode, &inode_u, &subvol); + + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); + set_cached_acl(&inode->v, ACL_TYPE_DEFAULT, default_acl); + + /* + * we must insert the new inode into the inode cache before calling + * bch2_trans_exit() and dropping locks, else we could race with another + * thread pulling the inode in and modifying it: + */ + + inode->v.i_state |= I_CREATING; + + old = to_bch_ei(inode_insert5(&inode->v, + bch2_inode_hash(inum), + bch2_iget5_test, + bch2_iget5_set, + &inum)); + BUG_ON(!old); + + if (unlikely(old != inode)) { + /* + * We raced, another process pulled the new inode into cache + * before us: + */ + make_bad_inode(&inode->v); + iput(&inode->v); + + inode = old; + } else { + mutex_lock(&c->vfs_inodes_lock); + list_add(&inode->ei_vfs_inode_list, &c->vfs_inodes_list); + mutex_unlock(&c->vfs_inodes_lock); + /* + * we really don't want insert_inode_locked2() to be setting + * I_NEW... 
+ */ + unlock_new_inode(&inode->v); + } + + bch2_trans_exit(&trans); +err: + posix_acl_release(default_acl); + posix_acl_release(acl); + return inode; +err_trans: + if (!(flags & BCH_CREATE_TMPFILE)) + mutex_unlock(&dir->ei_update_lock); + + bch2_trans_exit(&trans); + make_bad_inode(&inode->v); + iput(&inode->v); + inode = ERR_PTR(ret); + goto err; +} + +/* methods */ + +static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, + unsigned int flags) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_hash_info hash = bch2_hash_info_init(c, &dir->ei_inode); + struct inode *vinode = NULL; + subvol_inum inum = { .subvol = 1 }; + int ret; + + ret = bch2_dirent_lookup(c, inode_inum(dir), &hash, + &dentry->d_name, &inum); + + if (!ret) + vinode = bch2_vfs_inode_get(c, inum); + + return d_splice_alias(vinode, dentry); +} + +static int bch2_mknod(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + umode_t mode, dev_t rdev) +{ + struct bch_inode_info *inode = + __bch2_create(idmap, to_bch_ei(vdir), dentry, mode, rdev, + (subvol_inum) { 0 }, 0); + + if (IS_ERR(inode)) + return bch2_err_class(PTR_ERR(inode)); + + d_instantiate(dentry, &inode->v); + return 0; +} + +static int bch2_create(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + umode_t mode, bool excl) +{ + return bch2_mknod(idmap, vdir, dentry, mode|S_IFREG, 0); +} + +static int __bch2_link(struct bch_fs *c, + struct bch_inode_info *inode, + struct bch_inode_info *dir, + struct dentry *dentry) +{ + struct btree_trans trans; + struct bch_inode_unpacked dir_u, inode_u; + int ret; + + mutex_lock(&inode->ei_update_lock); + bch2_trans_init(&trans, c, 4, 1024); + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_link_trans(&trans, + inode_inum(dir), &dir_u, + inode_inum(inode), &inode_u, + &dentry->d_name)); + + if (likely(!ret)) { + bch2_inode_update_after_write(&trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); + } + + bch2_trans_exit(&trans); + mutex_unlock(&inode->ei_update_lock); + return ret; +} + +static int bch2_link(struct dentry *old_dentry, struct inode *vdir, + struct dentry *dentry) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_inode_info *inode = to_bch_ei(old_dentry->d_inode); + int ret; + + lockdep_assert_held(&inode->v.i_rwsem); + + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) + return ret; + + ihold(&inode->v); + d_instantiate(dentry, &inode->v); + return 0; +} + +int __bch2_unlink(struct inode *vdir, struct dentry *dentry, + bool deleting_snapshot) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir); + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct bch_inode_unpacked dir_u, inode_u; + struct btree_trans trans; + int ret; + + bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode); + bch2_trans_init(&trans, c, 4, 1024); + + ret = commit_do(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL, + bch2_unlink_trans(&trans, + inode_inum(dir), &dir_u, + &inode_u, &dentry->d_name, + deleting_snapshot)); + if (unlikely(ret)) + goto err; + + bch2_inode_update_after_write(&trans, dir, &dir_u, + ATTR_MTIME|ATTR_CTIME); + bch2_inode_update_after_write(&trans, inode, &inode_u, + ATTR_MTIME); + + if (inode_u.bi_subvol) { + /* + * Subvolume deletion is asynchronous, but we still want to tell + * the VFS that it's been deleted here: + */ + 
set_nlink(&inode->v, 0); + } +err: + bch2_trans_exit(&trans); + bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); + + return ret; +} + +static int bch2_unlink(struct inode *vdir, struct dentry *dentry) +{ + return __bch2_unlink(vdir, dentry, false); +} + +static int bch2_symlink(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, + const char *symname) +{ + struct bch_fs *c = vdir->i_sb->s_fs_info; + struct bch_inode_info *dir = to_bch_ei(vdir), *inode; + int ret; + + inode = __bch2_create(idmap, dir, dentry, S_IFLNK|S_IRWXUGO, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); + if (IS_ERR(inode)) + return bch2_err_class(PTR_ERR(inode)); + + inode_lock(&inode->v); + ret = page_symlink(&inode->v, symname, strlen(symname) + 1); + inode_unlock(&inode->v); + + if (unlikely(ret)) + goto err; + + ret = filemap_write_and_wait_range(inode->v.i_mapping, 0, LLONG_MAX); + if (unlikely(ret)) + goto err; + + ret = __bch2_link(c, inode, dir, dentry); + if (unlikely(ret)) + goto err; + + d_instantiate(dentry, &inode->v); + return 0; +err: + iput(&inode->v); + return ret; +} + +static int bch2_mkdir(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, umode_t mode) +{ + return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0); +} + +static int bch2_rename2(struct mnt_idmap *idmap, + struct inode *src_vdir, struct dentry *src_dentry, + struct inode *dst_vdir, struct dentry *dst_dentry, + unsigned flags) +{ + struct bch_fs *c = src_vdir->i_sb->s_fs_info; + struct bch_inode_info *src_dir = to_bch_ei(src_vdir); + struct bch_inode_info *dst_dir = to_bch_ei(dst_vdir); + struct bch_inode_info *src_inode = to_bch_ei(src_dentry->d_inode); + struct bch_inode_info *dst_inode = to_bch_ei(dst_dentry->d_inode); + struct bch_inode_unpacked dst_dir_u, src_dir_u; + struct bch_inode_unpacked src_inode_u, dst_inode_u; + struct btree_trans trans; + enum bch_rename_mode mode = flags & RENAME_EXCHANGE + ? BCH_RENAME_EXCHANGE + : dst_dentry->d_inode + ? 
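+	/*
+	 * Three cases: RENAME_EXCHANGE atomically swaps the two dirents
+	 * (BCH_RENAME_EXCHANGE), an existing destination is replaced
+	 * (BCH_RENAME_OVERWRITE), and otherwise this is a plain BCH_RENAME.
+	 */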
BCH_RENAME_OVERWRITE : BCH_RENAME; + int ret; + + if (flags & ~(RENAME_NOREPLACE|RENAME_EXCHANGE)) + return -EINVAL; + + if (mode == BCH_RENAME_OVERWRITE) { + ret = filemap_write_and_wait_range(src_inode->v.i_mapping, + 0, LLONG_MAX); + if (ret) + return ret; + } + + bch2_trans_init(&trans, c, 8, 2048); + + bch2_lock_inodes(INODE_UPDATE_LOCK, + src_dir, + dst_dir, + src_inode, + dst_inode); + + if (inode_attr_changing(dst_dir, src_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, src_inode, + dst_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + + if (mode == BCH_RENAME_EXCHANGE && + inode_attr_changing(src_dir, dst_inode, Inode_opt_project)) { + ret = bch2_fs_quota_transfer(c, dst_inode, + src_dir->ei_qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + } + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_rename_trans(&trans, + inode_inum(src_dir), &src_dir_u, + inode_inum(dst_dir), &dst_dir_u, + &src_inode_u, + &dst_inode_u, + &src_dentry->d_name, + &dst_dentry->d_name, + mode)); + if (unlikely(ret)) + goto err; + + BUG_ON(src_inode->v.i_ino != src_inode_u.bi_inum); + BUG_ON(dst_inode && + dst_inode->v.i_ino != dst_inode_u.bi_inum); + + bch2_inode_update_after_write(&trans, src_dir, &src_dir_u, + ATTR_MTIME|ATTR_CTIME); + + if (src_dir != dst_dir) + bch2_inode_update_after_write(&trans, dst_dir, &dst_dir_u, + ATTR_MTIME|ATTR_CTIME); + + bch2_inode_update_after_write(&trans, src_inode, &src_inode_u, + ATTR_CTIME); + + if (dst_inode) + bch2_inode_update_after_write(&trans, dst_inode, &dst_inode_u, + ATTR_CTIME); +err: + bch2_trans_exit(&trans); + + bch2_fs_quota_transfer(c, src_inode, + bch_qid(&src_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + if (dst_inode) + bch2_fs_quota_transfer(c, dst_inode, + bch_qid(&dst_inode->ei_inode), + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_NOCHECK); + + bch2_unlock_inodes(INODE_UPDATE_LOCK, + src_dir, + dst_dir, + src_inode, + dst_inode); + + return ret; +} + +static void bch2_setattr_copy(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + struct iattr *attr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + unsigned int ia_valid = attr->ia_valid; + + if (ia_valid & ATTR_UID) + bi->bi_uid = from_kuid(i_user_ns(&inode->v), attr->ia_uid); + if (ia_valid & ATTR_GID) + bi->bi_gid = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + + if (ia_valid & ATTR_SIZE) + bi->bi_size = attr->ia_size; + + if (ia_valid & ATTR_ATIME) + bi->bi_atime = timespec_to_bch2_time(c, attr->ia_atime); + if (ia_valid & ATTR_MTIME) + bi->bi_mtime = timespec_to_bch2_time(c, attr->ia_mtime); + if (ia_valid & ATTR_CTIME) + bi->bi_ctime = timespec_to_bch2_time(c, attr->ia_ctime); + + if (ia_valid & ATTR_MODE) { + umode_t mode = attr->ia_mode; + kgid_t gid = ia_valid & ATTR_GID + ? 
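+		/*
+		 * Usual VFS chmod rule: group membership is judged against the
+		 * gid the inode will end up with (the new one if ATTR_GID is
+		 * set in the same call), and S_ISGID is stripped when the
+		 * caller is not in that group and lacks CAP_FSETID.
+		 */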
attr->ia_gid + : inode->v.i_gid; + + if (!in_group_p(gid) && + !capable_wrt_inode_uidgid(idmap, &inode->v, CAP_FSETID)) + mode &= ~S_ISGID; + bi->bi_mode = mode; + } +} + +int bch2_setattr_nonsize(struct mnt_idmap *idmap, + struct bch_inode_info *inode, + struct iattr *attr) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_qid qid; + struct btree_trans trans; + struct btree_iter inode_iter = { NULL }; + struct bch_inode_unpacked inode_u; + struct posix_acl *acl = NULL; + int ret; + + mutex_lock(&inode->ei_update_lock); + + qid = inode->ei_qid; + + if (attr->ia_valid & ATTR_UID) + qid.q[QTYP_USR] = from_kuid(i_user_ns(&inode->v), attr->ia_uid); + + if (attr->ia_valid & ATTR_GID) + qid.q[QTYP_GRP] = from_kgid(i_user_ns(&inode->v), attr->ia_gid); + + ret = bch2_fs_quota_transfer(c, inode, qid, ~0, + KEY_TYPE_QUOTA_PREALLOC); + if (ret) + goto err; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + kfree(acl); + acl = NULL; + + ret = bch2_inode_peek(&trans, &inode_iter, &inode_u, inode_inum(inode), + BTREE_ITER_INTENT); + if (ret) + goto btree_err; + + bch2_setattr_copy(idmap, inode, &inode_u, attr); + + if (attr->ia_valid & ATTR_MODE) { + ret = bch2_acl_chmod(&trans, inode_inum(inode), &inode_u, + inode_u.bi_mode, &acl); + if (ret) + goto btree_err; + } + + ret = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +btree_err: + bch2_trans_iter_exit(&trans, &inode_iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + if (unlikely(ret)) + goto err_trans; + + bch2_inode_update_after_write(&trans, inode, &inode_u, attr->ia_valid); + + if (acl) + set_cached_acl(&inode->v, ACL_TYPE_ACCESS, acl); +err_trans: + bch2_trans_exit(&trans); +err: + mutex_unlock(&inode->ei_update_lock); + + return bch2_err_class(ret); +} + +static int bch2_getattr(struct mnt_idmap *idmap, + const struct path *path, struct kstat *stat, + u32 request_mask, unsigned query_flags) +{ + struct bch_inode_info *inode = to_bch_ei(d_inode(path->dentry)); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + stat->dev = inode->v.i_sb->s_dev; + stat->ino = inode->v.i_ino; + stat->mode = inode->v.i_mode; + stat->nlink = inode->v.i_nlink; + stat->uid = inode->v.i_uid; + stat->gid = inode->v.i_gid; + stat->rdev = inode->v.i_rdev; + stat->size = i_size_read(&inode->v); + stat->atime = inode->v.i_atime; + stat->mtime = inode->v.i_mtime; + stat->ctime = inode->v.i_ctime; + stat->blksize = block_bytes(c); + stat->blocks = inode->v.i_blocks; + + if (request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime = bch2_time_to_timespec(c, inode->ei_inode.bi_otime); + } + + if (inode->ei_inode.bi_flags & BCH_INODE_IMMUTABLE) + stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= STATX_ATTR_IMMUTABLE; + + if (inode->ei_inode.bi_flags & BCH_INODE_APPEND) + stat->attributes |= STATX_ATTR_APPEND; + stat->attributes_mask |= STATX_ATTR_APPEND; + + if (inode->ei_inode.bi_flags & BCH_INODE_NODUMP) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= STATX_ATTR_NODUMP; + + return 0; +} + +static int bch2_setattr(struct mnt_idmap *idmap, + struct dentry *dentry, struct iattr *iattr) +{ + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + int ret; + + lockdep_assert_held(&inode->v.i_rwsem); + + ret = setattr_prepare(idmap, dentry, iattr); + if (ret) + return ret; + + return iattr->ia_valid & ATTR_SIZE + ? 
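+	/*
+	 * Size changes are routed through bch2_truncate(), which orders the
+	 * pagecache, on-disk i_size and extent updates; all other attribute
+	 * changes go through bch2_setattr_nonsize().
+	 */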
bch2_truncate(idmap, inode, iattr) + : bch2_setattr_nonsize(idmap, inode, iattr); +} + +static int bch2_tmpfile(struct mnt_idmap *idmap, + struct inode *vdir, struct file *file, umode_t mode) +{ + struct bch_inode_info *inode = + __bch2_create(idmap, to_bch_ei(vdir), + file->f_path.dentry, mode, 0, + (subvol_inum) { 0 }, BCH_CREATE_TMPFILE); + + if (IS_ERR(inode)) + return bch2_err_class(PTR_ERR(inode)); + + d_mark_tmpfile(file, &inode->v); + d_instantiate(file->f_path.dentry, &inode->v); + return finish_open_simple(file, 0); +} + +static int bch2_fill_extent(struct bch_fs *c, + struct fiemap_extent_info *info, + struct bkey_s_c k, unsigned flags) +{ + if (bkey_extent_is_direct_data(k.k)) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + int ret; + + if (k.k->type == KEY_TYPE_reflink_v) + flags |= FIEMAP_EXTENT_SHARED; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + int flags2 = 0; + u64 offset = p.ptr.offset; + + if (p.ptr.unwritten) + flags2 |= FIEMAP_EXTENT_UNWRITTEN; + + if (p.crc.compression_type) + flags2 |= FIEMAP_EXTENT_ENCODED; + else + offset += p.crc.offset; + + if ((offset & (block_sectors(c) - 1)) || + (k.k->size & (block_sectors(c) - 1))) + flags2 |= FIEMAP_EXTENT_NOT_ALIGNED; + + ret = fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + offset << 9, + k.k->size << 9, flags|flags2); + if (ret) + return ret; + } + + return 0; + } else if (bkey_extent_is_inline_data(k.k)) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, + flags| + FIEMAP_EXTENT_DATA_INLINE); + } else if (k.k->type == KEY_TYPE_reservation) { + return fiemap_fill_next_extent(info, + bkey_start_offset(k.k) << 9, + 0, k.k->size << 9, + flags| + FIEMAP_EXTENT_DELALLOC| + FIEMAP_EXTENT_UNWRITTEN); + } else { + BUG(); + } +} + +static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + u64 start, u64 len) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(vinode); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_buf cur, prev; + struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); + unsigned offset_into_extent, sectors; + bool have_extent = false; + u32 snapshot; + int ret = 0; + + ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + if (start + len < start) + return -EINVAL; + + start >>= 9; + + bch2_bkey_buf_init(&cur); + bch2_bkey_buf_init(&prev); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, ei->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(ei->v.i_ino, start, snapshot), 0); + + while (!(ret = btree_trans_too_many_iters(&trans)) && + (k = bch2_btree_iter_peek_upto(&iter, end)).k && + !(ret = bkey_err(k))) { + enum btree_id data_btree = BTREE_ID_extents; + + if (!bkey_extent_is_data(k.k) && + k.k->type != KEY_TYPE_reservation) { + bch2_btree_iter_advance(&iter); + continue; + } + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&cur, c, k); + + ret = bch2_read_indirect_extent(&trans, &data_btree, + &offset_into_extent, &cur); + if (ret) + break; + + k = bkey_i_to_s_c(cur.k); + bch2_bkey_buf_realloc(&prev, c, k.k->u64s); + + sectors = min(sectors, k.k->size - offset_into_extent); + + bch2_cut_front(POS(k.k->p.inode, + 
bkey_start_offset(k.k) + + offset_into_extent), + cur.k); + bch2_key_resize(&cur.k->k, sectors); + cur.k->k.p = iter.pos; + cur.k->k.p.offset += cur.k->k.size; + + if (have_extent) { + bch2_trans_unlock(&trans); + ret = bch2_fill_extent(c, info, + bkey_i_to_s_c(prev.k), 0); + if (ret) + break; + } + + bkey_copy(prev.k, cur.k); + have_extent = true; + + bch2_btree_iter_set_pos(&iter, + POS(iter.pos.inode, iter.pos.offset + sectors)); + } + start = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (!ret && have_extent) { + bch2_trans_unlock(&trans); + ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), + FIEMAP_EXTENT_LAST); + } + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&cur, c); + bch2_bkey_buf_exit(&prev, c); + return ret < 0 ? ret : 0; +} + +static const struct vm_operations_struct bch_vm_ops = { + .fault = bch2_page_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = bch2_page_mkwrite, +}; + +static int bch2_mmap(struct file *file, struct vm_area_struct *vma) +{ + file_accessed(file); + + vma->vm_ops = &bch_vm_ops; + return 0; +} + +/* Directories: */ + +static loff_t bch2_dir_llseek(struct file *file, loff_t offset, int whence) +{ + return generic_file_llseek_size(file, offset, whence, + S64_MAX, S64_MAX); +} + +static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) +{ + struct bch_inode_info *inode = file_bch_inode(file); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + if (!dir_emit_dots(file, ctx)) + return 0; + + return bch2_readdir(c, inode_inum(inode), ctx); +} + +static const struct file_operations bch_file_operations = { + .llseek = bch2_llseek, + .read_iter = bch2_read_iter, + .write_iter = bch2_write_iter, + .mmap = bch2_mmap, + .open = generic_file_open, + .fsync = bch2_fsync, + .splice_read = filemap_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = bch2_fallocate_dispatch, + .unlocked_ioctl = bch2_fs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch2_compat_fs_ioctl, +#endif + .remap_file_range = bch2_remap_file_range, +}; + +static const struct inode_operations bch_file_inode_operations = { + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .fiemap = bch2_fiemap, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct inode_operations bch_dir_inode_operations = { + .lookup = bch2_lookup, + .create = bch2_create, + .link = bch2_link, + .unlink = bch2_unlink, + .symlink = bch2_symlink, + .mkdir = bch2_mkdir, + .rmdir = bch2_unlink, + .mknod = bch2_mknod, + .rename = bch2_rename2, + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .tmpfile = bch2_tmpfile, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct file_operations bch_dir_file_operations = { + .llseek = bch2_dir_llseek, + .read = generic_read_dir, + .iterate_shared = bch2_vfs_readdir, + .fsync = bch2_fsync, + .unlocked_ioctl = bch2_fs_file_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = bch2_compat_fs_ioctl, +#endif +}; + +static const struct inode_operations bch_symlink_inode_operations = { + .get_link = page_get_link, + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct inode_operations 
bch_special_inode_operations = { + .getattr = bch2_getattr, + .setattr = bch2_setattr, + .listxattr = bch2_xattr_list, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + .get_acl = bch2_get_acl, + .set_acl = bch2_set_acl, +#endif +}; + +static const struct address_space_operations bch_address_space_operations = { + .read_folio = bch2_read_folio, + .writepages = bch2_writepages, + .readahead = bch2_readahead, + .dirty_folio = filemap_dirty_folio, + .write_begin = bch2_write_begin, + .write_end = bch2_write_end, + .invalidate_folio = bch2_invalidate_folio, + .release_folio = bch2_release_folio, + .direct_IO = noop_direct_IO, +#ifdef CONFIG_MIGRATION + .migrate_folio = filemap_migrate_folio, +#endif + .error_remove_page = generic_error_remove_page, +}; + +struct bcachefs_fid { + u64 inum; + u32 subvol; + u32 gen; +} __packed; + +struct bcachefs_fid_with_parent { + struct bcachefs_fid fid; + struct bcachefs_fid dir; +} __packed; + +static int bcachefs_fid_valid(int fh_len, int fh_type) +{ + switch (fh_type) { + case FILEID_BCACHEFS_WITHOUT_PARENT: + return fh_len == sizeof(struct bcachefs_fid) / sizeof(u32); + case FILEID_BCACHEFS_WITH_PARENT: + return fh_len == sizeof(struct bcachefs_fid_with_parent) / sizeof(u32); + default: + return false; + } +} + +static struct bcachefs_fid bch2_inode_to_fid(struct bch_inode_info *inode) +{ + return (struct bcachefs_fid) { + .inum = inode->ei_inode.bi_inum, + .subvol = inode->ei_subvol, + .gen = inode->ei_inode.bi_generation, + }; +} + +static int bch2_encode_fh(struct inode *vinode, u32 *fh, int *len, + struct inode *vdir) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_inode_info *dir = to_bch_ei(vdir); + + if (*len < sizeof(struct bcachefs_fid_with_parent) / sizeof(u32)) + return FILEID_INVALID; + + if (!S_ISDIR(inode->v.i_mode) && dir) { + struct bcachefs_fid_with_parent *fid = (void *) fh; + + fid->fid = bch2_inode_to_fid(inode); + fid->dir = bch2_inode_to_fid(dir); + + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITH_PARENT; + } else { + struct bcachefs_fid *fid = (void *) fh; + + *fid = bch2_inode_to_fid(inode); + + *len = sizeof(*fid) / sizeof(u32); + return FILEID_BCACHEFS_WITHOUT_PARENT; + } +} + +static struct inode *bch2_nfs_get_inode(struct super_block *sb, + struct bcachefs_fid fid) +{ + struct bch_fs *c = sb->s_fs_info; + struct inode *vinode = bch2_vfs_inode_get(c, (subvol_inum) { + .subvol = fid.subvol, + .inum = fid.inum, + }); + if (!IS_ERR(vinode) && vinode->i_generation != fid.gen) { + iput(vinode); + vinode = ERR_PTR(-ESTALE); + } + return vinode; +} + +static struct dentry *bch2_fh_to_dentry(struct super_block *sb, struct fid *_fid, + int fh_len, int fh_type) +{ + struct bcachefs_fid *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type)) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, *fid)); +} + +static struct dentry *bch2_fh_to_parent(struct super_block *sb, struct fid *_fid, + int fh_len, int fh_type) +{ + struct bcachefs_fid_with_parent *fid = (void *) _fid; + + if (!bcachefs_fid_valid(fh_len, fh_type) || + fh_type != FILEID_BCACHEFS_WITH_PARENT) + return NULL; + + return d_obtain_alias(bch2_nfs_get_inode(sb, fid->dir)); +} + +static struct dentry *bch2_get_parent(struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + subvol_inum parent_inum = { + .subvol = inode->ei_inode.bi_parent_subvol ?: + inode->ei_subvol, + .inum = inode->ei_inode.bi_dir, + }; + + if (!parent_inum.inum) + return NULL; + + 
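/*
 * Note on the export ops above: a bare handle (struct bcachefs_fid) packs
 * inum + subvol + generation into 16 bytes, i.e. an fh_len of 4 32-bit
 * words, and the with-parent variant is twice that (8 words);
 * bcachefs_fid_valid() checks exactly these lengths. Stale handles are
 * caught in bch2_nfs_get_inode() by comparing the generation. Here the
 * parent is resolved from the inode's backpointer fields
 * (bi_dir/bi_parent_subvol), and d_obtain_alias() returns an existing or
 * disconnected dentry for it.
 */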
return d_obtain_alias(bch2_vfs_inode_get(c, parent_inum)); +} + +static int bch2_get_name(struct dentry *parent, char *name, struct dentry *child) +{ + struct bch_inode_info *inode = to_bch_ei(child->d_inode); + struct bch_inode_info *dir = to_bch_ei(parent->d_inode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct btree_trans trans; + struct btree_iter iter1; + struct btree_iter iter2; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + struct bch_inode_unpacked inode_u; + subvol_inum target; + u32 snapshot; + unsigned name_len; + int ret; + + if (!S_ISDIR(dir->v.i_mode)) + return -EINVAL; + + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter1, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); + bch2_trans_iter_init(&trans, &iter2, BTREE_ID_dirents, + POS(dir->ei_inode.bi_inum, 0), 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, dir->ei_subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter1, snapshot); + bch2_btree_iter_set_snapshot(&iter2, snapshot); + + ret = bch2_inode_find_by_inum_trans(&trans, inode_inum(inode), &inode_u); + if (ret) + goto err; + + if (inode_u.bi_dir == dir->ei_inode.bi_inum) { + bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + + k = bch2_btree_iter_peek_slot(&iter1); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_dirent) { + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + goto err; + } + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret > 0) + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + if (ret) + goto err; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } else { + /* + * File with multiple hardlinks and our backref is to the wrong + * directory - linear search: + */ + for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + if (k.k->p.inode > dir->ei_inode.bi_inum) + break; + + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + ret = bch2_dirent_read_target(&trans, inode_inum(dir), d, &target); + if (ret < 0) + break; + if (ret) + continue; + + if (target.subvol == inode->ei_subvol && + target.inum == inode->ei_inode.bi_inum) + goto found; + } + } + + ret = -ENOENT; + goto err; +found: + name_len = min_t(unsigned, bch2_dirent_name_bytes(d), NAME_MAX); + + memcpy(name, d.v->d_name, name_len); + name[name_len] = '\0'; +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter1); + bch2_trans_iter_exit(&trans, &iter2); + bch2_trans_exit(&trans); + + return ret; +} + +static const struct export_operations bch_export_ops = { + .encode_fh = bch2_encode_fh, + .fh_to_dentry = bch2_fh_to_dentry, + .fh_to_parent = bch2_fh_to_parent, + .get_parent = bch2_get_parent, + .get_name = bch2_get_name, +}; + +static void bch2_vfs_inode_init(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + struct bch_subvolume *subvol) +{ + bch2_inode_update_after_write(trans, inode, bi, ~0); + + if (BCH_SUBVOLUME_SNAP(subvol)) + set_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + else + clear_bit(EI_INODE_SNAPSHOT, &inode->ei_flags); + + inode->v.i_blocks = bi->bi_sectors; + inode->v.i_ino = bi->bi_inum; + inode->v.i_rdev = bi->bi_dev; + inode->v.i_generation = bi->bi_generation; + inode->v.i_size = bi->bi_size; + + inode->ei_flags = 0; + inode->ei_quota_reserved = 0; + 
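/*
 * The remaining setup below is driven by the unpacked btree inode: quota
 * and subvolume context first, then i_op/i_fop are chosen by file type,
 * with special files (chr/blk/fifo/sock) handed to init_special_inode().
 * The mapping's a_ops is always bch_address_space_operations, and large
 * folios are enabled on it.
 */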
inode->ei_qid = bch_qid(bi); + inode->ei_subvol = inum.subvol; + + inode->v.i_mapping->a_ops = &bch_address_space_operations; + + switch (inode->v.i_mode & S_IFMT) { + case S_IFREG: + inode->v.i_op = &bch_file_inode_operations; + inode->v.i_fop = &bch_file_operations; + break; + case S_IFDIR: + inode->v.i_op = &bch_dir_inode_operations; + inode->v.i_fop = &bch_dir_file_operations; + break; + case S_IFLNK: + inode_nohighmem(&inode->v); + inode->v.i_op = &bch_symlink_inode_operations; + break; + default: + init_special_inode(&inode->v, inode->v.i_mode, inode->v.i_rdev); + inode->v.i_op = &bch_special_inode_operations; + break; + } + + mapping_set_large_folios(inode->v.i_mapping); +} + +static struct inode *bch2_alloc_inode(struct super_block *sb) +{ + struct bch_inode_info *inode; + + inode = kmem_cache_alloc(bch2_inode_cache, GFP_NOFS); + if (!inode) + return NULL; + + inode_init_once(&inode->v); + mutex_init(&inode->ei_update_lock); + two_state_lock_init(&inode->ei_pagecache_lock); + INIT_LIST_HEAD(&inode->ei_vfs_inode_list); + mutex_init(&inode->ei_quota_lock); + + return &inode->v; +} + +static void bch2_i_callback(struct rcu_head *head) +{ + struct inode *vinode = container_of(head, struct inode, i_rcu); + struct bch_inode_info *inode = to_bch_ei(vinode); + + kmem_cache_free(bch2_inode_cache, inode); +} + +static void bch2_destroy_inode(struct inode *vinode) +{ + call_rcu(&vinode->i_rcu, bch2_i_callback); +} + +static int inode_update_times_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + bi->bi_atime = timespec_to_bch2_time(c, inode->v.i_atime); + bi->bi_mtime = timespec_to_bch2_time(c, inode->v.i_mtime); + bi->bi_ctime = timespec_to_bch2_time(c, inode->v.i_ctime); + + return 0; +} + +static int bch2_vfs_write_inode(struct inode *vinode, + struct writeback_control *wbc) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(vinode); + int ret; + + mutex_lock(&inode->ei_update_lock); + ret = bch2_write_inode(c, inode, inode_update_times_fn, NULL, + ATTR_ATIME|ATTR_MTIME|ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + + return bch2_err_class(ret); +} + +static void bch2_evict_inode(struct inode *vinode) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(vinode); + + truncate_inode_pages_final(&inode->v.i_data); + + clear_inode(&inode->v); + + BUG_ON(!is_bad_inode(&inode->v) && inode->ei_quota_reserved); + + if (!inode->v.i_nlink && !is_bad_inode(&inode->v)) { + bch2_quota_acct(c, inode->ei_qid, Q_SPC, -((s64) inode->v.i_blocks), + KEY_TYPE_QUOTA_WARN); + bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, + KEY_TYPE_QUOTA_WARN); + bch2_inode_rm(c, inode_inum(inode)); + } + + mutex_lock(&c->vfs_inodes_lock); + list_del_init(&inode->ei_vfs_inode_list); + mutex_unlock(&c->vfs_inodes_lock); +} + +void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s) +{ + struct bch_inode_info *inode, **i; + DARRAY(struct bch_inode_info *) grabbed; + bool clean_pass = false, this_pass_clean; + + /* + * Initially, we scan for inodes without I_DONTCACHE, then mark them to + * be pruned with d_mark_dontcache(). 
+ * + * Once we've had a clean pass where we didn't find any inodes without + * I_DONTCACHE, we wait for them to be freed: + */ + + darray_init(&grabbed); + darray_make_room(&grabbed, 1024); +again: + cond_resched(); + this_pass_clean = true; + + mutex_lock(&c->vfs_inodes_lock); + list_for_each_entry(inode, &c->vfs_inodes_list, ei_vfs_inode_list) { + if (!snapshot_list_has_id(s, inode->ei_subvol)) + continue; + + if (!(inode->v.i_state & I_DONTCACHE) && + !(inode->v.i_state & I_FREEING) && + igrab(&inode->v)) { + this_pass_clean = false; + + if (darray_push_gfp(&grabbed, inode, GFP_ATOMIC|__GFP_NOWARN)) { + iput(&inode->v); + break; + } + } else if (clean_pass && this_pass_clean) { + wait_queue_head_t *wq = bit_waitqueue(&inode->v.i_state, __I_NEW); + DEFINE_WAIT_BIT(wait, &inode->v.i_state, __I_NEW); + + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + mutex_unlock(&c->vfs_inodes_lock); + + schedule(); + finish_wait(wq, &wait.wq_entry); + goto again; + } + } + mutex_unlock(&c->vfs_inodes_lock); + + darray_for_each(grabbed, i) { + inode = *i; + d_mark_dontcache(&inode->v); + d_prune_aliases(&inode->v); + iput(&inode->v); + } + grabbed.nr = 0; + + if (!clean_pass || !this_pass_clean) { + clean_pass = this_pass_clean; + goto again; + } + + darray_exit(&grabbed); +} + +static int bch2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_sb; + struct bch_fs *c = sb->s_fs_info; + struct bch_fs_usage_short usage = bch2_fs_usage_read_short(c); + unsigned shift = sb->s_blocksize_bits - 9; + /* + * this assumes inodes take up 64 bytes, which is a decent average + * number: + */ + u64 avail_inodes = ((usage.capacity - usage.used) << 3); + u64 fsid; + + buf->f_type = BCACHEFS_STATFS_MAGIC; + buf->f_bsize = sb->s_blocksize; + buf->f_blocks = usage.capacity >> shift; + buf->f_bfree = usage.free >> shift; + buf->f_bavail = avail_factor(usage.free) >> shift; + + buf->f_files = usage.nr_inodes + avail_inodes; + buf->f_ffree = avail_inodes; + + fsid = le64_to_cpup((void *) c->sb.user_uuid.b) ^ + le64_to_cpup((void *) c->sb.user_uuid.b + sizeof(u64)); + buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; + buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; + buf->f_namelen = BCH_NAME_MAX; + + return 0; +} + +static int bch2_sync_fs(struct super_block *sb, int wait) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + if (c->opts.journal_flush_disabled) + return 0; + + if (!wait) { + bch2_journal_flush_async(&c->journal, NULL); + return 0; + } + + ret = bch2_journal_flush(&c->journal); + return bch2_err_class(ret); +} + +static struct bch_fs *bch2_path_to_fs(const char *path) +{ + struct bch_fs *c; + dev_t dev; + int ret; + + ret = lookup_bdev(path, &dev); + if (ret) + return ERR_PTR(ret); + + c = bch2_dev_to_fs(dev); + if (c) + closure_put(&c->cl); + return c ?: ERR_PTR(-ENOENT); +} + +static char **split_devs(const char *_dev_name, unsigned *nr) +{ + char *dev_name = NULL, **devs = NULL, *s; + size_t i, nr_devs = 0; + + dev_name = kstrdup(_dev_name, GFP_KERNEL); + if (!dev_name) + return NULL; + + for (s = dev_name; s; s = strchr(s + 1, ':')) + nr_devs++; + + devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL); + if (!devs) { + kfree(dev_name); + return NULL; + } + + for (i = 0, s = dev_name; + s; + (s = strchr(s, ':')) && (*s++ = '\0')) + devs[i++] = s; + + *nr = nr_devs; + return devs; +} + +static int bch2_remount(struct super_block *sb, int *flags, char *data) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_opts opts = bch2_opts_empty(); + int ret; 
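/*
 * Only a subset of options is applied on remount: SB_RDONLY is translated
 * into the read_only option and the ro/rw transition is performed under
 * state_lock, and errors= may be updated. If going read-write fails, the
 * filesystem is left read-only and the remount returns an error.
 */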
+ + opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); + + ret = bch2_parse_mount_opts(c, &opts, data); + if (ret) + goto err; + + if (opts.read_only != c->opts.read_only) { + down_write(&c->state_lock); + + if (opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= SB_RDONLY; + } else { + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + up_write(&c->state_lock); + ret = -EINVAL; + goto err; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts.read_only; + + up_write(&c->state_lock); + } + + if (opts.errors >= 0) + c->opts.errors = opts.errors; +err: + return bch2_err_class(ret); +} + +static int bch2_show_devname(struct seq_file *seq, struct dentry *root) +{ + struct bch_fs *c = root->d_sb->s_fs_info; + struct bch_dev *ca; + unsigned i; + bool first = true; + + for_each_online_member(ca, c, i) { + if (!first) + seq_putc(seq, ':'); + first = false; + seq_puts(seq, "/dev/"); + seq_puts(seq, ca->name); + } + + return 0; +} + +static int bch2_show_options(struct seq_file *seq, struct dentry *root) +{ + struct bch_fs *c = root->d_sb->s_fs_info; + enum bch_opt_id i; + struct printbuf buf = PRINTBUF; + int ret = 0; + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + + if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + printbuf_reset(&buf); + bch2_opt_to_text(&buf, c, c->disk_sb.sb, opt, v, + OPT_SHOW_MOUNT_STYLE); + seq_putc(seq, ','); + seq_puts(seq, buf.buf); + } + + if (buf.allocation_failure) + ret = -ENOMEM; + printbuf_exit(&buf); + return ret; +} + +static void bch2_put_super(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + __bch2_fs_stop(c); +} + +static const struct super_operations bch_super_operations = { + .alloc_inode = bch2_alloc_inode, + .destroy_inode = bch2_destroy_inode, + .write_inode = bch2_vfs_write_inode, + .evict_inode = bch2_evict_inode, + .sync_fs = bch2_sync_fs, + .statfs = bch2_statfs, + .show_devname = bch2_show_devname, + .show_options = bch2_show_options, + .remount_fs = bch2_remount, + .put_super = bch2_put_super, +#if 0 + .freeze_fs = bch2_freeze, + .unfreeze_fs = bch2_unfreeze, +#endif +}; + +static int bch2_set_super(struct super_block *s, void *data) +{ + s->s_fs_info = data; + return 0; +} + +static int bch2_noset_super(struct super_block *s, void *data) +{ + return -EBUSY; +} + +static int bch2_test_super(struct super_block *s, void *data) +{ + struct bch_fs *c = s->s_fs_info; + struct bch_fs **devs = data; + unsigned i; + + if (!c) + return false; + + for (i = 0; devs[i]; i++) + if (c != devs[i]) + return false; + return true; +} + +static struct dentry *bch2_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + struct bch_fs *c; + struct bch_dev *ca; + struct super_block *sb; + struct inode *vinode; + struct bch_opts opts = bch2_opts_empty(); + char **devs; + struct bch_fs **devs_to_fs = NULL; + unsigned i, nr_devs; + int ret; + + opt_set(opts, read_only, (flags & SB_RDONLY) != 0); + + ret = bch2_parse_mount_opts(NULL, &opts, data); + if (ret) + return ERR_PTR(ret); + + if (!dev_name || strlen(dev_name) == 0) + return ERR_PTR(-EINVAL); + + devs = split_devs(dev_name, &nr_devs); + if (!devs) + return ERR_PTR(-ENOMEM); + + devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL); + if (!devs_to_fs) { + sb = ERR_PTR(-ENOMEM); + goto got_sb; + } + + for (i = 0; i < nr_devs; i++) + devs_to_fs[i] = 
bch2_path_to_fs(devs[i]); + + sb = sget(fs_type, bch2_test_super, bch2_noset_super, + flags|SB_NOSEC, devs_to_fs); + if (!IS_ERR(sb)) + goto got_sb; + + c = bch2_fs_open(devs, nr_devs, opts); + if (IS_ERR(c)) { + sb = ERR_CAST(c); + goto got_sb; + } + + /* Some options can't be parsed until after the fs is started: */ + ret = bch2_parse_mount_opts(c, &opts, data); + if (ret) { + bch2_fs_stop(c); + sb = ERR_PTR(ret); + goto got_sb; + } + + bch2_opts_apply(&c->opts, opts); + + sb = sget(fs_type, NULL, bch2_set_super, flags|SB_NOSEC, c); + if (IS_ERR(sb)) + bch2_fs_stop(c); +got_sb: + kfree(devs_to_fs); + kfree(devs[0]); + kfree(devs); + + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + ret = bch2_err_class(ret); + return ERR_PTR(ret); + } + + c = sb->s_fs_info; + + if (sb->s_root) { + if ((flags ^ sb->s_flags) & SB_RDONLY) { + ret = -EBUSY; + goto err_put_super; + } + goto out; + } + + sb->s_blocksize = block_bytes(c); + sb->s_blocksize_bits = ilog2(block_bytes(c)); + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_op = &bch_super_operations; + sb->s_export_op = &bch_export_ops; +#ifdef CONFIG_BCACHEFS_QUOTA + sb->s_qcop = &bch2_quotactl_operations; + sb->s_quota_types = QTYPE_MASK_USR|QTYPE_MASK_GRP|QTYPE_MASK_PRJ; +#endif + sb->s_xattr = bch2_xattr_handlers; + sb->s_magic = BCACHEFS_STATFS_MAGIC; + sb->s_time_gran = c->sb.nsec_per_time_unit; + sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; + sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); + c->vfs_sb = sb; + strscpy(sb->s_id, c->name, sizeof(sb->s_id)); + + ret = super_setup_bdi(sb); + if (ret) + goto err_put_super; + + sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; + + for_each_online_member(ca, c, i) { + struct block_device *bdev = ca->disk_sb.bdev; + + /* XXX: create an anonymous device for multi device filesystems */ + sb->s_bdev = bdev; + sb->s_dev = bdev->bd_dev; + percpu_ref_put(&ca->io_ref); + break; + } + + c->dev = sb->s_dev; + +#ifdef CONFIG_BCACHEFS_POSIX_ACL + if (c->opts.acl) + sb->s_flags |= SB_POSIXACL; +#endif + + sb->s_shrink.seeks = 0; + + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); + ret = PTR_ERR_OR_ZERO(vinode); + if (ret) { + bch_err(c, "error mounting: error getting root inode: %s", bch2_err_str(ret)); + goto err_put_super; + } + + sb->s_root = d_make_root(vinode); + if (!sb->s_root) { + bch_err(c, "error mounting: error allocating root dentry"); + ret = -ENOMEM; + goto err_put_super; + } + + sb->s_flags |= SB_ACTIVE; +out: + return dget(sb->s_root); + +err_put_super: + deactivate_locked_super(sb); + return ERR_PTR(bch2_err_class(ret)); +} + +static void bch2_kill_sb(struct super_block *sb) +{ + struct bch_fs *c = sb->s_fs_info; + + generic_shutdown_super(sb); + bch2_fs_free(c); +} + +static struct file_system_type bcache_fs_type = { + .owner = THIS_MODULE, + .name = "bcachefs", + .mount = bch2_mount, + .kill_sb = bch2_kill_sb, + .fs_flags = FS_REQUIRES_DEV, +}; + +MODULE_ALIAS_FS("bcachefs"); + +void bch2_vfs_exit(void) +{ + unregister_filesystem(&bcache_fs_type); + kmem_cache_destroy(bch2_inode_cache); +} + +int __init bch2_vfs_init(void) +{ + int ret = -ENOMEM; + + bch2_inode_cache = KMEM_CACHE(bch_inode_info, SLAB_RECLAIM_ACCOUNT); + if (!bch2_inode_cache) + goto err; + + ret = register_filesystem(&bcache_fs_type); + if (ret) + goto err; + + return 0; +err: + bch2_vfs_exit(); + return ret; +} + +#endif /* NO_BCACHEFS_FS */ diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h new file mode 100644 index 000000000..6170d214d --- /dev/null +++ b/fs/bcachefs/fs.h @@ -0,0 +1,208 @@ +/* 
SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FS_H +#define _BCACHEFS_FS_H + +#include "inode.h" +#include "opts.h" +#include "str_hash.h" +#include "quota_types.h" +#include "two_state_shared_lock.h" + +#include <linux/seqlock.h> +#include <linux/stat.h> + +struct bch_inode_info { + struct inode v; + struct list_head ei_vfs_inode_list; + unsigned long ei_flags; + + struct mutex ei_update_lock; + u64 ei_quota_reserved; + unsigned long ei_last_dirtied; + two_state_lock_t ei_pagecache_lock; + + struct mutex ei_quota_lock; + struct bch_qid ei_qid; + + u32 ei_subvol; + + /* + * When we've been doing nocow writes we'll need to issue flushes to the + * underlying block devices + * + * XXX: a device may have had a flush issued by some other codepath. It + * would be better to keep for each device a sequence number that's + * incremented when we issue a cache flush, and track here the sequence + * number that needs flushing. + */ + struct bch_devs_mask ei_devs_need_flush; + + /* copy of inode in btree: */ + struct bch_inode_unpacked ei_inode; +}; + +#define bch2_pagecache_add_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_tryget(i) bch2_two_state_trylock(&i->ei_pagecache_lock, 0) +#define bch2_pagecache_add_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 0) + +#define bch2_pagecache_block_put(i) bch2_two_state_unlock(&i->ei_pagecache_lock, 1) +#define bch2_pagecache_block_get(i) bch2_two_state_lock(&i->ei_pagecache_lock, 1) + +static inline subvol_inum inode_inum(struct bch_inode_info *inode) +{ + return (subvol_inum) { + .subvol = inode->ei_subvol, + .inum = inode->ei_inode.bi_inum, + }; +} + +/* + * Set if we've gotten a btree error for this inode, and thus the vfs inode and + * btree inode may be inconsistent: + */ +#define EI_INODE_ERROR 0 + +/* + * Set if the inode is in a snapshot subvolume - we don't do quota accounting in + * those: + */ +#define EI_INODE_SNAPSHOT 1 + +#define to_bch_ei(_inode) \ + container_of_or_null(_inode, struct bch_inode_info, v) + +static inline int ptrcmp(void *l, void *r) +{ + return cmp_int(l, r); +} + +enum bch_inode_lock_op { + INODE_LOCK = (1U << 0), + INODE_PAGECACHE_BLOCK = (1U << 1), + INODE_UPDATE_LOCK = (1U << 2), +}; + +#define bch2_lock_inodes(_locks, ...) \ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ + \ + for (i = 1; i < ARRAY_SIZE(a); i++) \ + if (a[i] != a[i - 1]) { \ + if ((_locks) & INODE_LOCK) \ + down_write_nested(&a[i]->v.i_rwsem, i); \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_get(a[i]);\ + if ((_locks) & INODE_UPDATE_LOCK) \ + mutex_lock_nested(&a[i]->ei_update_lock, i);\ + } \ +} while (0) + +#define bch2_unlock_inodes(_locks, ...) 
\ +do { \ + struct bch_inode_info *a[] = { NULL, __VA_ARGS__ }; \ + unsigned i; \ + \ + bubble_sort(&a[1], ARRAY_SIZE(a) - 1, ptrcmp); \ + \ + for (i = 1; i < ARRAY_SIZE(a); i++) \ + if (a[i] != a[i - 1]) { \ + if ((_locks) & INODE_LOCK) \ + up_write(&a[i]->v.i_rwsem); \ + if ((_locks) & INODE_PAGECACHE_BLOCK) \ + bch2_pagecache_block_put(a[i]);\ + if ((_locks) & INODE_UPDATE_LOCK) \ + mutex_unlock(&a[i]->ei_update_lock); \ + } \ +} while (0) + +static inline struct bch_inode_info *file_bch_inode(struct file *file) +{ + return to_bch_ei(file_inode(file)); +} + +static inline bool inode_attr_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode, + enum inode_opt_id id) +{ + return !(inode->ei_inode.bi_fields_set & (1 << id)) && + bch2_inode_opt_get(&dir->ei_inode, id) != + bch2_inode_opt_get(&inode->ei_inode, id); +} + +static inline bool inode_attrs_changing(struct bch_inode_info *dir, + struct bch_inode_info *inode) +{ + unsigned id; + + for (id = 0; id < Inode_opt_nr; id++) + if (inode_attr_changing(dir, inode, id)) + return true; + + return false; +} + +struct bch_inode_unpacked; + +#ifndef NO_BCACHEFS_FS + +struct bch_inode_info * +__bch2_create(struct mnt_idmap *, struct bch_inode_info *, + struct dentry *, umode_t, dev_t, subvol_inum, unsigned); + +int bch2_fs_quota_transfer(struct bch_fs *, + struct bch_inode_info *, + struct bch_qid, + unsigned, + enum quota_acct_mode); + +static inline int bch2_set_projid(struct bch_fs *c, + struct bch_inode_info *inode, + u32 projid) +{ + struct bch_qid qid = inode->ei_qid; + + qid.q[QTYP_PRJ] = projid; + + return bch2_fs_quota_transfer(c, inode, qid, + 1 << QTYP_PRJ, + KEY_TYPE_QUOTA_PREALLOC); +} + +struct inode *bch2_vfs_inode_get(struct bch_fs *, subvol_inum); + +/* returns 0 if we want to do the update, or error is passed up */ +typedef int (*inode_set_fn)(struct bch_inode_info *, + struct bch_inode_unpacked *, void *); + +void bch2_inode_update_after_write(struct btree_trans *, + struct bch_inode_info *, + struct bch_inode_unpacked *, + unsigned); +int __must_check bch2_write_inode(struct bch_fs *, struct bch_inode_info *, + inode_set_fn, void *, unsigned); + +int bch2_setattr_nonsize(struct mnt_idmap *, + struct bch_inode_info *, + struct iattr *); +int __bch2_unlink(struct inode *, struct dentry *, bool); + +void bch2_evict_subvolume_inodes(struct bch_fs *, snapshot_id_list *); + +void bch2_vfs_exit(void); +int bch2_vfs_init(void); + +#else + +#define bch2_inode_update_after_write(_trans, _inode, _inode_u, _fields) do {} while (0) + +static inline void bch2_evict_subvolume_inodes(struct bch_fs *c, + snapshot_id_list *s) {} +static inline void bch2_vfs_exit(void) {} +static inline int bch2_vfs_init(void) { return 0; } + +#endif /* NO_BCACHEFS_FS */ + +#endif /* _BCACHEFS_FS_H */ diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c new file mode 100644 index 000000000..7edd4632d --- /dev/null +++ b/fs/bcachefs/fsck.c @@ -0,0 +1,2452 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "darray.h" +#include "dirent.h" +#include "error.h" +#include "fs-common.h" +#include "fsck.h" +#include "inode.h" +#include "keylist.h" +#include "subvolume.h" +#include "super.h" +#include "xattr.h" + +#include <linux/bsearch.h> +#include <linux/dcache.h> /* struct qstr */ + +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +/* + * XXX: this is handling transaction restarts without returning + * -BCH_ERR_transaction_restart_nested, this is not how we do things anymore: + */ 
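/*
 * The helpers below recompute per-inode statistics straight from the
 * btrees: bch2_count_inode_sectors() sums the sizes of allocated extents
 * in [SPOS(inum, 0, snapshot), POS(inum, U64_MAX)], and
 * bch2_count_subdirs() counts DT_DIR dirents over the same range. Both
 * return the count, or a negative error code from the iteration - e.g.
 * an inode whose extents btree holds two 8-sector extents in the given
 * snapshot yields a count of 16.
 */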
+static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 sectors = 0; + int ret; + + for_each_btree_key_upto(trans, iter, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) + if (bkey_extent_is_allocation(k.k)) + sectors += k.k->size; + + bch2_trans_iter_exit(trans, &iter); + + return ret ?: sectors; +} + +static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + u64 subdirs = 0; + int ret; + + for_each_btree_key_upto(trans, iter, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + POS(inum, U64_MAX), + 0, k, ret) { + if (k.k->type != KEY_TYPE_dirent) + continue; + + d = bkey_s_c_to_dirent(k); + if (d.v->d_type == DT_DIR) + subdirs++; + } + bch2_trans_iter_exit(trans, &iter); + + return ret ?: subdirs; +} + +static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot, + u32 *subvol) +{ + struct bch_snapshot s; + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, + POS(0, snapshot), 0, + snapshot, &s); + if (!ret) + *subvol = le32_to_cpu(s.subvol); + else if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot %u not found", snapshot); + return ret; + +} + +static int __subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + struct bch_subvolume s; + int ret; + + ret = bch2_subvolume_get(trans, subvol, false, 0, &s); + + *snapshot = le32_to_cpu(s.snapshot); + *inum = le64_to_cpu(s.inode); + return ret; +} + +static int subvol_lookup(struct btree_trans *trans, u32 subvol, + u32 *snapshot, u64 *inum) +{ + return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum)); +} + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, + POS(0, inode_nr), + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !bkey_eq(k.k->p, POS(0, inode_nr))) { + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bch2_inode_unpack(k, inode); +err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(trans->c, "error fetching inode %llu: %s", + inode_nr, bch2_err_str(ret)); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int __lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode_nr, *snapshot), 0); + ret = bkey_err(k); + if (ret) + goto err; + + ret = bkey_is_inode(k.k) + ? 
bch2_inode_unpack(k, inode) + : -BCH_ERR_ENOENT_inode; + if (!ret) + *snapshot = iter.pos.snapshot; +err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(trans->c, "error fetching inode %llu:%u: %s", + inode_nr, *snapshot, bch2_err_str(ret)); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int lookup_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode, + u32 *snapshot) +{ + return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot)); +} + +static int __lookup_dirent(struct btree_trans *trans, + struct bch_hash_info hash_info, + subvol_inum dir, struct qstr *name, + u64 *target, unsigned *type) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc, + &hash_info, dir, name, 0); + if (ret) + return ret; + + d = bkey_s_c_to_dirent(bch2_btree_iter_peek_slot(&iter)); + *target = le64_to_cpu(d.v->d_inum); + *type = d.v->d_type; + bch2_trans_iter_exit(trans, &iter); + return 0; +} + +static int __write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct bkey_inode_buf *inode_p = + bch2_trans_kmalloc(trans, sizeof(*inode_p)); + + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack(inode_p, inode); + inode_p->inode.k.p.snapshot = snapshot; + + return bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &inode_p->inode.k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static int write_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + int ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __write_inode(trans, inode, snapshot)); + if (ret) + bch_err(trans->c, "error in fsck: error updating inode: %s", + bch2_err_str(ret)); + return ret; +} + +static int fsck_inode_rm(struct btree_trans *trans, u64 inum, u32 snapshot) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + int ret; + + do { + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_dirents, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL) ?: + bch2_btree_delete_range_trans(trans, BTREE_ID_xattrs, + SPOS(inum, 0, snapshot), + SPOS(inum, U64_MAX, snapshot), + 0, NULL); + } while (ret == -BCH_ERR_transaction_restart_nested); + if (ret) + goto err; +retry: + bch2_trans_begin(trans); + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inum, snapshot), BTREE_ITER_INTENT); + ret = bkey_err(k); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(c, + "inode %llu:%u not found when deleting", + inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(k, &inode_u); + + /* Subvolume root? 
*/ + if (inode_u.bi_subvol) + bch_warn(c, "deleting inode %llu marked as unlinked, but also a subvolume root!?", inode_u.bi_inum); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + return ret ?: -BCH_ERR_transaction_restart_nested; +} + +static int __remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + int ret; + + ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + goto err; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_INTENT); + + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter); +err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_fn(c, ret); + return ret; +} + +/* Get lost+found, create if it doesn't exist: */ +static int lookup_lostfound(struct btree_trans *trans, u32 subvol, + struct bch_inode_unpacked *lostfound) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked root; + struct bch_hash_info root_hash_info; + struct qstr lostfound_str = QSTR("lost+found"); + subvol_inum root_inum = { .subvol = subvol }; + u64 inum = 0; + unsigned d_type = 0; + u32 snapshot; + int ret; + + ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum); + if (ret) + return ret; + + ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot); + if (ret) + return ret; + + root_hash_info = bch2_hash_info_init(c, &root); + + ret = __lookup_dirent(trans, root_hash_info, root_inum, + &lostfound_str, &inum, &d_type); + if (bch2_err_matches(ret, ENOENT)) { + bch_notice(c, "creating lost+found"); + goto create_lostfound; + } + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error looking up lost+found: %s", bch2_err_str(ret)); + if (ret) + return ret; + + if (d_type != DT_DIR) { + bch_err(c, "error looking up lost+found: not a directory"); + return ret; + } + + /* + * The bch2_check_dirents pass has already run, dangling dirents + * shouldn't exist here: + */ + return __lookup_inode(trans, inum, lostfound, &snapshot); + +create_lostfound: + bch2_inode_init_early(c, lostfound); + + ret = bch2_create_trans(trans, root_inum, &root, + lostfound, &lostfound_str, + 0, 0, S_IFDIR|0700, 0, NULL, NULL, + (subvol_inum) { }, 0); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error creating lost+found: %s", bch2_err_str(ret)); + return ret; +} + +static int __reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) +{ + struct bch_hash_info dir_hash; + struct bch_inode_unpacked lostfound; + char name_buf[20]; + struct qstr name; + u64 dir_offset = 0; + u32 subvol; + int ret; + + ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol); + if (ret) + return ret; + + ret = lookup_lostfound(trans, subvol, &lostfound); + if (ret) + return ret; + + if (S_ISDIR(inode->bi_mode)) { + lostfound.bi_nlink++; + + ret = __write_inode(trans, &lostfound, U32_MAX); + if (ret) + return ret; + } + + dir_hash = 
bch2_hash_info_init(trans->c, &lostfound); + + snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum); + name = (struct qstr) QSTR(name_buf); + + ret = bch2_dirent_create(trans, + (subvol_inum) { + .subvol = subvol, + .inum = lostfound.bi_inum, + }, + &dir_hash, + inode_d_type(inode), + &name, inode->bi_inum, &dir_offset, + BCH_HASH_SET_MUST_CREATE); + if (ret) + return ret; + + inode->bi_dir = lostfound.bi_inum; + inode->bi_dir_offset = dir_offset; + + return __write_inode(trans, inode, inode_snapshot); +} + +static int reattach_inode(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 inode_snapshot) +{ + int ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_LAZY_RW| + BTREE_INSERT_NOFAIL, + __reattach_inode(trans, inode, inode_snapshot)); + if (ret) { + bch_err(trans->c, "error reattaching inode %llu: %s", + inode->bi_inum, bch2_err_str(ret)); + return ret; + } + + return ret; +} + +static int remove_backpointer(struct btree_trans *trans, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + d = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_dirents, + POS(inode->bi_dir, inode->bi_dir_offset), 0, + dirent); + ret = bkey_err(d) ?: + __remove_dirent(trans, d.k->p); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +struct snapshots_seen_entry { + u32 id; + u32 equiv; +}; + +struct snapshots_seen { + struct bpos pos; + DARRAY(struct snapshots_seen_entry) ids; +}; + +static inline void snapshots_seen_exit(struct snapshots_seen *s) +{ + darray_exit(&s->ids); +} + +static inline void snapshots_seen_init(struct snapshots_seen *s) +{ + memset(s, 0, sizeof(*s)); +} + +static int snapshots_seen_add(struct bch_fs *c, struct snapshots_seen *s, u32 id) +{ + struct snapshots_seen_entry *i, n = { id, id }; + int ret; + + darray_for_each(s->ids, i) { + if (n.equiv < i->equiv) + break; + + if (i->equiv == n.equiv) { + bch_err(c, "%s(): adding duplicate snapshot", __func__); + return -EINVAL; + } + } + + ret = darray_insert_item(&s->ids, i - s->ids.data, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); + return ret; +} + +static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s, + enum btree_id btree_id, struct bpos pos) +{ + struct snapshots_seen_entry *i, n = { + .id = pos.snapshot, + .equiv = bch2_snapshot_equiv(c, pos.snapshot), + }; + int ret = 0; + + if (!bkey_eq(s->pos, pos)) + s->ids.nr = 0; + + pos.snapshot = n.equiv; + s->pos = pos; + + darray_for_each(s->ids, i) + if (i->equiv == n.equiv) { + if (fsck_err_on(i->id != n.id, c, + "snapshot deletion did not run correctly:\n" + " duplicate keys in btree %s at %llu:%llu snapshots %u, %u (equiv %u)\n", + bch2_btree_ids[btree_id], + pos.inode, pos.offset, + i->id, n.id, n.equiv)) + return -BCH_ERR_need_snapshot_cleanup; + + return 0; + } + + ret = darray_push(&s->ids, n); + if (ret) + bch_err(c, "error reallocating snapshots_seen table (size %zu)", + s->ids.size); +fsck_err: + return ret; +} + +/** + * key_visible_in_snapshot - returns true if @id is a descendent of @ancestor, + * and @ancestor hasn't been overwritten in @seen + * + * That is, returns whether key in @ancestor snapshot is visible in @id snapshot + */ +static bool key_visible_in_snapshot(struct bch_fs *c, struct snapshots_seen *seen, + u32 id, u32 ancestor) +{ + ssize_t i; + u32 top = seen->ids.nr ? 
seen->ids.data[seen->ids.nr - 1].equiv : 0; + + BUG_ON(id > ancestor); + BUG_ON(!bch2_snapshot_is_equiv(c, id)); + BUG_ON(!bch2_snapshot_is_equiv(c, ancestor)); + + /* @ancestor should be the snapshot most recently added to @seen */ + BUG_ON(ancestor != seen->pos.snapshot); + BUG_ON(ancestor != top); + + if (id == ancestor) + return true; + + if (!bch2_snapshot_is_ancestor(c, id, ancestor)) + return false; + + for (i = seen->ids.nr - 2; + i >= 0 && seen->ids.data[i].equiv >= id; + --i) + if (bch2_snapshot_is_ancestor(c, id, seen->ids.data[i].equiv) && + bch2_snapshot_is_ancestor(c, seen->ids.data[i].equiv, ancestor)) + return false; + + return true; +} + +/** + * ref_visible - given a key with snapshot id @src that points to a key with + * snapshot id @dst, test whether there is some snapshot in which @dst is + * visible. + * + * This assumes we're visiting @src keys in natural key order. + * + * @s - list of snapshot IDs already seen at @src + * @src - snapshot ID of src key + * @dst - snapshot ID of dst key + */ +static int ref_visible(struct bch_fs *c, struct snapshots_seen *s, + u32 src, u32 dst) +{ + return dst <= src + ? key_visible_in_snapshot(c, s, dst, src) + : bch2_snapshot_is_ancestor(c, src, dst); +} + +static int ref_visible2(struct bch_fs *c, + u32 src, struct snapshots_seen *src_seen, + u32 dst, struct snapshots_seen *dst_seen) +{ + src = bch2_snapshot_equiv(c, src); + dst = bch2_snapshot_equiv(c, dst); + + if (dst > src) { + swap(dst, src); + swap(dst_seen, src_seen); + } + return key_visible_in_snapshot(c, src_seen, dst, src); +} + +#define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ + for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ + (_i)->snapshot <= (_snapshot); _i++) \ + if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + +struct inode_walker_entry { + struct bch_inode_unpacked inode; + u32 snapshot; + u64 count; +}; + +struct inode_walker { + bool first_this_inode; + u64 cur_inum; + + DARRAY(struct inode_walker_entry) inodes; +}; + +static void inode_walker_exit(struct inode_walker *w) +{ + darray_exit(&w->inodes); +} + +static struct inode_walker inode_walker_init(void) +{ + return (struct inode_walker) { 0, }; +} + +static int add_inode(struct bch_fs *c, struct inode_walker *w, + struct bkey_s_c inode) +{ + struct bch_inode_unpacked u; + + BUG_ON(bch2_inode_unpack(inode, &u)); + + return darray_push(&w->inodes, ((struct inode_walker_entry) { + .inode = u, + .snapshot = bch2_snapshot_equiv(c, inode.k->p.snapshot), + })); +} + +static int get_inodes_all_snapshots(struct btree_trans *trans, + struct inode_walker *w, u64 inum) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + u32 restart_count = trans->restart_count; + int ret; + + if (w->cur_inum == inum) + return 0; + + w->inodes.nr = 0; + + for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (k.k->p.offset != inum) + break; + + if (bkey_is_inode(k.k)) + add_inode(c, w, k); + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + return ret; + + w->cur_inum = inum; + w->first_this_inode = true; + + if (trans_was_restarted(trans, restart_count)) + return -BCH_ERR_transaction_restart_nested; + + return 0; +} + +static struct inode_walker_entry * +lookup_inode_for_snapshot(struct bch_fs *c, + struct inode_walker *w, u32 snapshot) +{ + struct inode_walker_entry *i; + + snapshot = bch2_snapshot_equiv(c, snapshot); + + darray_for_each(w->inodes, i) + if (bch2_snapshot_is_ancestor(c, snapshot, 
i->snapshot)) + goto found; + + return NULL; +found: + BUG_ON(snapshot > i->snapshot); + + if (snapshot != i->snapshot) { + struct inode_walker_entry new = *i; + int ret; + + new.snapshot = snapshot; + new.count = 0; + + bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u", + w->cur_inum, snapshot, i->snapshot); + + while (i > w->inodes.data && i[-1].snapshot > snapshot) + --i; + + ret = darray_insert_item(&w->inodes, i - w->inodes.data, new); + if (ret) + return ERR_PTR(ret); + } + + return i; +} + +static struct inode_walker_entry *walk_inode(struct btree_trans *trans, + struct inode_walker *w, struct bpos pos) +{ + int ret = get_inodes_all_snapshots(trans, w, pos.inode); + if (ret) + return ERR_PTR(ret); + + return lookup_inode_for_snapshot(trans->c, w, pos.snapshot); +} + +static int __get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + w->inodes.nr = 0; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + if (k.k->p.offset != inum) + break; + + if (!ref_visible(c, s, s->pos.snapshot, equiv)) + continue; + + if (bkey_is_inode(k.k)) + add_inode(c, w, k); + + if (equiv >= s->pos.snapshot) + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + +static int check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(!bch2_snapshot_equiv(c, k.k->p.snapshot), c, + "key in missing snapshot: %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: 1; +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int hash_redo_key(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k) +{ + struct bkey_i *delete; + struct bkey_i *tmp; + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + if (IS_ERR(delete)) + return PTR_ERR(delete); + + tmp = bch2_bkey_make_mut_noupdate(trans, k); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + + bkey_init(&delete->k); + delete->k.p = k_iter->pos; + return bch2_btree_iter_traverse(k_iter) ?: + bch2_trans_update(trans, k_iter, delete, 0) ?: + bch2_hash_set_snapshot(trans, desc, hash_info, + (subvol_inum) { 0, k.k->p.inode }, + k.k->p.snapshot, tmp, + BCH_HASH_SET_MUST_CREATE, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); +} + +static int hash_check_key(struct btree_trans *trans, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c hash_k) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter = { NULL }; + struct printbuf buf = PRINTBUF; + struct bkey_s_c k; + u64 hash; + int ret = 0; + + if (hash_k.k->type != desc.key_type) + return 0; + + hash = desc.hash_bkey(hash_info, hash_k); + + if (likely(hash == hash_k.k->p.offset)) + return 0; + + if (hash_k.k->p.offset < hash) + goto bad_hash; + + for_each_btree_key_norestart(trans, iter, desc.btree_id, + SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), + BTREE_ITER_SLOTS, k, ret) { + if (bkey_eq(k.k->p, hash_k.k->p)) + break; + + if 
(fsck_err_on(k.k->type == desc.key_type && + !desc.cmp_bkey(k, hash_k), c, + "duplicate hash table keys:\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), + buf.buf))) { + ret = bch2_hash_delete_at(trans, desc, hash_info, k_iter, 0) ?: 1; + break; + } + + if (bkey_deleted(k.k)) { + bch2_trans_iter_exit(trans, &iter); + goto bad_hash; + } + } +out: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); + return ret; +bad_hash: + if (fsck_err(c, "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", + bch2_btree_ids[desc.btree_id], hash_k.k->p.inode, hash_k.k->p.offset, hash, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { + ret = hash_redo_key(trans, desc, hash_info, k_iter, hash_k); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "hash_redo_key err %s", bch2_err_str(ret)); + if (ret) + return ret; + ret = -BCH_ERR_transaction_restart_nested; + } +fsck_err: + goto out; +} + +static int check_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_inode_unpacked *prev, + struct snapshots_seen *s, + bool full) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + bool do_update = false; + int ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret < 0) + goto err; + if (ret) + return 0; + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + + /* + * if snapshot id isn't a leaf node, skip it - deletion in + * particular is not atomic, so on the internal snapshot nodes + * we can see inodes marked for deletion after a clean shutdown + */ + if (bch2_snapshot_is_internal_node(c, k.k->p.snapshot)) + return 0; + + if (!bkey_is_inode(k.k)) + return 0; + + BUG_ON(bch2_inode_unpack(k, &u)); + + if (!full && + !(u.bi_flags & (BCH_INODE_I_SIZE_DIRTY| + BCH_INODE_I_SECTORS_DIRTY| + BCH_INODE_UNLINKED))) + return 0; + + if (prev->bi_inum != u.bi_inum) + *prev = u; + + if (fsck_err_on(prev->bi_hash_seed != u.bi_hash_seed || + inode_d_type(prev) != inode_d_type(&u), c, + "inodes in different snapshots don't match")) { + bch_err(c, "repair not implemented yet"); + return -EINVAL; + } + + if (u.bi_flags & BCH_INODE_UNLINKED && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu unlinked", + u.bi_inum))) { + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + + ret = fsck_inode_rm(trans, u.bi_inum, iter->pos.snapshot); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error in fsck: error while deleting inode: %s", + bch2_err_str(ret)); + return ret; + } + + if (u.bi_flags & BCH_INODE_I_SIZE_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_size dirty", + u.bi_inum))) { + bch_verbose(c, "truncating inode %llu", u.bi_inum); + + bch2_trans_unlock(trans); + bch2_fs_lazy_rw(c); + + /* + * XXX: need to truncate partial blocks too here - or ideally + * just switch units to bytes and that issue goes away + */ + ret = bch2_btree_delete_range_trans(trans, BTREE_ID_extents, + SPOS(u.bi_inum, round_up(u.bi_size, block_bytes(c)) >> 9, + iter->pos.snapshot), + POS(u.bi_inum, U64_MAX), + 0, NULL); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err(c, "error in fsck: error truncating inode: %s", + bch2_err_str(ret)); + if (ret) + return ret; + + /* + * We truncated without our normal sector accounting hook, just + * make sure we recalculate it: + */ + u.bi_flags |= BCH_INODE_I_SECTORS_DIRTY; + + 
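/*
 * The recount itself happens in the I_SECTORS_DIRTY block just below;
 * with the stale extents past i_size now deleted, the i_size dirty flag
 * can be cleared:
 */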
u.bi_flags &= ~BCH_INODE_I_SIZE_DIRTY; + do_update = true; + } + + if (u.bi_flags & BCH_INODE_I_SECTORS_DIRTY && + (!c->sb.clean || + fsck_err(c, "filesystem marked clean, but inode %llu has i_sectors dirty", + u.bi_inum))) { + s64 sectors; + + bch_verbose(c, "recounting sectors for inode %llu", + u.bi_inum); + + sectors = bch2_count_inode_sectors(trans, u.bi_inum, iter->pos.snapshot); + if (sectors < 0) { + bch_err(c, "error in fsck: error recounting inode sectors: %s", + bch2_err_str(sectors)); + return sectors; + } + + u.bi_sectors = sectors; + u.bi_flags &= ~BCH_INODE_I_SECTORS_DIRTY; + do_update = true; + } + + if (u.bi_flags & BCH_INODE_BACKPTR_UNTRUSTED) { + u.bi_dir = 0; + u.bi_dir_offset = 0; + u.bi_flags &= ~BCH_INODE_BACKPTR_UNTRUSTED; + do_update = true; + } + + if (do_update) { + ret = __write_inode(trans, &u, iter->pos.snapshot); + if (ret) + bch_err(c, "error in fsck: error updating inode: %s", + bch2_err_str(ret)); + } +err: +fsck_err: + if (ret) + bch_err_fn(c, ret); + return ret; +} + +noinline_for_stack +int bch2_check_inodes(struct bch_fs *c) +{ + bool full = c->opts.fsck; + struct btree_trans trans; + struct btree_iter iter; + struct bch_inode_unpacked prev = { 0 }; + struct snapshots_seen s; + struct bkey_s_c k; + int ret; + + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, + POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_inode(&trans, &iter, k, &prev, &s, full)); + + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* + * Checking for overlapping extents needs to be reimplemented + */ +#if 0 +static int fix_overlapping_extent(struct btree_trans *trans, + struct bkey_s_c k, struct bpos cut_at) +{ + struct btree_iter iter; + struct bkey_i *u; + int ret; + + u = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bkey_reassemble(u, k); + bch2_cut_front(cut_at, u); + + + /* + * We don't want to go through the extent_handle_overwrites path: + * + * XXX: this is going to screw up disk accounting, extent triggers + * assume things about extent overwrites - we should be running the + * triggers manually here + */ + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, u->k.p, + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + + BUG_ON(iter.flags & BTREE_ITER_IS_EXTENTS); + ret = bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(trans, &iter, u, BTREE_TRIGGER_NORUN) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW); + bch2_trans_iter_exit(trans, &iter); + return ret; +} +#endif + +static struct bkey_s_c_dirent dirent_get_by_pos(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos pos) +{ + return bch2_bkey_get_iter_typed(trans, iter, BTREE_ID_dirents, pos, 0, dirent); +} + +static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static bool dirent_points_to_inode(struct bkey_s_c_dirent d, + struct bch_inode_unpacked *inode) +{ + return d.v->d_type == DT_SUBVOL + ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol + : le64_to_cpu(d.v->d_inum) == inode->bi_inum; +} + +static int inode_backpointer_exists(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct btree_iter iter; + struct bkey_s_c_dirent d; + int ret; + + d = dirent_get_by_pos(trans, &iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot)); + ret = bkey_err(d); + if (ret) + return bch2_err_matches(ret, ENOENT) ? 0 : ret; + + ret = dirent_points_to_inode(d, inode); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + u32 restart_count = trans->restart_count; + int ret = 0; + s64 count2; + + darray_for_each(w->inodes, i) { + if (i->inode.bi_sectors == i->count) + continue; + + count2 = bch2_count_inode_sectors(trans, w->cur_inum, i->snapshot); + + if (i->count != count2) { + bch_err(c, "fsck counted i_sectors wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_sectors == i->count) + continue; + } + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SECTORS_DIRTY), c, + "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", + w->cur_inum, i->snapshot, + i->inode.bi_sectors, i->count)) { + i->inode.bi_sectors = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + } + } +fsck_err: + if (ret) + bch_err_fn(c, ret); + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; +} + +struct extent_end { + u32 snapshot; + u64 offset; + struct snapshots_seen seen; +}; + +typedef DARRAY(struct extent_end) extent_ends; + +static int get_print_extent(struct btree_trans *trans, struct bpos pos, struct printbuf *out) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_extents, pos, + BTREE_ITER_SLOTS| + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_NOT_EXTENTS); + ret = bkey_err(k); + if (ret) + return ret; + + bch2_bkey_val_to_text(out, trans->c, k); + bch2_trans_iter_exit(trans, &iter); + return 0; +} + +static int check_overlapping_extents(struct btree_trans *trans, + struct snapshots_seen *seen, + extent_ends *extent_ends, + struct bkey_s_c k, + struct btree_iter *iter) +{ + struct bch_fs *c = trans->c; + struct extent_end *i; + struct printbuf buf = PRINTBUF; + int ret = 0; + + darray_for_each(*extent_ends, i) { + /* duplicate, due to transaction restart: */ + if (i->offset == k.k->p.offset && + i->snapshot == k.k->p.snapshot) + continue; + + if (!ref_visible2(c, + k.k->p.snapshot, seen, + i->snapshot, &i->seen)) + continue; + + if (i->offset <= bkey_start_offset(k.k)) + continue; + + printbuf_reset(&buf); + prt_str(&buf, "overlapping extents:\n "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\n "); + + ret = get_print_extent(trans, SPOS(k.k->p.inode, i->offset, i->snapshot), &buf); + if (ret) + break; + + if (fsck_err(c, "%s", buf.buf)) { + struct bkey_i *update = bch2_trans_kmalloc(trans, bkey_bytes(k.k)); + if ((ret = PTR_ERR_OR_ZERO(update))) + goto err; + bkey_reassemble(update, k); + ret = bch2_trans_update_extent(trans, iter, update, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + if (ret) + goto err; + } + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int extent_ends_at(extent_ends *extent_ends, + struct snapshots_seen *seen, + struct bkey_s_c k) +{ + struct extent_end *i, n = (struct 
extent_end) { + .snapshot = k.k->p.snapshot, + .offset = k.k->p.offset, + .seen = *seen, + }; + + n.seen.ids.data = kmemdup(seen->ids.data, + sizeof(seen->ids.data[0]) * seen->ids.size, + GFP_KERNEL); + if (!n.seen.ids.data) + return -BCH_ERR_ENOMEM_fsck_extent_ends_at; + + darray_for_each(*extent_ends, i) { + if (i->snapshot == k.k->p.snapshot) { + snapshots_seen_exit(&i->seen); + *i = n; + return 0; + } + + if (i->snapshot >= k.k->p.snapshot) + break; + } + + return darray_insert_item(extent_ends, i - extent_ends->data, n); +} + +static void extent_ends_reset(extent_ends *extent_ends) +{ + struct extent_end *i; + + darray_for_each(*extent_ends, i) + snapshots_seen_exit(&i->seen); + + extent_ends->nr = 0; +} + +static int check_extent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct inode_walker *inode, + struct snapshots_seen *s, + extent_ends *extent_ends) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; + struct bpos equiv; + int ret = 0; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) { + ret = ret < 0 ? ret : 0; + goto out; + } + + equiv = k.k->p; + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + + if (k.k->type == KEY_TYPE_whiteout) + goto out; + + if (inode->cur_inum != k.k->p.inode) { + ret = check_i_sectors(trans, inode); + if (ret) + goto err; + + extent_ends_reset(extent_ends); + } + + BUG_ON(!iter->path->should_be_locked); + + ret = check_overlapping_extents(trans, s, extent_ends, k, iter); + if (ret) + goto err; + + ret = extent_ends_at(extent_ends, s, k); + if (ret) + goto err; + + i = walk_inode(trans, inode, equiv); + ret = PTR_ERR_OR_ZERO(i); + if (ret) + goto err; + + if (fsck_err_on(!i, c, + "extent in missing inode:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } + + if (!i) + goto out; + + if (fsck_err_on(!S_ISREG(i->inode.bi_mode) && + !S_ISLNK(i->inode.bi_mode), c, + "extent in non regular inode mode %o:\n %s", + i->inode.bi_mode, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } + + /* + * Check inodes in reverse order, from oldest snapshots to newest, so + * that we emit the fewest number of whiteouts necessary: + */ + for (i = inode->inodes.data + inode->inodes.nr - 1; + i >= inode->inodes.data; + --i) { + if (i->snapshot > equiv.snapshot || + !key_visible_in_snapshot(c, s, i->snapshot, equiv.snapshot)) + continue; + + if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_I_SIZE_DIRTY) && + k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && + !bkey_extent_is_reservation(k), c, + "extent type past end of inode %llu:%u, i_size %llu\n %s", + i->inode.bi_inum, i->snapshot, i->inode.bi_size, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + struct btree_iter iter2; + + bch2_trans_copy_iter(&iter2, iter); + bch2_btree_iter_set_snapshot(&iter2, i->snapshot); + ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_btree_delete_at(trans, &iter2, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + bch2_trans_iter_exit(trans, &iter2); + if (ret) + goto err; + + if (i->snapshot != equiv.snapshot) { + ret = snapshots_seen_add(c, s, i->snapshot); + if (ret) + goto err; + } + } + } + + if (bkey_extent_is_allocation(k.k)) + 
for_each_visible_inode(c, s, inode, equiv.snapshot, i) + i->count += k.k->size; +#if 0 + bch2_bkey_buf_reassemble(&prev, c, k); +#endif + +out: +err: +fsck_err: + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_fn(c, ret); + return ret; +} + +/* + * Walk extents: verify that extents have a corresponding S_ISREG inode, and + * that i_size an i_sectors are consistent + */ +int bch2_check_extents(struct bch_fs *c) +{ + struct inode_walker w = inode_walker_init(); + struct snapshots_seen s; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + extent_ends extent_ends = { 0 }; + struct disk_reservation res = { 0 }; + int ret = 0; + + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_extents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + &res, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({ + bch2_disk_reservation_put(c, &res); + check_extent(&trans, &iter, k, &w, &s, &extent_ends); + })); + + bch2_disk_reservation_put(c, &res); + extent_ends_reset(&extent_ends); + darray_exit(&extent_ends); + inode_walker_exit(&w); + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + u32 restart_count = trans->restart_count; + int ret = 0; + s64 count2; + + darray_for_each(w->inodes, i) { + if (i->inode.bi_nlink == i->count) + continue; + + count2 = bch2_count_subdirs(trans, w->cur_inum, i->snapshot); + if (count2 < 0) + return count2; + + if (i->count != count2) { + bch_err(c, "fsck counted subdirectories wrong: got %llu should be %llu", + i->count, count2); + i->count = count2; + if (i->inode.bi_nlink == i->count) + continue; + } + + if (fsck_err_on(i->inode.bi_nlink != i->count, c, + "directory %llu:%u with wrong i_nlink: got %u, should be %llu", + w->cur_inum, i->snapshot, i->inode.bi_nlink, i->count)) { + i->inode.bi_nlink = i->count; + ret = write_inode(trans, &i->inode, i->snapshot); + if (ret) + break; + } + } +fsck_err: + if (ret) + bch_err_fn(c, ret); + if (!ret && trans_was_restarted(trans, restart_count)) + ret = -BCH_ERR_transaction_restart_nested; + return ret; +} + +static int check_dirent_target(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + u32 target_snapshot) +{ + struct bch_fs *c = trans->c; + struct bkey_i_dirent *n; + bool backpointer_exists = true; + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + + if (!inode_points_to_dirent(target, d)) { + ret = inode_backpointer_exists(trans, target, d.k->p.snapshot); + if (ret < 0) + goto err; + + backpointer_exists = ret; + ret = 0; + + if (fsck_err_on(S_ISDIR(target->bi_mode) && + backpointer_exists, c, + "directory %llu with multiple links", + target->bi_inum)) { + ret = __remove_dirent(trans, d.k->p); + goto out; + } + + if (fsck_err_on(backpointer_exists && + !target->bi_nlink, c, + "inode %llu type %s has multiple links but i_nlink 0", + target->bi_inum, bch2_d_types[d.v->d_type])) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_UNLINKED; + + ret = 
__write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + + if (fsck_err_on(!backpointer_exists, c, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + + ret = __write_inode(trans, target, target_snapshot); + if (ret) + goto err; + } + } + + if (fsck_err_on(d.v->d_type != inode_d_type(target), c, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + goto err; + + d = dirent_i_to_s_c(n); + } + + if (d.v->d_type == DT_SUBVOL && + target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) && + (c->sb.version < bcachefs_metadata_version_subvol_dirent || + fsck_err(c, "dirent has wrong d_parent_subvol field: got %u, should be %u", + le32_to_cpu(d.v->d_parent_subvol), + target->bi_parent_subvol))) { + n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + + ret = bch2_trans_update(trans, iter, &n->k_i, 0); + if (ret) + goto err; + + d = dirent_i_to_s_c(n); + } +out: +err: +fsck_err: + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_fn(c, ret); + return ret; +} + +static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct bch_hash_info *hash_info, + struct inode_walker *dir, + struct inode_walker *target, + struct snapshots_seen *s) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_dirent d; + struct inode_walker_entry *i; + struct printbuf buf = PRINTBUF; + struct bpos equiv; + int ret = 0; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) { + ret = ret < 0 ? 
ret : 0; + goto out; + } + + equiv = k.k->p; + equiv.snapshot = bch2_snapshot_equiv(c, k.k->p.snapshot); + + ret = snapshots_seen_update(c, s, iter->btree_id, k.k->p); + if (ret) + goto err; + + if (k.k->type == KEY_TYPE_whiteout) + goto out; + + if (dir->cur_inum != k.k->p.inode) { + ret = check_subdir_count(trans, dir); + if (ret) + goto err; + } + + BUG_ON(!iter->path->should_be_locked); + + i = walk_inode(trans, dir, equiv); + ret = PTR_ERR_OR_ZERO(i); + if (ret < 0) + goto err; + + if (dir->first_this_inode) + *hash_info = bch2_hash_info_init(c, &dir->inodes.data[0].inode); + dir->first_this_inode = false; + + if (fsck_err_on(!i, c, + "dirent in nonexisting directory:\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + goto out; + } + + if (!i) + goto out; + + if (fsck_err_on(!S_ISDIR(i->inode.bi_mode), c, + "dirent in non directory inode type %s:\n%s", + bch2_d_type_str(inode_d_type(&i->inode)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto out; + } + + ret = hash_check_key(trans, bch2_dirent_hash_desc, hash_info, iter, k); + if (ret < 0) + goto err; + if (ret) { + /* dirent has been deleted */ + ret = 0; + goto out; + } + + if (k.k->type != KEY_TYPE_dirent) + goto out; + + d = bkey_s_c_to_dirent(k); + + if (d.v->d_type == DT_SUBVOL) { + struct bch_inode_unpacked subvol_root; + u32 target_subvol = le32_to_cpu(d.v->d_child_subvol); + u32 target_snapshot; + u64 target_inum; + + ret = __subvol_lookup(trans, target_subvol, + &target_snapshot, &target_inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, c, + "dirent points to missing subvolume %u", + le32_to_cpu(d.v->d_child_subvol))) { + ret = __remove_dirent(trans, d.k->p); + goto err; + } + + ret = __lookup_inode(trans, target_inum, + &subvol_root, &target_snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, c, + "subvolume %u points to missing subvolume root %llu", + target_subvol, + target_inum)) { + bch_err(c, "repair not implemented yet"); + ret = -EINVAL; + goto err; + } + + if (fsck_err_on(subvol_root.bi_subvol != target_subvol, c, + "subvol root %llu has wrong bi_subvol field: got %u, should be %u", + target_inum, + subvol_root.bi_subvol, target_subvol)) { + subvol_root.bi_subvol = target_subvol; + ret = __write_inode(trans, &subvol_root, target_snapshot); + if (ret) + goto err; + } + + ret = check_dirent_target(trans, iter, d, &subvol_root, + target_snapshot); + if (ret) + goto err; + } else { + ret = __get_visible_inodes(trans, target, s, le64_to_cpu(d.v->d_inum)); + if (ret) + goto err; + + if (fsck_err_on(!target->inodes.nr, c, + "dirent points to missing inode: (equiv %u)\n%s", + equiv.snapshot, + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + ret = __remove_dirent(trans, d.k->p); + if (ret) + goto err; + } + + darray_for_each(target->inodes, i) { + ret = check_dirent_target(trans, iter, d, + &i->inode, i->snapshot); + if (ret) + goto err; + } + } + + if (d.v->d_type == DT_DIR) + for_each_visible_inode(c, s, dir, equiv.snapshot, i) + i->count++; + +out: +err: +fsck_err: + printbuf_exit(&buf); + + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_fn(c, ret); + return ret; +} + +/* + * Walk dirents: verify that they all have a corresponding S_ISDIR inode, + * validate d_type + */ +int 
bch2_check_dirents(struct bch_fs *c) +{ + struct inode_walker dir = inode_walker_init(); + struct inode_walker target = inode_walker_init(); + struct snapshots_seen s; + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + snapshots_seen_init(&s); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_dirents, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_dirent(&trans, &iter, k, &hash_info, &dir, &target, &s)); + + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + inode_walker_exit(&dir); + inode_walker_exit(&target); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct bch_hash_info *hash_info, + struct inode_walker *inode) +{ + struct bch_fs *c = trans->c; + struct inode_walker_entry *i; + int ret; + + ret = check_key_has_snapshot(trans, iter, k); + if (ret) + return ret; + + i = walk_inode(trans, inode, k.k->p); + ret = PTR_ERR_OR_ZERO(i); + if (ret) + return ret; + + if (inode->first_this_inode) + *hash_info = bch2_hash_info_init(c, &inode->inodes.data[0].inode); + inode->first_this_inode = false; + + if (fsck_err_on(!i, c, + "xattr for missing inode %llu", + k.k->p.inode)) + return bch2_btree_delete_at(trans, iter, 0); + + if (!i) + return 0; + + ret = hash_check_key(trans, bch2_xattr_hash_desc, hash_info, iter, k); +fsck_err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_fn(c, ret); + return ret; +} + +/* + * Walk xattrs: verify that they all have a corresponding inode + */ +int bch2_check_xattrs(struct bch_fs *c) +{ + struct inode_walker inode = inode_walker_init(); + struct bch_hash_info hash_info; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + POS(BCACHEFS_ROOT_INO, 0), + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, + k, + NULL, NULL, + BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_xattr(&trans, &iter, k, &hash_info, &inode)); + + bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int check_root_trans(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked root_inode; + u32 snapshot; + u64 inum; + int ret; + + ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (mustfix_fsck_err_on(ret, c, "root subvol missing")) { + struct bkey_i_subvolume root_subvol; + + snapshot = U32_MAX; + inum = BCACHEFS_ROOT_INO; + + bkey_subvolume_init(&root_subvol.k_i); + root_subvol.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_subvol.v.flags = 0; + root_subvol.v.snapshot = cpu_to_le32(snapshot); + root_subvol.v.inode = cpu_to_le64(inum); + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + __bch2_btree_insert(trans, BTREE_ID_subvolumes, + &root_subvol.k_i, 0)); + if (ret) { + bch_err(c, "error writing root subvol: %s", bch2_err_str(ret)); + goto err; + } + + } + + ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (mustfix_fsck_err_on(ret, c, "root directory missing") || + 
mustfix_fsck_err_on(!S_ISDIR(root_inode.bi_mode), c, + "root inode not a directory")) { + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, + 0, NULL); + root_inode.bi_inum = inum; + + ret = __write_inode(trans, &root_inode, snapshot); + if (ret) + bch_err(c, "error writing root inode: %s", bch2_err_str(ret)); + } +err: +fsck_err: + return ret; +} + +/* Get root directory, create if it doesn't exist: */ +int bch2_check_root(struct bch_fs *c) +{ + int ret; + + ret = bch2_trans_do(c, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + check_root_trans(&trans)); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +struct pathbuf_entry { + u64 inum; + u32 snapshot; +}; + +typedef DARRAY(struct pathbuf_entry) pathbuf; + +static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot) +{ + struct pathbuf_entry *i; + + darray_for_each(*p, i) + if (i->inum == inum && + i->snapshot == snapshot) + return true; + + return false; +} + +static int path_down(struct bch_fs *c, pathbuf *p, + u64 inum, u32 snapshot) +{ + int ret = darray_push(p, ((struct pathbuf_entry) { + .inum = inum, + .snapshot = snapshot, + })); + + if (ret) + bch_err(c, "fsck: error allocating memory for pathbuf, size %zu", + p->size); + return ret; +} + +/* + * Check that a given inode is reachable from the root: + * + * XXX: we should also be verifying that inodes are in the right subvolumes + */ +static int check_path(struct btree_trans *trans, + pathbuf *p, + struct bch_inode_unpacked *inode, + u32 snapshot) +{ + struct bch_fs *c = trans->c; + int ret = 0; + + snapshot = bch2_snapshot_equiv(c, snapshot); + p->nr = 0; + + while (!(inode->bi_inum == BCACHEFS_ROOT_INO && + inode->bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + struct btree_iter dirent_iter; + struct bkey_s_c_dirent d; + u32 parent_snapshot = snapshot; + + if (inode->bi_subvol) { + u64 inum; + + ret = subvol_lookup(trans, inode->bi_parent_subvol, + &parent_snapshot, &inum); + if (ret) + break; + } + + ret = lockrestart_do(trans, + PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter, + SPOS(inode->bi_dir, inode->bi_dir_offset, + parent_snapshot))).k)); + if (ret && !bch2_err_matches(ret, ENOENT)) + break; + + if (!ret && !dirent_points_to_inode(d, inode)) { + bch2_trans_iter_exit(trans, &dirent_iter); + ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + } + + if (bch2_err_matches(ret, ENOENT)) { + if (fsck_err(c, "unreachable inode %llu:%u, type %s nlink %u backptr %llu:%llu", + inode->bi_inum, snapshot, + bch2_d_type_str(inode_d_type(inode)), + inode->bi_nlink, + inode->bi_dir, + inode->bi_dir_offset)) + ret = reattach_inode(trans, inode, snapshot); + break; + } + + bch2_trans_iter_exit(trans, &dirent_iter); + + if (!S_ISDIR(inode->bi_mode)) + break; + + ret = path_down(c, p, inode->bi_inum, snapshot); + if (ret) { + bch_err(c, "memory allocation failure"); + return ret; + } + + snapshot = parent_snapshot; + + ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot); + if (ret) { + /* Should have been caught in dirents pass */ + bch_err(c, "error looking up parent directory: %i", ret); + break; + } + + if (path_is_dup(p, inode->bi_inum, snapshot)) { + struct pathbuf_entry *i; + + /* XXX print path */ + bch_err(c, "directory structure loop"); + + darray_for_each(*p, i) + pr_err("%llu:%u", i->inum, i->snapshot); + pr_err("%llu:%u", inode->bi_inum, snapshot); + + if (!fsck_err(c, "directory structure loop")) + return 0; + + ret = commit_do(trans, NULL, NULL, + BTREE_INSERT_NOFAIL| + BTREE_INSERT_LAZY_RW, + remove_backpointer(trans, inode)); + if (ret) { + 
bch_err(c, "error removing dirent: %i", ret); + break; + } + + ret = reattach_inode(trans, inode, snapshot); + } + } +fsck_err: + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* + * Check for unreachable inodes, as well as loops in the directory structure: + * After bch2_check_dirents(), if an inode backpointer doesn't exist that means it's + * unreachable: + */ +int bch2_check_directory_structure(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked u; + pathbuf path = { 0, }; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (!bkey_is_inode(k.k)) + continue; + + ret = bch2_inode_unpack(k, &u); + if (ret) { + /* Should have been caught earlier in fsck: */ + bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret); + break; + } + + if (u.bi_flags & BCH_INODE_UNLINKED) + continue; + + ret = check_path(&trans, &path, &u, iter.pos.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + darray_exit(&path); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* check_nlink pass: */ + +struct nlink_table { + size_t nr; + size_t size; + + struct nlink { + u64 inum; + u32 snapshot; + u32 count; + } *d; +}; + +static int add_nlink(struct bch_fs *c, struct nlink_table *t, + u64 inum, u32 snapshot) +{ + if (t->nr == t->size) { + size_t new_size = max_t(size_t, 128UL, t->size * 2); + void *d = kvmalloc_array(new_size, sizeof(t->d[0]), GFP_KERNEL); + + if (!d) { + bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", + new_size); + return -BCH_ERR_ENOMEM_fsck_add_nlink; + } + + if (t->d) + memcpy(d, t->d, t->size * sizeof(t->d[0])); + kvfree(t->d); + + t->d = d; + t->size = new_size; + } + + + t->d[t->nr++] = (struct nlink) { + .inum = inum, + .snapshot = snapshot, + }; + + return 0; +} + +static int nlink_cmp(const void *_l, const void *_r) +{ + const struct nlink *l = _l; + const struct nlink *r = _r; + + return cmp_int(l->inum, r->inum) ?: cmp_int(l->snapshot, r->snapshot); +} + +static void inc_link(struct bch_fs *c, struct snapshots_seen *s, + struct nlink_table *links, + u64 range_start, u64 range_end, u64 inum, u32 snapshot) +{ + struct nlink *link, key = { + .inum = inum, .snapshot = U32_MAX, + }; + + if (inum < range_start || inum >= range_end) + return; + + link = __inline_bsearch(&key, links->d, links->nr, + sizeof(links->d[0]), nlink_cmp); + if (!link) + return; + + while (link > links->d && link[0].inum == link[-1].inum) + --link; + + for (; link < links->d + links->nr && link->inum == inum; link++) + if (ref_visible(c, s, snapshot, link->snapshot)) { + link->count++; + if (link->snapshot >= snapshot) + break; + } +} + +noinline_for_stack +static int check_nlinks_find_hardlinks(struct bch_fs *c, + struct nlink_table *t, + u64 start, u64 *end) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked u; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_inodes, + POS(0, start), + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + if (!bkey_is_inode(k.k)) + continue; + + /* Should never fail, checked by bch2_inode_invalid: */ + BUG_ON(bch2_inode_unpack(k, &u)); + + /* + * Backpointer and directory structure checks are sufficient for + * 
directories, since they can't have hardlinks: + */ + if (S_ISDIR(u.bi_mode)) + continue; + + if (!u.bi_nlink) + continue; + + ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot); + if (ret) { + *end = k.k->p.offset; + ret = 0; + break; + } + + } + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (ret) + bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + + return ret; +} + +noinline_for_stack +static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links, + u64 range_start, u64 range_end) +{ + struct btree_trans trans; + struct snapshots_seen s; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_dirent d; + int ret; + + snapshots_seen_init(&s); + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for_each_btree_key(&trans, iter, BTREE_ID_dirents, POS_MIN, + BTREE_ITER_INTENT| + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p); + if (ret) + break; + + switch (k.k->type) { + case KEY_TYPE_dirent: + d = bkey_s_c_to_dirent(k); + + if (d.v->d_type != DT_DIR && + d.v->d_type != DT_SUBVOL) + inc_link(c, &s, links, range_start, range_end, + le64_to_cpu(d.v->d_inum), + bch2_snapshot_equiv(c, d.k->p.snapshot)); + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + bch_err(c, "error in fsck: btree error %i while walking dirents", ret); + + bch2_trans_exit(&trans); + snapshots_seen_exit(&s); + return ret; +} + +static int check_nlinks_update_inode(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k, + struct nlink_table *links, + size_t *idx, u64 range_end) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct nlink *link = &links->d[*idx]; + int ret = 0; + + if (k.k->p.offset >= range_end) + return 1; + + if (!bkey_is_inode(k.k)) + return 0; + + BUG_ON(bch2_inode_unpack(k, &u)); + + if (S_ISDIR(u.bi_mode)) + return 0; + + if (!u.bi_nlink) + return 0; + + while ((cmp_int(link->inum, k.k->p.offset) ?: + cmp_int(link->snapshot, k.k->p.snapshot)) < 0) { + BUG_ON(*idx == links->nr); + link = &links->d[++*idx]; + } + + if (fsck_err_on(bch2_inode_nlink_get(&u) != link->count, c, + "inode %llu type %s has wrong i_nlink (%u, should be %u)", + u.bi_inum, bch2_d_types[mode_to_type(u.bi_mode)], + bch2_inode_nlink_get(&u), link->count)) { + bch2_inode_nlink_set(&u, link->count); + ret = __write_inode(trans, &u, k.k->p.snapshot); + } +fsck_err: + return ret; +} + +noinline_for_stack +static int check_nlinks_update_hardlinks(struct bch_fs *c, + struct nlink_table *links, + u64 range_start, u64 range_end) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + size_t idx = 0; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_inodes, + POS(0, range_start), + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_nlinks_update_inode(&trans, &iter, k, links, &idx, range_end)); + + bch2_trans_exit(&trans); + + if (ret < 0) { + bch_err(c, "error in fsck: btree error %i while walking inodes", ret); + return ret; + } + + return 0; +} + +int bch2_check_nlinks(struct bch_fs *c) +{ + struct nlink_table links = { 0 }; + u64 this_iter_range_start, next_iter_range_start = 0; + int ret = 0; + + do { + this_iter_range_start = next_iter_range_start; + next_iter_range_start = U64_MAX; + + ret = check_nlinks_find_hardlinks(c, &links, + this_iter_range_start, 
+ &next_iter_range_start); + + ret = check_nlinks_walk_dirents(c, &links, + this_iter_range_start, + next_iter_range_start); + if (ret) + break; + + ret = check_nlinks_update_hardlinks(c, &links, + this_iter_range_start, + next_iter_range_start); + if (ret) + break; + + links.nr = 0; + } while (next_iter_range_start != U64_MAX); + + kvfree(links.d); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p; + struct bkey_i_reflink_p *u; + int ret; + + if (k.k->type != KEY_TYPE_reflink_p) + return 0; + + p = bkey_s_c_to_reflink_p(k); + + if (!p.v->front_pad && !p.v->back_pad) + return 0; + + u = bch2_trans_kmalloc(trans, sizeof(*u)); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; + + bkey_reassemble(&u->k_i, k); + u->v.front_pad = 0; + u->v.back_pad = 0; + + return bch2_trans_update(trans, iter, &u->k_i, BTREE_TRIGGER_NORUN); +} + +int bch2_fix_reflink_p(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix) + return 0; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_INTENT|BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + fix_reflink_p_key(&trans, &iter, k))); + + if (ret) + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h new file mode 100644 index 000000000..90c87b508 --- /dev/null +++ b/fs/bcachefs/fsck.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_FSCK_H +#define _BCACHEFS_FSCK_H + +int bch2_check_inodes(struct bch_fs *); +int bch2_check_extents(struct bch_fs *); +int bch2_check_dirents(struct bch_fs *); +int bch2_check_xattrs(struct bch_fs *); +int bch2_check_root(struct bch_fs *); +int bch2_check_directory_structure(struct bch_fs *); +int bch2_check_nlinks(struct bch_fs *); +int bch2_fix_reflink_p(struct bch_fs *); + +#endif /* _BCACHEFS_FSCK_H */ diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c new file mode 100644 index 000000000..fa435d865 --- /dev/null +++ b/fs/bcachefs/inode.c @@ -0,0 +1,872 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "buckets.h" +#include "error.h" +#include "extents.h" +#include "extent_update.h" +#include "inode.h" +#include "str_hash.h" +#include "subvolume.h" +#include "varint.h" + +#include + +#include + +const char * const bch2_inode_opts[] = { +#define x(name, ...) #name, + BCH_INODE_OPTS() +#undef x + NULL, +}; + +static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; + +static int inode_decode_field(const u8 *in, const u8 *end, + u64 out[2], unsigned *out_bits) +{ + __be64 be[2] = { 0, 0 }; + unsigned bytes, shift; + u8 *p; + + if (in >= end) + return -1; + + if (!*in) + return -1; + + /* + * position of highest set bit indicates number of bytes: + * shift = number of bits to remove in high byte: + */ + shift = 8 - __fls(*in); /* 1 <= shift <= 8 */ + bytes = byte_table[shift - 1]; + + if (in + bytes > end) + return -1; + + p = (u8 *) be + 16 - bytes; + memcpy(p, in, bytes); + *p ^= (1 << 8) >> shift; + + out[0] = be64_to_cpu(be[0]); + out[1] = be64_to_cpu(be[1]); + *out_bits = out[0] ? 
64 + fls64(out[0]) : fls64(out[1]); + + return bytes; +} + +static inline void bch2_inode_pack_inlined(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + struct bkey_i_inode_v3 *k = &packed->inode; + u8 *out = k->v.fields; + u8 *end = (void *) &packed[1]; + u8 *last_nonzero_field = out; + unsigned nr_fields = 0, last_nonzero_fieldnr = 0; + unsigned bytes; + int ret; + + bkey_inode_v3_init(&packed->inode.k_i); + packed->inode.k.p.offset = inode->bi_inum; + packed->inode.v.bi_journal_seq = cpu_to_le64(inode->bi_journal_seq); + packed->inode.v.bi_hash_seed = inode->bi_hash_seed; + packed->inode.v.bi_flags = cpu_to_le64(inode->bi_flags); + packed->inode.v.bi_sectors = cpu_to_le64(inode->bi_sectors); + packed->inode.v.bi_size = cpu_to_le64(inode->bi_size); + packed->inode.v.bi_version = cpu_to_le64(inode->bi_version); + SET_INODEv3_MODE(&packed->inode.v, inode->bi_mode); + SET_INODEv3_FIELDS_START(&packed->inode.v, INODEv3_FIELDS_START_CUR); + + +#define x(_name, _bits) \ + nr_fields++; \ + \ + if (inode->_name) { \ + ret = bch2_varint_encode_fast(out, inode->_name); \ + out += ret; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + \ + last_nonzero_field = out; \ + last_nonzero_fieldnr = nr_fields; \ + } else { \ + *out++ = 0; \ + \ + if (_bits > 64) \ + *out++ = 0; \ + } + + BCH_INODE_FIELDS_v3() +#undef x + BUG_ON(out > end); + + out = last_nonzero_field; + nr_fields = last_nonzero_fieldnr; + + bytes = out - (u8 *) &packed->inode.v; + set_bkey_val_bytes(&packed->inode.k, bytes); + memset_u64s_tail(&packed->inode.v, 0, bytes); + + SET_INODEv3_NR_FIELDS(&k->v, nr_fields); + + if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) { + struct bch_inode_unpacked unpacked; + + int ret = bch2_inode_unpack(bkey_i_to_s_c(&packed->inode.k_i), + &unpacked); + BUG_ON(ret); + BUG_ON(unpacked.bi_inum != inode->bi_inum); + BUG_ON(unpacked.bi_hash_seed != inode->bi_hash_seed); + BUG_ON(unpacked.bi_sectors != inode->bi_sectors); + BUG_ON(unpacked.bi_size != inode->bi_size); + BUG_ON(unpacked.bi_version != inode->bi_version); + BUG_ON(unpacked.bi_mode != inode->bi_mode); + +#define x(_name, _bits) if (unpacked._name != inode->_name) \ + panic("unpacked %llu should be %llu", \ + (u64) unpacked._name, (u64) inode->_name); + BCH_INODE_FIELDS_v3() +#undef x + } +} + +void bch2_inode_pack(struct bkey_inode_buf *packed, + const struct bch_inode_unpacked *inode) +{ + bch2_inode_pack_inlined(packed, inode); +} + +static noinline int bch2_inode_unpack_v1(struct bkey_s_c_inode inode, + struct bch_inode_unpacked *unpacked) +{ + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + u64 field[2]; + unsigned fieldnr = 0, field_bits; + int ret; + +#define x(_name, _bits) \ + if (fieldnr++ == INODE_NR_FIELDS(inode.v)) { \ + unsigned offset = offsetof(struct bch_inode_unpacked, _name);\ + memset((void *) unpacked + offset, 0, \ + sizeof(*unpacked) - offset); \ + return 0; \ + } \ + \ + ret = inode_decode_field(in, end, field, &field_bits); \ + if (ret < 0) \ + return ret; \ + \ + if (field_bits > sizeof(unpacked->_name) * 8) \ + return -1; \ + \ + unpacked->_name = field[1]; \ + in += ret; + + BCH_INODE_FIELDS_v2() +#undef x + + /* XXX: signal if there were more fields than expected? 
*/ + return 0; +} + +static int bch2_inode_unpack_v2(struct bch_inode_unpacked *unpacked, + const u8 *in, const u8 *end, + unsigned nr_fields) +{ + unsigned fieldnr = 0; + int ret; + u64 v[2]; + +#define x(_name, _bits) \ + if (fieldnr < nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode_fast(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS_v2() +#undef x + + /* XXX: signal if there were more fields than expected? */ + return 0; +} + +static int bch2_inode_unpack_v3(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + const u8 *in = inode.v->fields; + const u8 *end = bkey_val_end(inode); + unsigned nr_fields = INODEv3_NR_FIELDS(inode.v); + unsigned fieldnr = 0; + int ret; + u64 v[2]; + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); + unpacked->bi_sectors = le64_to_cpu(inode.v->bi_sectors); + unpacked->bi_size = le64_to_cpu(inode.v->bi_size); + unpacked->bi_version = le64_to_cpu(inode.v->bi_version); + unpacked->bi_mode = INODEv3_MODE(inode.v); + +#define x(_name, _bits) \ + if (fieldnr < nr_fields) { \ + ret = bch2_varint_decode_fast(in, end, &v[0]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + \ + if (_bits > 64) { \ + ret = bch2_varint_decode_fast(in, end, &v[1]); \ + if (ret < 0) \ + return ret; \ + in += ret; \ + } else { \ + v[1] = 0; \ + } \ + } else { \ + v[0] = v[1] = 0; \ + } \ + \ + unpacked->_name = v[0]; \ + if (v[1] || v[0] != unpacked->_name) \ + return -1; \ + fieldnr++; + + BCH_INODE_FIELDS_v3() +#undef x + + /* XXX: signal if there were more fields than expected? 
*/ + return 0; +} + +static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + memset(unpacked, 0, sizeof(*unpacked)); + + switch (k.k->type) { + case KEY_TYPE_inode: { + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= 0; + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + if (INODE_NEW_VARINT(inode.v)) { + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODE_NR_FIELDS(inode.v)); + } else { + return bch2_inode_unpack_v1(inode, unpacked); + } + break; + } + case KEY_TYPE_inode_v2: { + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + + unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); + unpacked->bi_hash_seed = inode.v->bi_hash_seed; + unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); + unpacked->bi_mode = le16_to_cpu(inode.v->bi_mode); + + return bch2_inode_unpack_v2(unpacked, inode.v->fields, + bkey_val_end(inode), + INODEv2_NR_FIELDS(inode.v)); + } + default: + BUG(); + } +} + +int bch2_inode_unpack(struct bkey_s_c k, + struct bch_inode_unpacked *unpacked) +{ + if (likely(k.k->type == KEY_TYPE_inode_v3)) + return bch2_inode_unpack_v3(k, unpacked); + return bch2_inode_unpack_slowpath(k, unpacked); +} + +int bch2_inode_peek(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode, + subvol_inum inum, unsigned flags) +{ + struct bkey_s_c k; + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + k = bch2_bkey_get_iter(trans, iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + flags|BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + return ret; + + ret = bkey_is_inode(k.k) ? 
0 : -BCH_ERR_ENOENT_inode; + if (ret) + goto err; + + ret = bch2_inode_unpack(k, inode); + if (ret) + goto err; + + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + +int bch2_inode_write(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode) +{ + struct bkey_inode_buf *inode_p; + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return PTR_ERR(inode_p); + + bch2_inode_pack_inlined(inode_p, inode); + inode_p->inode.k.p.snapshot = iter->snapshot; + return bch2_trans_update(trans, iter, &inode_p->inode.k_i, 0); +} + +struct bkey_i *bch2_inode_to_v3(struct btree_trans *trans, struct bkey_i *k) +{ + struct bch_inode_unpacked u; + struct bkey_inode_buf *inode_p; + int ret; + + if (!bkey_is_inode(&k->k)) + return ERR_PTR(-ENOENT); + + inode_p = bch2_trans_kmalloc(trans, sizeof(*inode_p)); + if (IS_ERR(inode_p)) + return ERR_CAST(inode_p); + + ret = bch2_inode_unpack(bkey_i_to_s_c(k), &u); + if (ret) + return ERR_PTR(ret); + + bch2_inode_pack(inode_p, &u); + return &inode_p->inode.k_i; +} + +static int __bch2_inode_invalid(struct bkey_s_c k, struct printbuf *err) +{ + struct bch_inode_unpacked unpacked; + + if (k.k->p.inode) { + prt_printf(err, "nonzero k.p.inode"); + return -BCH_ERR_invalid_bkey; + } + + if (k.k->p.offset < BLOCKDEV_INODE_MAX) { + prt_printf(err, "fs inode in blockdev range"); + return -BCH_ERR_invalid_bkey; + } + + if (bch2_inode_unpack(k, &unpacked)) { + prt_printf(err, "invalid variable length fields"); + return -BCH_ERR_invalid_bkey; + } + + if (unpacked.bi_data_checksum >= BCH_CSUM_OPT_NR + 1) { + prt_printf(err, "invalid data checksum type (%u >= %u", + unpacked.bi_data_checksum, BCH_CSUM_OPT_NR + 1); + return -BCH_ERR_invalid_bkey; + } + + if (unpacked.bi_compression >= BCH_COMPRESSION_OPT_NR + 1) { + prt_printf(err, "invalid data checksum type (%u >= %u)", + unpacked.bi_compression, BCH_COMPRESSION_OPT_NR + 1); + return -BCH_ERR_invalid_bkey; + } + + if ((unpacked.bi_flags & BCH_INODE_UNLINKED) && + unpacked.bi_nlink != 0) { + prt_printf(err, "flagged as unlinked but bi_nlink != 0"); + return -BCH_ERR_invalid_bkey; + } + + if (unpacked.bi_subvol && !S_ISDIR(unpacked.bi_mode)) { + prt_printf(err, "subvolume root but not a directory"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +int bch2_inode_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); + + if (INODE_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { + prt_printf(err, "invalid str hash type (%llu >= %u)", + INODE_STR_HASH(inode.v), BCH_STR_HASH_NR); + return -BCH_ERR_invalid_bkey; + } + + return __bch2_inode_invalid(k, err); +} + +int bch2_inode_v2_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); + + if (INODEv2_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { + prt_printf(err, "invalid str hash type (%llu >= %u)", + INODEv2_STR_HASH(inode.v), BCH_STR_HASH_NR); + return -BCH_ERR_invalid_bkey; + } + + return __bch2_inode_invalid(k, err); +} + +int bch2_inode_v3_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_inode_v3 inode = bkey_s_c_to_inode_v3(k); + + if (INODEv3_FIELDS_START(inode.v) < INODEv3_FIELDS_START_INITIAL || + INODEv3_FIELDS_START(inode.v) > bkey_val_u64s(inode.k)) { + prt_printf(err, "invalid fields_start (got 
%llu, min %u max %zu)", + INODEv3_FIELDS_START(inode.v), + INODEv3_FIELDS_START_INITIAL, + bkey_val_u64s(inode.k)); + return -BCH_ERR_invalid_bkey; + } + + if (INODEv3_STR_HASH(inode.v) >= BCH_STR_HASH_NR) { + prt_printf(err, "invalid str hash type (%llu >= %u)", + INODEv3_STR_HASH(inode.v), BCH_STR_HASH_NR); + return -BCH_ERR_invalid_bkey; + } + + return __bch2_inode_invalid(k, err); +} + +static void __bch2_inode_unpacked_to_text(struct printbuf *out, + struct bch_inode_unpacked *inode) +{ + prt_printf(out, "mode %o flags %x journal_seq %llu bi_size %llu bi_sectors %llu bi_version %llu", + inode->bi_mode, inode->bi_flags, + inode->bi_journal_seq, + inode->bi_size, + inode->bi_sectors, + inode->bi_version); + +#define x(_name, _bits) \ + prt_printf(out, " "#_name " %llu", (u64) inode->_name); + BCH_INODE_FIELDS_v3() +#undef x +} + +void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode) +{ + prt_printf(out, "inum: %llu ", inode->bi_inum); + __bch2_inode_unpacked_to_text(out, inode); +} + +void bch2_inode_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_inode_unpacked inode; + + if (bch2_inode_unpack(k, &inode)) { + prt_printf(out, "(unpack error)"); + return; + } + + __bch2_inode_unpacked_to_text(out, &inode); +} + +int bch2_inode_generation_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (k.k->p.inode) { + prt_printf(err, "nonzero k.p.inode"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_inode_generation_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_inode_generation gen = bkey_s_c_to_inode_generation(k); + + prt_printf(out, "generation: %u", le32_to_cpu(gen.v->bi_generation)); +} + +void bch2_inode_init_early(struct bch_fs *c, + struct bch_inode_unpacked *inode_u) +{ + enum bch_str_hash_type str_hash = + bch2_str_hash_opt_to_type(c, c->opts.str_hash); + + memset(inode_u, 0, sizeof(*inode_u)); + + /* ick */ + inode_u->bi_flags |= str_hash << INODE_STR_HASH_OFFSET; + get_random_bytes(&inode_u->bi_hash_seed, + sizeof(inode_u->bi_hash_seed)); +} + +void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ + inode_u->bi_mode = mode; + inode_u->bi_uid = uid; + inode_u->bi_gid = gid; + inode_u->bi_dev = rdev; + inode_u->bi_atime = now; + inode_u->bi_mtime = now; + inode_u->bi_ctime = now; + inode_u->bi_otime = now; + + if (parent && parent->bi_mode & S_ISGID) { + inode_u->bi_gid = parent->bi_gid; + if (S_ISDIR(mode)) + inode_u->bi_mode |= S_ISGID; + } + + if (parent) { +#define x(_name, ...) 
inode_u->bi_##_name = parent->bi_##_name; + BCH_INODE_OPTS() +#undef x + } +} + +void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, + uid_t uid, gid_t gid, umode_t mode, dev_t rdev, + struct bch_inode_unpacked *parent) +{ + bch2_inode_init_early(c, inode_u); + bch2_inode_init_late(inode_u, bch2_current_time(c), + uid, gid, mode, rdev, parent); +} + +static inline u32 bkey_generation(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_inode: + case KEY_TYPE_inode_v2: + BUG(); + case KEY_TYPE_inode_generation: + return le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); + default: + return 0; + } +} + +/* + * This just finds an empty slot: + */ +int bch2_inode_create(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_inode_unpacked *inode_u, + u32 snapshot, u64 cpu) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c k; + u64 min, max, start, pos, *hint; + int ret = 0; + unsigned bits = (c->opts.inodes_32bit ? 31 : 63); + + if (c->opts.shard_inode_numbers) { + bits -= c->inode_shard_bits; + + min = (cpu << bits); + max = (cpu << bits) | ~(ULLONG_MAX << bits); + + min = max_t(u64, min, BLOCKDEV_INODE_MAX); + hint = c->unused_inode_hints + cpu; + } else { + min = BLOCKDEV_INODE_MAX; + max = ~(ULLONG_MAX << bits); + hint = c->unused_inode_hints; + } + + start = READ_ONCE(*hint); + + if (start >= max || start < min) + start = min; + + pos = start; + bch2_trans_iter_init(trans, iter, BTREE_ID_inodes, POS(0, pos), + BTREE_ITER_ALL_SNAPSHOTS| + BTREE_ITER_INTENT); +again: + while ((k = bch2_btree_iter_peek(iter)).k && + !(ret = bkey_err(k)) && + bkey_lt(k.k->p, POS(0, max))) { + if (pos < iter->pos.offset) + goto found_slot; + + /* + * We don't need to iterate over keys in every snapshot once + * we've found just one: + */ + pos = iter->pos.offset + 1; + bch2_btree_iter_set_pos(iter, POS(0, pos)); + } + + if (!ret && pos < max) + goto found_slot; + + if (!ret && start == min) + ret = -BCH_ERR_ENOSPC_inode_create; + + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ret; + } + + /* Retry from start */ + pos = start = min; + bch2_btree_iter_set_pos(iter, POS(0, pos)); + goto again; +found_slot: + bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) { + bch2_trans_iter_exit(trans, iter); + return ret; + } + + *hint = k.k->p.offset; + inode_u->bi_inum = k.k->p.offset; + inode_u->bi_generation = bkey_generation(k); + return 0; +} + +static int bch2_inode_delete_keys(struct btree_trans *trans, + subvol_inum inum, enum btree_id id) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i delete; + u32 snapshot; + int ret = 0; + + /* + * We're never going to be deleting partial extents, no need to use an + * extent iterator: + */ + bch2_trans_iter_init(trans, &iter, id, POS(inum.inum, 0), + BTREE_ITER_INTENT|BTREE_ITER_NOT_EXTENTS); + + while (1) { + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_btree_iter_set_snapshot(&iter, snapshot); + + k = bch2_btree_iter_peek_upto(&iter, POS(inum.inum, U64_MAX)); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k) + break; + + bkey_init(&delete.k); + delete.k.p = iter.pos; + + ret = bch2_trans_update(trans, &iter, &delete, 0) ?: + bch2_trans_commit(trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + break; + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + 
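For reference, a minimal userspace sketch (not part of the patch) of the inode-number sharding arithmetic used by bch2_inode_create() above: with 31-bit inode numbers (the inodes_32bit case) and inode_shard_bits reserved for the CPU index, each CPU gets a disjoint [min, max] range to allocate from. The shard width and CPU index below are made-up example values, and the clamp against BLOCKDEV_INODE_MAX is omitted for brevity.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned bits = 31;		/* inodes_32bit: 31 usable bits */
	unsigned shard_bits = 4;	/* assumed value of c->inode_shard_bits */
	uint64_t cpu = 3;		/* example CPU index */

	bits -= shard_bits;
	uint64_t min = cpu << bits;				/* first inum owned by this shard */
	uint64_t max = (cpu << bits) | ~(~0ULL << bits);	/* last inum owned by this shard */

	printf("shard %llu owns inode numbers %llu..%llu\n",
	       (unsigned long long) cpu,
	       (unsigned long long) min,
	       (unsigned long long) max);
	return 0;
}

When the per-shard hint runs past max, bch2_inode_create() retries from min, and only returns ENOSPC once a pass starting at min finds no free slot.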
+int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) +{ + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_i_inode_generation delete; + struct bch_inode_unpacked inode_u; + struct bkey_s_c k; + u32 snapshot; + int ret; + + bch2_trans_init(&trans, c, 0, 1024); + + /* + * If this was a directory, there shouldn't be any real dirents left - + * but there could be whiteouts (from hash collisions) that we should + * delete: + * + * XXX: the dirent could ideally would delete whiteouts when they're no + * longer needed + */ + ret = bch2_inode_delete_keys(&trans, inum, BTREE_ID_extents) ?: + bch2_inode_delete_keys(&trans, inum, BTREE_ID_xattrs) ?: + bch2_inode_delete_keys(&trans, inum, BTREE_ID_dirents); + if (ret) + goto err; +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + k = bch2_bkey_get_iter(&trans, &iter, BTREE_ID_inodes, + SPOS(0, inum.inum, snapshot), + BTREE_ITER_INTENT|BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + goto err; + + if (!bkey_is_inode(k.k)) { + bch2_fs_inconsistent(trans.c, + "inode %llu:%u not found when deleting", + inum.inum, snapshot); + ret = -EIO; + goto err; + } + + bch2_inode_unpack(k, &inode_u); + + bkey_inode_generation_init(&delete.k_i); + delete.k.p = iter.pos; + delete.v.bi_generation = cpu_to_le32(inode_u.bi_generation + 1); + + ret = bch2_trans_update(&trans, &iter, &delete.k_i, 0) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); +err: + bch2_trans_iter_exit(&trans, &iter); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + return ret; +} + +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + return bch2_trans_do(c, NULL, NULL, 0, + bch2_inode_find_by_inum_trans(&trans, inum, inode)); +} + +int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) +{ + if (bi->bi_flags & BCH_INODE_UNLINKED) + bi->bi_flags &= ~BCH_INODE_UNLINKED; + else { + if (bi->bi_nlink == U32_MAX) + return -EINVAL; + + bi->bi_nlink++; + } + + return 0; +} + +void bch2_inode_nlink_dec(struct btree_trans *trans, struct bch_inode_unpacked *bi) +{ + if (bi->bi_nlink && (bi->bi_flags & BCH_INODE_UNLINKED)) { + bch2_trans_inconsistent(trans, "inode %llu unlinked but link count nonzero", + bi->bi_inum); + return; + } + + if (bi->bi_flags & BCH_INODE_UNLINKED) { + bch2_trans_inconsistent(trans, "inode %llu link count underflow", bi->bi_inum); + return; + } + + if (bi->bi_nlink) + bi->bi_nlink--; + else + bi->bi_flags |= BCH_INODE_UNLINKED; +} + +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *inode) +{ + struct bch_opts ret = { 0 }; +#define x(_name, _bits) \ + if (inode->bi_##_name) \ + opt_set(ret, _name, inode->bi_##_name - 1); + BCH_INODE_OPTS() +#undef x + return ret; +} + +void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, + struct bch_inode_unpacked *inode) +{ +#define x(_name, _bits) opts->_name = inode_opt_get(c, inode, _name); + BCH_INODE_OPTS() +#undef x + + if (opts->nocow) + opts->compression = opts->background_compression = opts->data_checksum = opts->erasure_code = 0; +} diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h new 
file mode 100644 index 000000000..8f9be5e58 --- /dev/null +++ b/fs/bcachefs/inode.h @@ -0,0 +1,196 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_INODE_H +#define _BCACHEFS_INODE_H + +#include "bkey.h" +#include "opts.h" + +enum bkey_invalid_flags; +extern const char * const bch2_inode_opts[]; + +int bch2_inode_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_v2_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_inode_v3_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode ((struct bkey_ops) { \ + .key_invalid = bch2_inode_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 16, \ +}) + +#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \ + .key_invalid = bch2_inode_v2_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 32, \ +}) + +#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \ + .key_invalid = bch2_inode_v3_invalid, \ + .val_to_text = bch2_inode_to_text, \ + .trans_trigger = bch2_trans_mark_inode, \ + .atomic_trigger = bch2_mark_inode, \ + .min_val_size = 48, \ +}) + +static inline bool bkey_is_inode(const struct bkey *k) +{ + return k->type == KEY_TYPE_inode || + k->type == KEY_TYPE_inode_v2 || + k->type == KEY_TYPE_inode_v3; +} + +int bch2_inode_generation_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_inode_generation_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_inode_generation ((struct bkey_ops) { \ + .key_invalid = bch2_inode_generation_invalid, \ + .val_to_text = bch2_inode_generation_to_text, \ + .min_val_size = 8, \ +}) + +#if 0 +typedef struct { + u64 lo; + u32 hi; +} __packed __aligned(4) u96; +#endif +typedef u64 u96; + +struct bch_inode_unpacked { + u64 bi_inum; + u64 bi_journal_seq; + __le64 bi_hash_seed; + u64 bi_size; + u64 bi_sectors; + u64 bi_version; + u32 bi_flags; + u16 bi_mode; + +#define x(_name, _bits) u##_bits _name; + BCH_INODE_FIELDS_v3() +#undef x +}; + +struct bkey_inode_buf { + struct bkey_i_inode_v3 inode; + +#define x(_name, _bits) + 8 + _bits / 8 + u8 _pad[0 + BCH_INODE_FIELDS_v3()]; +#undef x +} __packed __aligned(8); + +void bch2_inode_pack(struct bkey_inode_buf *, const struct bch_inode_unpacked *); +int bch2_inode_unpack(struct bkey_s_c, struct bch_inode_unpacked *); +struct bkey_i *bch2_inode_to_v3(struct btree_trans *, struct bkey_i *); + +void bch2_inode_unpacked_to_text(struct printbuf *, struct bch_inode_unpacked *); + +int bch2_inode_peek(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, subvol_inum, unsigned); +int bch2_inode_write(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *); + +void bch2_inode_init_early(struct bch_fs *, + struct bch_inode_unpacked *); +void bch2_inode_init_late(struct bch_inode_unpacked *, u64, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); +void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, + uid_t, gid_t, umode_t, dev_t, + struct bch_inode_unpacked *); + +int bch2_inode_create(struct btree_trans *, struct btree_iter *, + struct bch_inode_unpacked *, u32, u64); 
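The bkey_inode_buf padding above relies on the same x-macro pattern used throughout this header: BCH_INODE_FIELDS_v3() names each field exactly once, and redefining x() expands that single list into struct members, pack/unpack code, and a worst-case size. A self-contained sketch of the idiom, with invented field names:

#include <stdio.h>

/* the field list is written exactly once */
#define EXAMPLE_FIELDS()		\
	x(atime,	64)		\
	x(generation,	32)		\
	x(nlink,	32)

/* expansion 1: struct members (simplified to one integer type) */
struct example_unpacked {
#define x(_name, _bits)	unsigned long long _name;
	EXAMPLE_FIELDS()
#undef x
};

/* expansion 2: worst-case encoded size, like the bkey_inode_buf pad */
#define x(_name, _bits)	+ 8 + _bits / 8
static const int example_max_bytes = 0 EXAMPLE_FIELDS();
#undef x

int main(void)
{
	printf("fields: %zu bytes unpacked, at most %d bytes encoded\n",
	       sizeof(struct example_unpacked), example_max_bytes);
	return 0;
}

Adding a field then means touching only the list; every expansion stays in sync automatically.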
+ +int bch2_inode_rm(struct bch_fs *, subvol_inum); + +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); + +#define inode_opt_get(_c, _inode, _name) \ + ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) + +static inline void bch2_inode_opt_set(struct bch_inode_unpacked *inode, + enum inode_opt_id id, u64 v) +{ + switch (id) { +#define x(_name, ...) \ + case Inode_opt_##_name: \ + inode->bi_##_name = v; \ + break; + BCH_INODE_OPTS() +#undef x + default: + BUG(); + } +} + +static inline u64 bch2_inode_opt_get(struct bch_inode_unpacked *inode, + enum inode_opt_id id) +{ + switch (id) { +#define x(_name, ...) \ + case Inode_opt_##_name: \ + return inode->bi_##_name; + BCH_INODE_OPTS() +#undef x + default: + BUG(); + } +} + +static inline u8 mode_to_type(umode_t mode) +{ + return (mode >> 12) & 15; +} + +static inline u8 inode_d_type(struct bch_inode_unpacked *inode) +{ + return inode->bi_subvol ? DT_SUBVOL : mode_to_type(inode->bi_mode); +} + +/* i_nlink: */ + +static inline unsigned nlink_bias(umode_t mode) +{ + return S_ISDIR(mode) ? 2 : 1; +} + +static inline unsigned bch2_inode_nlink_get(struct bch_inode_unpacked *bi) +{ + return bi->bi_flags & BCH_INODE_UNLINKED + ? 0 + : bi->bi_nlink + nlink_bias(bi->bi_mode); +} + +static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, + unsigned nlink) +{ + if (nlink) { + bi->bi_nlink = nlink - nlink_bias(bi->bi_mode); + bi->bi_flags &= ~BCH_INODE_UNLINKED; + } else { + bi->bi_nlink = 0; + bi->bi_flags |= BCH_INODE_UNLINKED; + } +} + +int bch2_inode_nlink_inc(struct bch_inode_unpacked *); +void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); + +struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); +void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, + struct bch_inode_unpacked *); + +#endif /* _BCACHEFS_INODE_H */ diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c new file mode 100644 index 000000000..8604df80a --- /dev/null +++ b/fs/bcachefs/io.c @@ -0,0 +1,3056 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some low level IO code, and hacks for various block layer limitations + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "bkey_buf.h" +#include "bset.h" +#include "btree_update.h" +#include "buckets.h" +#include "checksum.h" +#include "compress.h" +#include "clock.h" +#include "data_update.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "extent_update.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "nocow_locking.h" +#include "rebalance.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include +#include + +const char *bch2_blk_status_to_str(blk_status_t status) +{ + if (status == BLK_STS_REMOVED) + return "device removed"; + return blk_status_to_str(status); +} + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + const struct bch_devs_mask *devs; + unsigned d, nr = 0, total = 0; + u64 now = local_clock(), last; + s64 congested; + struct bch_dev *ca; + + if (!target) + return false; + + rcu_read_lock(); + devs = bch2_target_to_mask(c, target) ?: + &c->rw_devs[BCH_DATA_user]; + + for_each_set_bit(d, devs->d, BCH_SB_MEMBERS_MAX) { + ca = rcu_dereference(c->devs[d]); + if (!ca) + continue; + + congested = atomic_read(&ca->congested); + last = READ_ONCE(ca->congested_last); + if (time_after64(now, last)) + congested -= (now - last) >> 12; + + total += max(congested, 0LL); + nr++; + } + rcu_read_unlock(); + + return bch2_rand_range(nr * CONGESTED_MAX) < total; +} + +static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, + u64 now, int rw) +{ + u64 latency_capable = + ca->io_latency[rw].quantiles.entries[QUANTILE_IDX(1)].m; + /* ideally we'd be taking into account the device's variance here: */ + u64 latency_threshold = latency_capable << (rw == READ ? 2 : 3); + s64 latency_over = io_latency - latency_threshold; + + if (latency_threshold && latency_over > 0) { + /* + * bump up congested by approximately latency_over * 4 / + * latency_threshold - we don't need much accuracy here so don't + * bother with the divide: + */ + if (atomic_read(&ca->congested) < CONGESTED_MAX) + atomic_add(latency_over >> + max_t(int, ilog2(latency_threshold) - 2, 0), + &ca->congested); + + ca->congested_last = now; + } else if (atomic_read(&ca->congested) > 0) { + atomic_dec(&ca->congested); + } +} + +void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) +{ + atomic64_t *latency = &ca->cur_latency[rw]; + u64 now = local_clock(); + u64 io_latency = time_after64(now, submit_time) + ? 
now - submit_time + : 0; + u64 old, new, v = atomic64_read(latency); + + do { + old = v; + + /* + * If the io latency was reasonably close to the current + * latency, skip doing the update and atomic operation - most of + * the time: + */ + if (abs((int) (old - io_latency)) < (old >> 1) && + now & ~(~0U << 5)) + break; + + new = ewma_add(old, io_latency, 5); + } while ((v = atomic64_cmpxchg(latency, old, new)) != old); + + bch2_congested_acct(ca, io_latency, now, rw); + + __bch2_time_stats_update(&ca->io_latency[rw], submit_time, now); +} + +#else + +static bool bch2_target_congested(struct bch_fs *c, u16 target) +{ + return false; +} + +#endif + +/* Allocate, free from mempool: */ + +void bch2_bio_free_pages_pool(struct bch_fs *c, struct bio *bio) +{ + struct bvec_iter_all iter; + struct bio_vec *bv; + + bio_for_each_segment_all(bv, bio, iter) + if (bv->bv_page != ZERO_PAGE(0)) + mempool_free(bv->bv_page, &c->bio_bounce_pages); + bio->bi_vcnt = 0; +} + +static struct page *__bio_alloc_page_pool(struct bch_fs *c, bool *using_mempool) +{ + struct page *page; + + if (likely(!*using_mempool)) { + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) { + mutex_lock(&c->bio_bounce_pages_lock); + *using_mempool = true; + goto pool_alloc; + + } + } else { +pool_alloc: + page = mempool_alloc(&c->bio_bounce_pages, GFP_NOFS); + } + + return page; +} + +void bch2_bio_alloc_pages_pool(struct bch_fs *c, struct bio *bio, + size_t size) +{ + bool using_mempool = false; + + while (size) { + struct page *page = __bio_alloc_page_pool(c, &using_mempool); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + BUG_ON(!bio_add_page(bio, page, len, 0)); + size -= len; + } + + if (using_mempool) + mutex_unlock(&c->bio_bounce_pages_lock); +} + +/* Extent update path: */ + +int bch2_sum_sector_overwrites(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *new, + bool *usage_increasing, + s64 *i_sectors_delta, + s64 *disk_sectors_delta) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c old; + unsigned new_replicas = bch2_bkey_replicas(c, bkey_i_to_s_c(new)); + bool new_compressed = bch2_bkey_sectors_compressed(bkey_i_to_s_c(new)); + int ret = 0; + + *usage_increasing = false; + *i_sectors_delta = 0; + *disk_sectors_delta = 0; + + bch2_trans_copy_iter(&iter, extent_iter); + + for_each_btree_key_upto_continue_norestart(iter, + new->k.p, BTREE_ITER_SLOTS, old, ret) { + s64 sectors = min(new->k.p.offset, old.k->p.offset) - + max(bkey_start_offset(&new->k), + bkey_start_offset(old.k)); + + *i_sectors_delta += sectors * + (bkey_extent_is_allocation(&new->k) - + bkey_extent_is_allocation(old.k)); + + *disk_sectors_delta += sectors * bch2_bkey_nr_ptrs_allocated(bkey_i_to_s_c(new)); + *disk_sectors_delta -= new->k.p.snapshot == old.k->p.snapshot + ? 
sectors * bch2_bkey_nr_ptrs_fully_allocated(old) + : 0; + + if (!*usage_increasing && + (new->k.p.snapshot != old.k->p.snapshot || + new_replicas > bch2_bkey_replicas(c, old) || + (!new_compressed && bch2_bkey_sectors_compressed(old)))) + *usage_increasing = true; + + if (bkey_ge(old.k->p, new->k.p)) + break; + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, + struct btree_iter *extent_iter, + u64 new_i_size, + s64 i_sectors_delta) +{ + struct btree_iter iter; + struct bkey_i *k; + struct bkey_i_inode_v3 *inode; + unsigned inode_update_flags = BTREE_UPDATE_NOJOURNAL; + int ret; + + k = bch2_bkey_get_mut_noupdate(trans, &iter, BTREE_ID_inodes, + SPOS(0, + extent_iter->pos.inode, + extent_iter->snapshot), + BTREE_ITER_CACHED); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + if (unlikely(k->k.type != KEY_TYPE_inode_v3)) { + k = bch2_inode_to_v3(trans, k); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + goto err; + } + + inode = bkey_i_to_inode_v3(k); + + if (!(le64_to_cpu(inode->v.bi_flags) & BCH_INODE_I_SIZE_DIRTY) && + new_i_size > le64_to_cpu(inode->v.bi_size)) { + inode->v.bi_size = cpu_to_le64(new_i_size); + inode_update_flags = 0; + } + + if (i_sectors_delta) { + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); + inode_update_flags = 0; + } + + if (inode->k.p.snapshot != iter.snapshot) { + inode->k.p.snapshot = iter.snapshot; + inode_update_flags = 0; + } + + ret = bch2_trans_update(trans, &iter, &inode->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE| + inode_update_flags); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_extent_update(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + struct bkey_i *k, + struct disk_reservation *disk_res, + u64 new_i_size, + s64 *i_sectors_delta_total, + bool check_enospc) +{ + struct bpos next_pos; + bool usage_increasing; + s64 i_sectors_delta = 0, disk_sectors_delta = 0; + int ret; + + /* + * This traverses us the iterator without changing iter->path->pos to + * search_key() (which is pos + 1 for extents): we want there to be a + * path already traversed at iter->pos because + * bch2_trans_extent_update() will use it to attempt extent merging + */ + ret = __bch2_btree_iter_traverse(iter); + if (ret) + return ret; + + ret = bch2_extent_trim_atomic(trans, iter, k); + if (ret) + return ret; + + next_pos = k->k.p; + + ret = bch2_sum_sector_overwrites(trans, iter, k, + &usage_increasing, + &i_sectors_delta, + &disk_sectors_delta); + if (ret) + return ret; + + if (disk_res && + disk_sectors_delta > (s64) disk_res->sectors) { + ret = bch2_disk_reservation_add(trans->c, disk_res, + disk_sectors_delta - disk_res->sectors, + !check_enospc || !usage_increasing + ? 
BCH_DISK_RESERVATION_NOFAIL : 0); + if (ret) + return ret; + } + + /* + * Note: + * We always have to do an inode update - even when i_size/i_sectors + * aren't changing - for fsync to work properly; fsync relies on + * inode->bi_journal_seq which is updated by the trigger code: + */ + ret = bch2_extent_update_i_size_sectors(trans, iter, + min(k->k.p.offset << 9, new_i_size), + i_sectors_delta) ?: + bch2_trans_update(trans, iter, k, 0) ?: + bch2_trans_commit(trans, disk_res, NULL, + BTREE_INSERT_NOCHECK_RW| + BTREE_INSERT_NOFAIL); + if (unlikely(ret)) + return ret; + + if (i_sectors_delta_total) + *i_sectors_delta_total += i_sectors_delta; + bch2_btree_iter_set_pos(iter, next_pos); + return 0; +} + +/* Overwrites whatever was present with zeroes: */ +int bch2_extent_fallocate(struct btree_trans *trans, + subvol_inum inum, + struct btree_iter *iter, + unsigned sectors, + struct bch_io_opts opts, + s64 *i_sectors_delta, + struct write_point_specifier write_point) +{ + struct bch_fs *c = trans->c; + struct disk_reservation disk_res = { 0 }; + struct closure cl; + struct open_buckets open_buckets; + struct bkey_s_c k; + struct bkey_buf old, new; + unsigned sectors_allocated; + bool have_reservation = false; + bool unwritten = opts.nocow && + c->sb.version >= bcachefs_metadata_version_unwritten_extents; + int ret; + + bch2_bkey_buf_init(&old); + bch2_bkey_buf_init(&new); + closure_init_stack(&cl); + open_buckets.nr = 0; +retry: + sectors_allocated = 0; + + k = bch2_btree_iter_peek_slot(iter); + ret = bkey_err(k); + if (ret) + return ret; + + sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset); + + if (!have_reservation) { + unsigned new_replicas = + max(0, (int) opts.data_replicas - + (int) bch2_bkey_nr_ptrs_fully_allocated(k)); + /* + * Get a disk reservation before (in the nocow case) calling + * into the allocator: + */ + ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0); + if (unlikely(ret)) + goto out; + + bch2_bkey_buf_reassemble(&old, c, k); + } + + if (have_reservation) { + if (!bch2_extents_match(k, bkey_i_to_s_c(old.k))) + goto out; + + bch2_key_resize(&new.k->k, sectors); + } else if (!unwritten) { + struct bkey_i_reservation *reservation; + + bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64)); + reservation = bkey_reservation_init(new.k); + reservation->k.p = iter->pos; + bch2_key_resize(&reservation->k, sectors); + reservation->v.nr_replicas = opts.data_replicas; + } else { + struct bkey_i_extent *e; + struct bch_devs_list devs_have; + struct write_point *wp; + struct bch_extent_ptr *ptr; + + devs_have.nr = 0; + + bch2_bkey_buf_realloc(&new, c, BKEY_EXTENT_U64s_MAX); + + e = bkey_extent_init(new.k); + e->k.p = iter->pos; + + ret = bch2_alloc_sectors_start_trans(trans, + opts.foreground_target, + false, + write_point, + &devs_have, + opts.data_replicas, + opts.data_replicas, + BCH_WATERMARK_normal, 0, &cl, &wp); + if (ret) { + bch2_trans_unlock(trans); + closure_sync(&cl); + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + goto retry; + return ret; + } + + sectors = min(sectors, wp->sectors_free); + sectors_allocated = sectors; + + bch2_key_resize(&e->k, sectors); + + bch2_open_bucket_get(c, wp, &open_buckets); + bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); + bch2_alloc_sectors_done(c, wp); + + extent_for_each_ptr(extent_i_to_s(e), ptr) + ptr->unwritten = true; + } + + have_reservation = true; + + ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res, + 0, i_sectors_delta, true); +out: + if 
((atomic_read(&cl.remaining) & CLOSURE_REMAINING_MASK) != 1) { + bch2_trans_unlock(trans); + closure_sync(&cl); + } + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_begin(trans); + goto retry; + } + + if (!ret && sectors_allocated) + bch2_increment_clock(c, sectors_allocated, WRITE); + + bch2_open_buckets_put(c, &open_buckets); + bch2_disk_reservation_put(c, &disk_res); + bch2_bkey_buf_exit(&new, c); + bch2_bkey_buf_exit(&old, c); + + return ret; +} + +/* + * Returns -BCH_ERR_transacton_restart if we had to drop locks: + */ +int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, + subvol_inum inum, u64 end, + s64 *i_sectors_delta) +{ + struct bch_fs *c = trans->c; + unsigned max_sectors = KEY_SIZE_MAX & (~0 << c->block_bits); + struct bpos end_pos = POS(inum.inum, end); + struct bkey_s_c k; + int ret = 0, ret2 = 0; + u32 snapshot; + + while (!ret || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + struct disk_reservation disk_res = + bch2_disk_reservation_init(c, 0); + struct bkey_i delete; + + if (ret) + ret2 = ret; + + bch2_trans_begin(trans); + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(iter, snapshot); + + /* + * peek_upto() doesn't have ideal semantics for extents: + */ + k = bch2_btree_iter_peek_upto(iter, end_pos); + if (!k.k) + break; + + ret = bkey_err(k); + if (ret) + continue; + + bkey_init(&delete.k); + delete.k.p = iter->pos; + + /* create the biggest key we can */ + bch2_key_resize(&delete.k, max_sectors); + bch2_cut_back(end_pos, &delete); + + ret = bch2_extent_update(trans, inum, iter, &delete, + &disk_res, 0, i_sectors_delta, false); + bch2_disk_reservation_put(c, &disk_res); + } + + return ret ?: ret2; +} + +int bch2_fpunch(struct bch_fs *c, subvol_inum inum, u64 start, u64 end, + s64 *i_sectors_delta) +{ + struct btree_trans trans; + struct btree_iter iter; + int ret; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + POS(inum.inum, start), + BTREE_ITER_INTENT); + + ret = bch2_fpunch_at(&trans, &iter, inum, end, i_sectors_delta); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + + return ret; +} + +static int bch2_write_index_default(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct bkey_buf sk; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k = bch2_keylist_front(keys); + struct btree_trans trans; + struct btree_iter iter; + subvol_inum inum = { + .subvol = op->subvol, + .inum = k->k.p.inode, + }; + int ret; + + BUG_ON(!inum.subvol); + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 1024); + + do { + bch2_trans_begin(&trans); + + k = bch2_keylist_front(keys); + bch2_bkey_buf_copy(&sk, c, k); + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, + &sk.k->k.p.snapshot); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + bkey_start_pos(&sk.k->k), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + + ret = bch2_extent_update(&trans, inum, &iter, sk.k, + &op->res, + op->new_i_size, &op->i_sectors_delta, + op->flags & BCH_WRITE_CHECK_ENOSPC); + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_ge(iter.pos, k->k.p)) + bch2_keylist_pop_front(&op->insert_keys); + else 
+ bch2_cut_front(iter.pos, k); + } while (!bch2_keylist_empty(keys)); + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +} + +/* Writes */ + +void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, + enum bch_data_type type, + const struct bkey_i *k, + bool nocow) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + const struct bch_extent_ptr *ptr; + struct bch_write_bio *n; + struct bch_dev *ca; + + BUG_ON(c->opts.nochanges); + + bkey_for_each_ptr(ptrs, ptr) { + BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX || + !c->devs[ptr->dev]); + + ca = bch_dev_bkey_exists(c, ptr->dev); + + if (to_entry(ptr + 1) < ptrs.end) { + n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, + GFP_NOFS, &ca->replica_set)); + + n->bio.bi_end_io = wbio->bio.bi_end_io; + n->bio.bi_private = wbio->bio.bi_private; + n->parent = wbio; + n->split = true; + n->bounce = false; + n->put_bio = true; + n->bio.bi_opf = wbio->bio.bi_opf; + bio_inc_remaining(&wbio->bio); + } else { + n = wbio; + n->split = false; + } + + n->c = c; + n->dev = ptr->dev; + n->have_ioref = nocow || bch2_dev_get_ioref(ca, + type == BCH_DATA_btree ? READ : WRITE); + n->nocow = nocow; + n->submit_time = local_clock(); + n->inode_offset = bkey_start_offset(&k->k); + n->bio.bi_iter.bi_sector = ptr->offset; + + if (likely(n->have_ioref)) { + this_cpu_add(ca->io_done->sectors[WRITE][type], + bio_sectors(&n->bio)); + + bio_set_dev(&n->bio, ca->disk_sb.bdev); + + if (type != BCH_DATA_btree && unlikely(c->opts.no_data_io)) { + bio_endio(&n->bio); + continue; + } + + submit_bio(&n->bio); + } else { + n->bio.bi_status = BLK_STS_REMOVED; + bio_endio(&n->bio); + } + } +} + +static void __bch2_write(struct bch_write_op *); + +static void bch2_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_fs *c = op->c; + + bch2_disk_reservation_put(c, &op->res); + if (!(op->flags & BCH_WRITE_MOVE)) + bch2_write_ref_put(c, BCH_WRITE_REF_write); + bch2_keylist_free(&op->insert_keys, op->inline_keys); + + bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); + + EBUG_ON(cl->parent); + closure_debug_destroy(cl); + if (op->end_io) + op->end_io(op); +} + +static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) +{ + struct keylist *keys = &op->insert_keys; + struct bch_extent_ptr *ptr; + struct bkey_i *src, *dst = keys->keys, *n; + + for (src = keys->keys; src != keys->top; src = n) { + n = bkey_next(src); + + if (bkey_extent_is_direct_data(&src->k)) { + bch2_bkey_drop_ptrs(bkey_i_to_s(src), ptr, + test_bit(ptr->dev, op->failed.d)); + + if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) + return -EIO; + } + + if (dst != src) + memmove_u64s_down(dst, src, src->k.u64s); + dst = bkey_next(dst); + } + + keys->top = dst; + return 0; +} + +/** + * bch_write_index - after a write, update index to point to new data + */ +static void __bch2_write_index(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct keylist *keys = &op->insert_keys; + struct bkey_i *k; + unsigned dev; + int ret = 0; + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + ret = bch2_write_drop_io_error_ptrs(op); + if (ret) + goto err; + } + + /* + * probably not the ideal place to hook this in, but I don't + * particularly want to plumb io_opts all the way through the btree + * update stack right now + */ + for_each_keylist_key(keys, k) + bch2_rebalance_add_key(c, bkey_i_to_s_c(k), &op->opts); + + if (!bch2_keylist_empty(keys)) { + u64 sectors_start = 
keylist_sectors(keys); + + ret = !(op->flags & BCH_WRITE_MOVE) + ? bch2_write_index_default(op) + : bch2_data_update_index_update(op); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + BUG_ON(keylist_sectors(keys) && !ret); + + op->written += sectors_start - keylist_sectors(keys); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + k->k.p.inode, k->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) + goto err; + } +out: + /* If a bucket wasn't written, we can't erasure code it: */ + for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) + bch2_open_bucket_write_error(c, &op->open_buckets, dev); + + bch2_open_buckets_put(c, &op->open_buckets); + return; +err: + keys->top = keys->keys; + op->error = ret; + op->flags |= BCH_WRITE_DONE; + goto out; +} + +static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) +{ + if (state != wp->state) { + u64 now = ktime_get_ns(); + + if (wp->last_state_change && + time_after64(now, wp->last_state_change)) + wp->time[wp->state] += now - wp->last_state_change; + wp->state = state; + wp->last_state_change = now; + } +} + +static inline void wp_update_state(struct write_point *wp, bool running) +{ + enum write_point_state state; + + state = running ? WRITE_POINT_running : + !list_empty(&wp->writes) ? WRITE_POINT_waiting_io + : WRITE_POINT_stopped; + + __wp_update_state(wp, state); +} + +static void bch2_write_index(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct write_point *wp = op->wp; + struct workqueue_struct *wq = index_update_wq(op); + unsigned long flags; + + if ((op->flags & BCH_WRITE_DONE) && + (op->flags & BCH_WRITE_MOVE)) + bch2_bio_free_pages_pool(op->c, &op->wbio.bio); + + spin_lock_irqsave(&wp->writes_lock, flags); + if (wp->state == WRITE_POINT_waiting_io) + __wp_update_state(wp, WRITE_POINT_waiting_work); + list_add_tail(&op->wp_list, &wp->writes); + spin_unlock_irqrestore(&wp->writes_lock, flags); + + queue_work(wq, &wp->index_update_work); +} + +static inline void bch2_write_queue(struct bch_write_op *op, struct write_point *wp) +{ + op->wp = wp; + + if (wp->state == WRITE_POINT_stopped) { + spin_lock_irq(&wp->writes_lock); + __wp_update_state(wp, WRITE_POINT_waiting_io); + spin_unlock_irq(&wp->writes_lock); + } +} + +void bch2_write_point_do_index_updates(struct work_struct *work) +{ + struct write_point *wp = + container_of(work, struct write_point, index_update_work); + struct bch_write_op *op; + + while (1) { + spin_lock_irq(&wp->writes_lock); + op = list_first_entry_or_null(&wp->writes, struct bch_write_op, wp_list); + if (op) + list_del(&op->wp_list); + wp_update_state(wp, op != NULL); + spin_unlock_irq(&wp->writes_lock); + + if (!op) + break; + + op->flags |= BCH_WRITE_IN_WORKER; + + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + __bch2_write(op); + else + bch2_write_done(&op->cl); + } +} + +static void bch2_write_endio(struct bio *bio) +{ + struct closure *cl = bio->bi_private; + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bch_write_bio *wbio = to_wbio(bio); + struct bch_write_bio *parent = wbio->split ? 
wbio->parent : NULL; + struct bch_fs *c = wbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, wbio->dev); + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + set_bit(wbio->dev, op->failed.d); + op->flags |= BCH_WRITE_IO_ERROR; + } + + if (wbio->nocow) + set_bit(wbio->dev, op->devs_need_flush->d); + + if (wbio->have_ioref) { + bch2_latency_acct(ca, wbio->submit_time, WRITE); + percpu_ref_put(&ca->io_ref); + } + + if (wbio->bounce) + bch2_bio_free_pages_pool(c, bio); + + if (wbio->put_bio) + bio_put(bio); + + if (parent) + bio_endio(&parent->bio); + else + closure_put(cl); +} + +static void init_append_extent(struct bch_write_op *op, + struct write_point *wp, + struct bversion version, + struct bch_extent_crc_unpacked crc) +{ + struct bkey_i_extent *e; + + op->pos.offset += crc.uncompressed_size; + + e = bkey_extent_init(op->insert_keys.top); + e->k.p = op->pos; + e->k.size = crc.uncompressed_size; + e->k.version = version; + + if (crc.csum_type || + crc.compression_type || + crc.nonce) + bch2_extent_crc_append(&e->k_i, crc); + + bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, + op->flags & BCH_WRITE_CACHED); + + bch2_keylist_push(&op->insert_keys); +} + +static struct bio *bch2_write_bio_alloc(struct bch_fs *c, + struct write_point *wp, + struct bio *src, + bool *page_alloc_failed, + void *buf) +{ + struct bch_write_bio *wbio; + struct bio *bio; + unsigned output_available = + min(wp->sectors_free << 9, src->bi_iter.bi_size); + unsigned pages = DIV_ROUND_UP(output_available + + (buf + ? ((unsigned long) buf & (PAGE_SIZE - 1)) + : 0), PAGE_SIZE); + + pages = min(pages, BIO_MAX_VECS); + + bio = bio_alloc_bioset(NULL, pages, 0, + GFP_NOFS, &c->bio_write); + wbio = wbio_init(bio); + wbio->put_bio = true; + /* copy WRITE_SYNC flag */ + wbio->bio.bi_opf = src->bi_opf; + + if (buf) { + bch2_bio_map(bio, buf, output_available); + return bio; + } + + wbio->bounce = true; + + /* + * We can't use mempool for more than c->sb.encoded_extent_max + * worth of pages, but we'd like to allocate more if we can: + */ + bch2_bio_alloc_pages_pool(c, bio, + min_t(unsigned, output_available, + c->opts.encoded_extent_max)); + + if (bio->bi_iter.bi_size < output_available) + *page_alloc_failed = + bch2_bio_alloc_pages(bio, + output_available - + bio->bi_iter.bi_size, + GFP_NOFS) != 0; + + return bio; +} + +static int bch2_write_rechecksum(struct bch_fs *c, + struct bch_write_op *op, + unsigned new_csum_type) +{ + struct bio *bio = &op->wbio.bio; + struct bch_extent_crc_unpacked new_crc; + int ret; + + /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ + + if (bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(new_csum_type)) + new_csum_type = op->crc.csum_type; + + ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); + if (ret) + return ret; + + bio_advance(bio, op->crc.offset << 9); + bio->bi_iter.bi_size = op->crc.live_size << 9; + op->crc = new_crc; + return 0; +} + +static int bch2_write_decrypt(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct nonce nonce = extent_nonce(op->version, op->crc); + struct bch_csum csum; + int ret; + + if (!bch2_csum_type_is_encryption(op->crc.csum_type)) + return 0; + + /* + * If we need to decrypt data in the write path, we'll no longer be able + * to verify the existing checksum (poly1305 mac, in this 
case) after + * it's decrypted - this is the last point we'll be able to reverify the + * checksum: + */ + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + if (bch2_crc_cmp(op->crc.csum, csum)) + return -EIO; + + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + return ret; +} + +static enum prep_encoded_ret { + PREP_ENCODED_OK, + PREP_ENCODED_ERR, + PREP_ENCODED_CHECKSUM_ERR, + PREP_ENCODED_DO_WRITE, +} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +{ + struct bch_fs *c = op->c; + struct bio *bio = &op->wbio.bio; + + if (!(op->flags & BCH_WRITE_DATA_ENCODED)) + return PREP_ENCODED_OK; + + BUG_ON(bio_sectors(bio) != op->crc.compressed_size); + + /* Can we just write the entire extent as is? */ + if (op->crc.uncompressed_size == op->crc.live_size && + op->crc.compressed_size <= wp->sectors_free && + (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || + op->incompressible)) { + if (!crc_is_compressed(op->crc) && + op->csum_type != op->crc.csum_type && + bch2_write_rechecksum(c, op, op->csum_type)) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_DO_WRITE; + } + + /* + * If the data is compressed and we couldn't write the entire extent as + * is, we have to decompress it: + */ + if (crc_is_compressed(op->crc)) { + struct bch_csum csum; + + if (bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + /* Last point we can still verify checksum: */ + csum = bch2_checksum_bio(c, op->crc.csum_type, + extent_nonce(op->version, op->crc), + bio); + if (bch2_crc_cmp(op->crc.csum, csum)) + return PREP_ENCODED_CHECKSUM_ERR; + + if (bch2_bio_uncompress_inplace(c, bio, &op->crc)) + return PREP_ENCODED_ERR; + } + + /* + * No longer have compressed data after this point - data might be + * encrypted: + */ + + /* + * If the data is checksummed and we're only writing a subset, + * rechecksum and adjust bio to point to currently live data: + */ + if ((op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) && + bch2_write_rechecksum(c, op, op->csum_type)) + return PREP_ENCODED_CHECKSUM_ERR; + + /* + * If we want to compress the data, it has to be decrypted: + */ + if ((op->compression_opt || + bch2_csum_type_is_encryption(op->crc.csum_type) != + bch2_csum_type_is_encryption(op->csum_type)) && + bch2_write_decrypt(op)) + return PREP_ENCODED_CHECKSUM_ERR; + + return PREP_ENCODED_OK; +} + +static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, + struct bio **_dst) +{ + struct bch_fs *c = op->c; + struct bio *src = &op->wbio.bio, *dst = src; + struct bvec_iter saved_iter; + void *ec_buf; + unsigned total_output = 0, total_input = 0; + bool bounce = false; + bool page_alloc_failed = false; + int ret, more = 0; + + BUG_ON(!bio_sectors(src)); + + ec_buf = bch2_writepoint_ec_buf(c, wp); + + switch (bch2_write_prep_encoded_data(op, wp)) { + case PREP_ENCODED_OK: + break; + case PREP_ENCODED_ERR: + ret = -EIO; + goto err; + case PREP_ENCODED_CHECKSUM_ERR: + goto csum_err; + case PREP_ENCODED_DO_WRITE: + /* XXX look for bug here */ + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } + init_append_extent(op, wp, op->version, op->crc); + goto do_write; + } + + if (ec_buf || + op->compression_opt || + (op->csum_type && + !(op->flags & BCH_WRITE_PAGES_STABLE)) || + 
(bch2_csum_type_is_encryption(op->csum_type) && + !(op->flags & BCH_WRITE_PAGES_OWNED))) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } + + saved_iter = dst->bi_iter; + + do { + struct bch_extent_crc_unpacked crc = { 0 }; + struct bversion version = op->version; + size_t dst_len, src_len; + + if (page_alloc_failed && + dst->bi_iter.bi_size < (wp->sectors_free << 9) && + dst->bi_iter.bi_size < c->opts.encoded_extent_max) + break; + + BUG_ON(op->compression_opt && + (op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_csum_type_is_encryption(op->crc.csum_type)); + BUG_ON(op->compression_opt && !bounce); + + crc.compression_type = op->incompressible + ? BCH_COMPRESSION_TYPE_incompressible + : op->compression_opt + ? bch2_bio_compress(c, dst, &dst_len, src, &src_len, + op->compression_opt) + : 0; + if (!crc_is_compressed(crc)) { + dst_len = min(dst->bi_iter.bi_size, src->bi_iter.bi_size); + dst_len = min_t(unsigned, dst_len, wp->sectors_free << 9); + + if (op->csum_type) + dst_len = min_t(unsigned, dst_len, + c->opts.encoded_extent_max); + + if (bounce) { + swap(dst->bi_iter.bi_size, dst_len); + bio_copy_data(dst, src); + swap(dst->bi_iter.bi_size, dst_len); + } + + src_len = dst_len; + } + + BUG_ON(!src_len || !dst_len); + + if (bch2_csum_type_is_encryption(op->csum_type)) { + if (bversion_zero(version)) { + version.lo = atomic64_inc_return(&c->key_version); + } else { + crc.nonce = op->nonce; + op->nonce += src_len >> 9; + } + } + + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + !crc_is_compressed(crc) && + bch2_csum_type_is_encryption(op->crc.csum_type) == + bch2_csum_type_is_encryption(op->csum_type)) { + u8 compression_type = crc.compression_type; + u16 nonce = crc.nonce; + /* + * Note: when we're using rechecksum(), we need to be + * checksumming @src because it has all the data our + * existing checksum covers - if we bounced (because we + * were trying to compress), @dst will only have the + * part of the data the new checksum will cover. + * + * But normally we want to be checksumming post bounce, + * because part of the reason for bouncing is so the + * data can't be modified (by userspace) while it's in + * flight. + */ + if (bch2_rechecksum_bio(c, src, version, op->crc, + &crc, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->csum_type)) + goto csum_err; + /* + * rchecksum_bio sets compression_type on crc from op->crc, + * this isn't always correct as sometimes we're changing + * an extent from uncompressed to incompressible. 
+ */ + crc.compression_type = compression_type; + crc.nonce = nonce; + } else { + if ((op->flags & BCH_WRITE_DATA_ENCODED) && + bch2_rechecksum_bio(c, src, version, op->crc, + NULL, &op->crc, + src_len >> 9, + bio_sectors(src) - (src_len >> 9), + op->crc.csum_type)) + goto csum_err; + + crc.compressed_size = dst_len >> 9; + crc.uncompressed_size = src_len >> 9; + crc.live_size = src_len >> 9; + + swap(dst->bi_iter.bi_size, dst_len); + ret = bch2_encrypt_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + if (ret) + goto err; + + crc.csum = bch2_checksum_bio(c, op->csum_type, + extent_nonce(version, crc), dst); + crc.csum_type = op->csum_type; + swap(dst->bi_iter.bi_size, dst_len); + } + + init_append_extent(op, wp, version, crc); + + if (dst != src) + bio_advance(dst, dst_len); + bio_advance(src, src_len); + total_output += dst_len; + total_input += src_len; + } while (dst->bi_iter.bi_size && + src->bi_iter.bi_size && + wp->sectors_free && + !bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)); + + more = src->bi_iter.bi_size != 0; + + dst->bi_iter = saved_iter; + + if (dst == src && more) { + BUG_ON(total_output != total_input); + + dst = bio_split(src, total_input >> 9, + GFP_NOFS, &c->bio_write); + wbio_init(dst)->put_bio = true; + /* copy WRITE_SYNC flag */ + dst->bi_opf = src->bi_opf; + } + + dst->bi_iter.bi_size = total_output; +do_write: + *_dst = dst; + return more; +csum_err: + bch_err(c, "error verifying existing checksum while rewriting existing data (memory corruption?)"); + ret = -EIO; +err: + if (to_wbio(dst)->bounce) + bch2_bio_free_pages_pool(c, dst); + if (to_wbio(dst)->put_bio) + bio_put(dst); + + return ret; +} + +static bool bch2_extent_is_writeable(struct bch_write_op *op, + struct bkey_s_c k) +{ + struct bch_fs *c = op->c; + struct bkey_s_c_extent e; + struct extent_ptr_decoded p; + const union bch_extent_entry *entry; + unsigned replicas = 0; + + if (k.k->type != KEY_TYPE_extent) + return false; + + e = bkey_s_c_to_extent(k); + extent_for_each_ptr_decode(e, p, entry) { + if (p.crc.csum_type || + crc_is_compressed(p.crc) || + p.has_ec) + return false; + + replicas += bch2_extent_ptr_durability(c, &p); + } + + return replicas >= op->opts.data_replicas; +} + +static inline void bch2_nocow_write_unlock(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + const struct bch_extent_ptr *ptr; + struct bkey_i *k; + + for_each_keylist_key(&op->insert_keys, k) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); + + bkey_for_each_ptr(ptrs, ptr) + bch2_bucket_nocow_unlock(&c->nocow_locks, + PTR_BUCKET_POS(c, ptr), + BUCKET_NOCOW_LOCK_UPDATE); + } +} + +static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i *orig, + struct bkey_s_c k, + u64 new_i_size) +{ + struct bkey_i *new; + struct bkey_ptrs ptrs; + struct bch_extent_ptr *ptr; + int ret; + + if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) { + /* trace this */ + return 0; + } + + new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + return ret; + + bch2_cut_front(bkey_start_pos(&orig->k), new); + bch2_cut_back(orig->k.p, new); + + ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); + bkey_for_each_ptr(ptrs, ptr) + ptr->unwritten = 0; + + /* + * Note that we're not calling bch2_subvol_get_snapshot() in this path - + * that was done when we kicked off the write, and here it's important + * that we update the extent that we wrote to - even if a snapshot has 
+ * since been created. The write is still outstanding, so we're ok + * w.r.t. snapshot atomicity: + */ + return bch2_extent_update_i_size_sectors(trans, iter, + min(new->k.p.offset << 9, new_i_size), 0) ?: + bch2_trans_update(trans, iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +} + +static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_i *orig; + struct bkey_s_c k; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for_each_keylist_key(&op->insert_keys, orig) { + ret = for_each_btree_key_upto_commit(&trans, iter, BTREE_ID_extents, + bkey_start_pos(&orig->k), orig->k.p, + BTREE_ITER_INTENT, k, + NULL, NULL, BTREE_INSERT_NOFAIL, ({ + bch2_nocow_write_convert_one_unwritten(&trans, &iter, orig, k, op->new_i_size); + })); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *k = bch2_keylist_front(&op->insert_keys); + + bch_err_inum_offset_ratelimited(c, + k->k.p.inode, k->k.p.offset << 9, + "write error while doing btree update: %s", + bch2_err_str(ret)); + } + + if (ret) { + op->error = ret; + break; + } + } + + bch2_trans_exit(&trans); +} + +static void __bch2_nocow_write_done(struct bch_write_op *op) +{ + bch2_nocow_write_unlock(op); + + if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + op->error = -EIO; + } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + bch2_nocow_write_convert_unwritten(op); +} + +static void bch2_nocow_write_done(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + + __bch2_nocow_write_done(op); + bch2_write_done(cl); +} + +static void bch2_nocow_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + struct { + struct bpos b; + unsigned gen; + struct nocow_lock_bucket *l; + } buckets[BCH_REPLICAS_MAX]; + unsigned nr_buckets = 0; + u32 snapshot; + int ret, i; + + if (op->flags & BCH_WRITE_MOVE) + return; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + + ret = bch2_subvolume_get_snapshot(&trans, op->subvol, &snapshot); + if (unlikely(ret)) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(op->pos.inode, op->pos.offset, snapshot), + BTREE_ITER_SLOTS); + while (1) { + struct bio *bio = &op->wbio.bio; + + nr_buckets = 0; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + /* fall back to normal cow write path? 
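 * i.e. if the extent is in a different snapshot, or if it's checksummed, + * compressed or erasure coded (or doesn't have enough replicas), it can't + * simply be overwritten in place and the write goes through the normal COW + * path: 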
*/ + if (unlikely(k.k->p.snapshot != snapshot || + !bch2_extent_is_writeable(op, k))) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + k.k->u64s)) + break; + + /* Get iorefs before dropping btree locks: */ + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + buckets[nr_buckets].b = PTR_BUCKET_POS(c, ptr); + buckets[nr_buckets].gen = ptr->gen; + buckets[nr_buckets].l = + bucket_nocow_lock(&c->nocow_locks, + bucket_to_u64(buckets[nr_buckets].b)); + + prefetch(buckets[nr_buckets].l); + + if (unlikely(!bch2_dev_get_ioref(bch_dev_bkey_exists(c, ptr->dev), WRITE))) + goto err_get_ioref; + + nr_buckets++; + + if (ptr->unwritten) + op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + } + + /* Unlock before taking nocow locks, doing IO: */ + bkey_reassemble(op->insert_keys.top, k); + bch2_trans_unlock(&trans); + + bch2_cut_front(op->pos, op->insert_keys.top); + if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); + + for (i = 0; i < nr_buckets; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, buckets[i].b.inode); + struct nocow_lock_bucket *l = buckets[i].l; + bool stale; + + __bch2_bucket_nocow_lock(&c->nocow_locks, l, + bucket_to_u64(buckets[i].b), + BUCKET_NOCOW_LOCK_UPDATE); + + rcu_read_lock(); + stale = gen_after(*bucket_gen(ca, buckets[i].b.offset), buckets[i].gen); + rcu_read_unlock(); + + if (unlikely(stale)) + goto err_bucket_stale; + } + + bio = &op->wbio.bio; + if (k.k->p.offset < op->pos.offset + bio_sectors(bio)) { + bio = bio_split(bio, k.k->p.offset - op->pos.offset, + GFP_KERNEL, &c->bio_write); + wbio_init(bio)->put_bio = true; + bio->bi_opf = op->wbio.bio.bi_opf; + } else { + op->flags |= BCH_WRITE_DONE; + } + + op->pos.offset += bio_sectors(bio); + op->written += bio_sectors(bio); + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + op->insert_keys.top, true); + + bch2_keylist_push(&op->insert_keys); + if (op->flags & BCH_WRITE_DONE) + break; + bch2_btree_iter_advance(&iter); + } +out: + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + if (ret) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "%s: btree lookup error %s", + __func__, bch2_err_str(ret)); + op->error = ret; + op->flags |= BCH_WRITE_DONE; + } + + bch2_trans_exit(&trans); + + /* fallback to cow write path? 
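 * i.e. if we broke out of the loop above before setting BCH_WRITE_DONE, the + * remainder of the write is handled by the normal COW path in __bch2_write(): 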
*/ + if (!(op->flags & BCH_WRITE_DONE)) { + closure_sync(&op->cl); + __bch2_nocow_write_done(op); + op->insert_keys.top = op->insert_keys.keys; + } else if (op->flags & BCH_WRITE_SYNC) { + closure_sync(&op->cl); + bch2_nocow_write_done(&op->cl); + } else { + /* + * XXX + * needs to run out of process context because ei_quota_lock is + * a mutex + */ + continue_at(&op->cl, bch2_nocow_write_done, index_update_wq(op)); + } + return; +err_get_ioref: + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* Fall back to COW path: */ + goto out; +err_bucket_stale: + while (--i >= 0) + bch2_bucket_nocow_unlock(&c->nocow_locks, + buckets[i].b, + BUCKET_NOCOW_LOCK_UPDATE); + for (i = 0; i < nr_buckets; i++) + percpu_ref_put(&bch_dev_bkey_exists(c, buckets[i].b.inode)->io_ref); + + /* We can retry this: */ + ret = -BCH_ERR_transaction_restart; + goto out; +} + +static void __bch2_write(struct bch_write_op *op) +{ + struct bch_fs *c = op->c; + struct write_point *wp = NULL; + struct bio *bio = NULL; + unsigned nofs_flags; + int ret; + + nofs_flags = memalloc_nofs_save(); + + if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { + bch2_nocow_write(op); + if (op->flags & BCH_WRITE_DONE) + goto out_nofs_restore; + } +again: + memset(&op->failed, 0, sizeof(op->failed)); + + do { + struct bkey_i *key_to_write; + unsigned key_to_write_offset = op->insert_keys.top_p - + op->insert_keys.keys_p; + + /* +1 for possible cache device: */ + if (op->open_buckets.nr + op->nr_replicas + 1 > + ARRAY_SIZE(op->open_buckets.v)) + break; + + if (bch2_keylist_realloc(&op->insert_keys, + op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_EXTENT_U64s_MAX)) + break; + + /* + * The copygc thread is now global, which means it's no longer + * freeing up space on specific disks, which means that + * allocations for specific disks may hang arbitrarily long: + */ + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_alloc_sectors_start_trans(&trans, + op->target, + op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->write_point, + &op->devs_have, + op->nr_replicas, + op->nr_replicas_required, + op->watermark, + op->flags, + (op->flags & (BCH_WRITE_ALLOC_NOWAIT| + BCH_WRITE_ONLY_SPECIFIED_DEVS)) + ? NULL : &op->cl, &wp)); + if (unlikely(ret)) { + if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) + break; + + goto err; + } + + EBUG_ON(!wp); + + bch2_open_bucket_get(c, wp, &op->open_buckets); + ret = bch2_write_extent(op, wp, &bio); + + bch2_alloc_sectors_done_inlined(c, wp); +err: + if (ret <= 0) { + op->flags |= BCH_WRITE_DONE; + + if (ret < 0) { + op->error = ret; + break; + } + } + + bio->bi_end_io = bch2_write_endio; + bio->bi_private = &op->cl; + bio->bi_opf |= REQ_OP_WRITE; + + closure_get(bio->bi_private); + + key_to_write = (void *) (op->insert_keys.keys_p + + key_to_write_offset); + + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, + key_to_write, false); + } while (ret); + + /* + * Sync or no? + * + * If we're running asynchronously, we may still want to block + * synchronously here if we weren't able to submit all of the IO at + * once, as that signals backpressure to the caller. 
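 * + * When we do block, the index update is also run here and we loop back if + * there's still data left to write; otherwise the op is queued on its write + * point and bch2_write_index() finishes the index update from a workqueue. 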
+ */ + if ((op->flags & BCH_WRITE_SYNC) || + (!(op->flags & BCH_WRITE_DONE) && + !(op->flags & BCH_WRITE_IN_WORKER))) { + closure_sync(&op->cl); + __bch2_write_index(op); + + if (!(op->flags & BCH_WRITE_DONE)) + goto again; + bch2_write_done(&op->cl); + } else { + bch2_write_queue(op, wp); + continue_at(&op->cl, bch2_write_index, NULL); + } +out_nofs_restore: + memalloc_nofs_restore(nofs_flags); +} + +static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) +{ + struct bio *bio = &op->wbio.bio; + struct bvec_iter iter; + struct bkey_i_inline_data *id; + unsigned sectors; + int ret; + + op->flags |= BCH_WRITE_WROTE_DATA_INLINE; + op->flags |= BCH_WRITE_DONE; + + bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); + + ret = bch2_keylist_realloc(&op->insert_keys, op->inline_keys, + ARRAY_SIZE(op->inline_keys), + BKEY_U64s + DIV_ROUND_UP(data_len, 8)); + if (ret) { + op->error = ret; + goto err; + } + + sectors = bio_sectors(bio); + op->pos.offset += sectors; + + id = bkey_inline_data_init(op->insert_keys.top); + id->k.p = op->pos; + id->k.version = op->version; + id->k.size = sectors; + + iter = bio->bi_iter; + iter.bi_size = data_len; + memcpy_from_bio(id->v.data, bio, iter); + + while (data_len & 7) + id->v.data[data_len++] = '\0'; + set_bkey_val_bytes(&id->k, data_len); + bch2_keylist_push(&op->insert_keys); + + __bch2_write_index(op); +err: + bch2_write_done(&op->cl); +} + +/** + * bch2_write - handle a write to a cache device or flash only volume + * + * This is the starting point for any data to end up in a cache device; it could + * be from a normal write, or a writeback write, or a write to a flash only + * volume - it's also used by the moving garbage collector to compact data in + * mostly empty buckets. + * + * It first writes the data to the cache, creating a list of keys to be inserted + * (if the data won't fit in a single open bucket, there will be multiple keys); + * after the data is written it calls bch_journal, and after the keys have been + * added to the next journal write they're inserted into the btree. + * + * If op->discard is true, instead of inserting the data it invalidates the + * region of the cache represented by op->bio and op->inode. 
+ */ +void bch2_write(struct closure *cl) +{ + struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); + struct bio *bio = &op->wbio.bio; + struct bch_fs *c = op->c; + unsigned data_len; + + EBUG_ON(op->cl.parent); + BUG_ON(!op->nr_replicas); + BUG_ON(!op->write_point.v); + BUG_ON(bkey_eq(op->pos, POS_MAX)); + + op->start_time = local_clock(); + bch2_keylist_init(&op->insert_keys, op->inline_keys); + wbio_init(bio)->put_bio = false; + + if (bio->bi_iter.bi_size & (c->opts.block_size - 1)) { + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + op->pos.offset << 9, + "misaligned write"); + op->error = -EIO; + goto err; + } + + if (c->opts.nochanges) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + if (!(op->flags & BCH_WRITE_MOVE) && + !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + op->error = -BCH_ERR_erofs_no_writes; + goto err; + } + + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + bch2_increment_clock(c, bio_sectors(bio), WRITE); + + data_len = min_t(u64, bio->bi_iter.bi_size, + op->new_i_size - (op->pos.offset << 9)); + + if (c->opts.inline_data && + data_len <= min(block_bytes(c) / 2, 1024U)) { + bch2_write_data_inline(op, data_len); + return; + } + + __bch2_write(op); + return; +err: + bch2_disk_reservation_put(c, &op->res); + + closure_debug_destroy(&op->cl); + if (op->end_io) + op->end_io(op); +} + +static const char * const bch2_write_flags[] = { +#define x(f) #f, + BCH_WRITE_FLAGS() +#undef x + NULL +}; + +void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) +{ + prt_str(out, "pos: "); + bch2_bpos_to_text(out, op->pos); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_str(out, "started: "); + bch2_pr_time_units(out, local_clock() - op->start_time); + prt_newline(out); + + prt_str(out, "flags: "); + prt_bitflags(out, bch2_write_flags, op->flags); + prt_newline(out); + + prt_printf(out, "ref: %u", closure_nr_remaining(&op->cl)); + prt_newline(out); + + printbuf_indent_sub(out, 2); +} + +/* Cache promotion on read */ + +struct promote_op { + struct rcu_head rcu; + u64 start_time; + + struct rhash_head hash; + struct bpos pos; + + struct data_update write; + struct bio_vec bi_inline_vecs[0]; /* must be last */ +}; + +static const struct rhashtable_params bch_promote_params = { + .head_offset = offsetof(struct promote_op, hash), + .key_offset = offsetof(struct promote_op, pos), + .key_len = sizeof(struct bpos), +}; + +static inline bool should_promote(struct bch_fs *c, struct bkey_s_c k, + struct bpos pos, + struct bch_io_opts opts, + unsigned flags) +{ + if (!(flags & BCH_READ_MAY_PROMOTE)) + return false; + + if (!opts.promote_target) + return false; + + if (bch2_bkey_has_target(c, k, opts.promote_target)) + return false; + + if (bkey_extent_is_unwritten(k)) + return false; + + if (bch2_target_congested(c, opts.promote_target)) { + /* XXX trace this */ + return false; + } + + if (rhashtable_lookup_fast(&c->promote_table, &pos, + bch_promote_params)) + return false; + + return true; +} + +static void promote_free(struct bch_fs *c, struct promote_op *op) +{ + int ret; + + bch2_data_update_exit(&op->write); + + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + kfree_rcu(op, rcu); +} + +static void promote_done(struct bch_write_op *wop) +{ + struct promote_op *op = + container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.op.c; + + 
bch2_time_stats_update(&c->times[BCH_TIME_data_promote], + op->start_time); + promote_free(c, op); +} + +static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +{ + struct bio *bio = &op->write.op.wbio.bio; + + trace_and_count(op->write.op.c, read_promote, &rbio->bio); + + /* we now own pages: */ + BUG_ON(!rbio->bounce); + BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); + + memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, + sizeof(struct bio_vec) * rbio->bio.bi_vcnt); + swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + + bch2_data_update_read_done(&op->write, rbio->pick.crc); +} + +static struct promote_op *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned sectors, + struct bch_read_bio **rbio) +{ + struct bch_fs *c = trans->c; + struct promote_op *op = NULL; + struct bio *bio; + unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + int ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) + return NULL; + + op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS); + if (!op) + goto err; + + op->start_time = local_clock(); + op->pos = pos; + + /* + * We don't use the mempool here because extents that aren't + * checksummed or compressed can be too big for the mempool: + */ + *rbio = kzalloc(sizeof(struct bch_read_bio) + + sizeof(struct bio_vec) * pages, + GFP_NOFS); + if (!*rbio) + goto err; + + rbio_init(&(*rbio)->bio, opts); + bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); + + if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, + GFP_NOFS)) + goto err; + + (*rbio)->bounce = true; + (*rbio)->split = true; + (*rbio)->kmalloc = true; + + if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, + bch_promote_params)) + goto err; + + bio = &op->write.op.wbio.bio; + bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); + + ret = bch2_data_update_init(trans, NULL, &op->write, + writepoint_hashed((unsigned long) current), + opts, + (struct data_update_opts) { + .target = opts.promote_target, + .extra_replicas = 1, + .write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED, + }, + btree_id, k); + /* + * possible errors: -BCH_ERR_nocow_lock_blocked, + * -BCH_ERR_ENOSPC_disk_reservation: + */ + if (ret) { + ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + goto err; + } + + op->write.op.end_io = promote_done; + + return op; +err: + if (*rbio) + bio_free_pages(&(*rbio)->bio); + kfree(*rbio); + *rbio = NULL; + kfree(op); + bch2_write_ref_put(c, BCH_WRITE_REF_promote); + return NULL; +} + +noinline +static struct promote_op *promote_alloc(struct btree_trans *trans, + struct bvec_iter iter, + struct bkey_s_c k, + struct extent_ptr_decoded *pick, + struct bch_io_opts opts, + unsigned flags, + struct bch_read_bio **rbio, + bool *bounce, + bool *read_full) +{ + struct bch_fs *c = trans->c; + bool promote_full = *read_full || READ_ONCE(c->promote_whole_extents); + /* data might have to be decompressed in the write path: */ + unsigned sectors = promote_full + ? max(pick->crc.compressed_size, pick->crc.live_size) + : bvec_iter_sectors(iter); + struct bpos pos = promote_full + ? bkey_start_pos(k.k) + : POS(k.k->p.inode, iter.bi_sector); + struct promote_op *promote; + + if (!should_promote(c, k, pos, opts, flags)) + return NULL; + + promote = __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? 
BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, opts, sectors, rbio); + if (!promote) + return NULL; + + *bounce = true; + *read_full = promote_full; + return promote; +} + +/* Read */ + +#define READ_RETRY_AVOID 1 +#define READ_RETRY 2 +#define READ_ERR 3 + +enum rbio_context { + RBIO_CONTEXT_NULL, + RBIO_CONTEXT_HIGHPRI, + RBIO_CONTEXT_UNBOUND, +}; + +static inline struct bch_read_bio * +bch2_rbio_parent(struct bch_read_bio *rbio) +{ + return rbio->split ? rbio->parent : rbio; +} + +__always_inline +static void bch2_rbio_punt(struct bch_read_bio *rbio, work_func_t fn, + enum rbio_context context, + struct workqueue_struct *wq) +{ + if (context <= rbio->context) { + fn(&rbio->work); + } else { + rbio->work.func = fn; + rbio->context = context; + queue_work(wq, &rbio->work); + } +} + +static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) +{ + BUG_ON(rbio->bounce && !rbio->split); + + if (rbio->promote) + promote_free(rbio->c, rbio->promote); + rbio->promote = NULL; + + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + + if (rbio->split) { + struct bch_read_bio *parent = rbio->parent; + + if (rbio->kmalloc) + kfree(rbio); + else + bio_put(&rbio->bio); + + rbio = parent; + } + + return rbio; +} + +/* + * Only called on a top level bch_read_bio to complete an entire read request, + * not a split: + */ +static void bch2_rbio_done(struct bch_read_bio *rbio) +{ + if (rbio->start_time) + bch2_time_stats_update(&rbio->c->times[BCH_TIME_data_read], + rbio->start_time); + bio_endio(&rbio->bio); +} + +static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + int ret; + + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter, rbio->data_btree, + rbio->read_pos, BTREE_ITER_SLOTS); +retry: + rbio->bio.bi_status = 0; + + k = bch2_btree_iter_peek_slot(&iter); + if (bkey_err(k)) + goto err; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + bch2_trans_unlock(&trans); + + if (!bch2_bkey_matches_ptr(c, k, + rbio->pick.ptr, + rbio->data_pos.offset - + rbio->pick.crc.offset)) { + /* extent we wanted to read no longer exists: */ + rbio->hole = true; + goto out; + } + + ret = __bch2_read_extent(&trans, rbio, bvec_iter, + rbio->read_pos, + rbio->data_btree, + k, 0, failed, flags); + if (ret == READ_RETRY) + goto retry; + if (ret) + goto err; +out: + bch2_rbio_done(rbio); + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + return; +err: + rbio->bio.bi_status = BLK_STS_IOERR; + goto out; +} + +static void bch2_rbio_retry(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bvec_iter iter = rbio->bvec_iter; + unsigned flags = rbio->flags; + subvol_inum inum = { + .subvol = rbio->subvol, + .inum = rbio->read_pos.inode, + }; + struct bch_io_failures failed = { .nr = 0 }; + + trace_and_count(c, read_retry, &rbio->bio); + + if (rbio->retry == READ_RETRY_AVOID) + bch2_mark_io_failure(&failed, &rbio->pick); + + rbio->bio.bi_status = 0; + + rbio = bch2_rbio_free(rbio); + + flags |= BCH_READ_IN_RETRY; + flags &= ~BCH_READ_MAY_PROMOTE; + + if (flags & BCH_READ_NODECODE) { + bch2_read_retry_nodecode(c, 
rbio, iter, &failed, flags); + } else { + flags &= ~BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_MUST_CLONE; + + __bch2_read(c, rbio, iter, inum, &failed, flags); + } +} + +static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, + blk_status_t error) +{ + rbio->retry = retry; + + if (rbio->flags & BCH_READ_IN_RETRY) + return; + + if (retry == READ_ERR) { + rbio = bch2_rbio_free(rbio); + + rbio->bio.bi_status = error; + bch2_rbio_done(rbio); + } else { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); + } +} + +static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, + struct bch_read_bio *rbio) +{ + struct bch_fs *c = rbio->c; + u64 data_offset = rbio->data_pos.offset - rbio->pick.crc.offset; + struct bch_extent_crc_unpacked new_crc; + struct btree_iter iter; + struct bkey_i *new; + struct bkey_s_c k; + int ret = 0; + + if (crc_is_compressed(rbio->pick.crc)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, rbio->data_btree, rbio->data_pos, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + if ((ret = bkey_err(k))) + goto out; + + if (bversion_cmp(k.k->version, rbio->version) || + !bch2_bkey_matches_ptr(c, k, rbio->pick.ptr, data_offset)) + goto out; + + /* Extent was merged? */ + if (bkey_start_offset(k.k) < data_offset || + k.k->p.offset > data_offset + rbio->pick.crc.uncompressed_size) + goto out; + + if (bch2_rechecksum_bio(c, &rbio->bio, rbio->version, + rbio->pick.crc, NULL, &new_crc, + bkey_start_offset(k.k) - data_offset, k.k->size, + rbio->pick.crc.csum_type)) { + bch_err(c, "error verifying existing checksum while narrowing checksum (memory corruption?)"); + ret = 0; + goto out; + } + + /* + * going to be temporarily appending another checksum entry: + */ + new = bch2_trans_kmalloc(trans, bkey_bytes(k.k) + + sizeof(struct bch_extent_crc128)); + if ((ret = PTR_ERR_OR_ZERO(new))) + goto out; + + bkey_reassemble(new, k); + + if (!bch2_bkey_narrow_crcs(new, new_crc)) + goto out; + + ret = bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) +{ + bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_rbio_narrow_crcs(&trans, rbio)); +} + +/* Inner part that may run in process context */ +static void __bch2_read_endio(struct work_struct *work) +{ + struct bch_read_bio *rbio = + container_of(work, struct bch_read_bio, work); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct bio *src = &rbio->bio; + struct bio *dst = &bch2_rbio_parent(rbio)->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + unsigned nofs_flags; + struct bch_csum csum; + int ret; + + nofs_flags = memalloc_nofs_save(); + + /* Reset iterator for checksumming and copying bounced data: */ + if (rbio->bounce) { + src->bi_iter.bi_size = crc.compressed_size << 9; + src->bi_iter.bi_idx = 0; + src->bi_iter.bi_bvec_done = 0; + } else { + src->bi_iter = rbio->bvec_iter; + } + + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); + if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + goto csum_err; + + /* + * XXX + * We need to rework the narrow_crcs path to deliver the read completion + * first, and then punt to a different workqueue, otherwise we're + * holding up reads while doing btree updates which is bad for memory + * reclaim. 
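+ *
+ * In outline (a sketch of the intent only, not what this patch implements):
+ *
+ *	bch2_rbio_done(rbio);	- complete the read first
+ *	then punt __bch2_rbio_narrow_crcs() to its own work item on an
+ *	unbound workqueue, instead of calling it synchronously below.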
+ */ + if (unlikely(rbio->narrow_crcs)) + bch2_rbio_narrow_crcs(rbio); + + if (rbio->flags & BCH_READ_NODECODE) + goto nodecode; + + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc)) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, crc.offset << 9); + + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; + + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + + if (rbio->promote) { + /* + * Re encrypt data we decrypted, so it's consistent with + * rbio->crc: + */ + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; + + promote_start(rbio->promote, rbio); + rbio->promote = NULL; + } +nodecode: + if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + rbio = bch2_rbio_free(rbio); + bch2_rbio_done(rbio); + } +out: + memalloc_nofs_restore(nofs_flags); + return; +csum_err: + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { + rbio->flags |= BCH_READ_MUST_BOUNCE; + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); + goto out; + } + + bch_err_inum_offset_ratelimited(ca, + rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)", + rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo, + csum.hi, csum.lo, bch2_csum_types[crc.csum_type]); + bch2_io_error(ca); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; +decompression_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decompression error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +decrypt_err: + bch_err_inum_offset_ratelimited(c, rbio->read_pos.inode, + rbio->read_pos.offset << 9, + "decrypt error"); + bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + goto out; +} + +static void bch2_read_endio(struct bio *bio) +{ + struct bch_read_bio *rbio = + container_of(bio, struct bch_read_bio, bio); + struct bch_fs *c = rbio->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, rbio->pick.ptr.dev); + struct workqueue_struct *wq = NULL; + enum rbio_context context = RBIO_CONTEXT_NULL; + + if (rbio->have_ioref) { + bch2_latency_acct(ca, rbio->submit_time, READ); + percpu_ref_put(&ca->io_ref); + } + + if (!rbio->split) + rbio->bio.bi_end_io = rbio->end_io; + + if (bch2_dev_inum_io_err_on(bio->bi_status, ca, + rbio->read_pos.inode, + rbio->read_pos.offset, + "data read error: %s", + bch2_blk_status_to_str(bio->bi_status))) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + return; + } + + if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + ptr_stale(ca, &rbio->pick.ptr)) { + trace_and_count(c, read_reuse_race, &rbio->bio); + + if (rbio->flags & BCH_READ_RETRY_IF_STALE) + bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + else + bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + return; 
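+ /*
+  * Roughly: a stale pointer here means the bucket was reused (e.g. by
+  * copygc) after the index lookup; retrying goes back to the btree and
+  * picks a currently live copy, while readers that can't tolerate that
+  * (no BCH_READ_RETRY_IF_STALE) just get an error.
+  */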
+ } + + if (rbio->narrow_crcs || + rbio->promote || + crc_is_compressed(rbio->pick.crc) || + bch2_csum_type_is_encryption(rbio->pick.crc.csum_type)) + context = RBIO_CONTEXT_UNBOUND, wq = system_unbound_wq; + else if (rbio->pick.crc.csum_type) + context = RBIO_CONTEXT_HIGHPRI, wq = system_highpri_wq; + + bch2_rbio_punt(rbio, __bch2_read_endio, context, wq); +} + +int __bch2_read_indirect_extent(struct btree_trans *trans, + unsigned *offset_into_extent, + struct bkey_buf *orig_k) +{ + struct btree_iter iter; + struct bkey_s_c k; + u64 reflink_offset; + int ret; + + reflink_offset = le64_to_cpu(bkey_i_to_reflink_p(orig_k->k)->v.idx) + + *offset_into_extent; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_reflink, + POS(0, reflink_offset), 0); + ret = bkey_err(k); + if (ret) + goto err; + + if (k.k->type != KEY_TYPE_reflink_v && + k.k->type != KEY_TYPE_indirect_inline_data) { + bch_err_inum_offset_ratelimited(trans->c, + orig_k->k->k.p.inode, + orig_k->k->k.p.offset << 9, + "%llu len %u points to nonexistent indirect extent %llu", + orig_k->k->k.p.offset, + orig_k->k->k.size, + reflink_offset); + bch2_inconsistent_error(trans->c); + ret = -EIO; + goto err; + } + + *offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); + bch2_bkey_buf_reassemble(orig_k, trans->c, k); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, + struct bkey_s_c k, + struct bch_extent_ptr ptr) +{ + struct bch_fs *c = trans->c; + struct bch_dev *ca = bch_dev_bkey_exists(c, ptr.dev); + struct btree_iter iter; + struct printbuf buf = PRINTBUF; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + PTR_BUCKET_POS(c, &ptr), + BTREE_ITER_CACHED); + + prt_printf(&buf, "Attempting to read from stale dirty pointer:"); + printbuf_indent_add(&buf, 2); + prt_newline(&buf); + + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + + prt_printf(&buf, "memory gen: %u", *bucket_gen(ca, iter.pos.offset)); + + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + if (!ret) { + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + } + + bch2_fs_inconsistent(c, "%s", buf.buf); + + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf); +} + +int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, + struct bvec_iter iter, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, + struct bch_io_failures *failed, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct extent_ptr_decoded pick; + struct bch_read_bio *rbio = NULL; + struct bch_dev *ca = NULL; + struct promote_op *promote = NULL; + bool bounce = false, read_full = false, narrow_crcs = false; + struct bpos data_pos = bkey_start_pos(k.k); + int pick_ret; + + if (bkey_extent_is_inline_data(k.k)) { + unsigned bytes = min_t(unsigned, iter.bi_size, + bkey_inline_data_bytes(k.k)); + + swap(iter.bi_size, bytes); + memcpy_to_bio(&orig->bio, iter, bkey_inline_data_p(k)); + swap(iter.bi_size, bytes); + bio_advance_iter(&orig->bio, &iter, bytes); + zero_fill_bio_iter(&orig->bio, iter); + goto out_read_done; + } +retry_pick: + pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + + /* hole or reservation - just zero fill: */ + if (!pick_ret) + goto hole; + + if (pick_ret < 0) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, read_pos.offset << 9, + "no device to read from"); + goto err; + } + + ca = bch_dev_bkey_exists(c, pick.ptr.dev); + + /* + * Stale dirty 
pointers are treated as IO errors, but @failed isn't + * allocated unless we're in the retry path - so if we're not in the + * retry path, don't check here, it'll be caught in bch2_read_endio() + * and we'll end up in the retry path: + */ + if ((flags & BCH_READ_IN_RETRY) && + !pick.ptr.cached && + unlikely(ptr_stale(ca, &pick.ptr))) { + read_from_stale_dirty_pointer(trans, k, pick.ptr); + bch2_mark_io_failure(failed, &pick); + goto retry_pick; + } + + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + bch2_trans_unlock(trans); + + if (flags & BCH_READ_NODECODE) { + /* + * can happen if we retry, and the extent we were going to read + * has been merged in the meantime: + */ + if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) + goto hole; + + iter.bi_size = pick.crc.compressed_size << 9; + goto get_bio; + } + + if (!(flags & BCH_READ_LAST_FRAGMENT) || + bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_MUST_CLONE; + + narrow_crcs = !(flags & BCH_READ_IN_RETRY) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) + flags |= BCH_READ_MUST_BOUNCE; + + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_USER_MAPPED)) || + (flags & BCH_READ_MUST_BOUNCE)))) { + read_full = true; + bounce = true; + } + + if (orig->opts.promote_target) + promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, + &rbio, &bounce, &read_full); + + if (!read_full) { + EBUG_ON(crc_is_compressed(pick.crc)); + EBUG_ON(pick.crc.csum_type && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + bvec_iter_sectors(iter) != pick.crc.live_size || + pick.crc.offset || + offset_into_extent)); + + data_pos.offset += offset_into_extent; + pick.ptr.offset += pick.crc.offset + + offset_into_extent; + offset_into_extent = 0; + pick.crc.compressed_size = bvec_iter_sectors(iter); + pick.crc.uncompressed_size = bvec_iter_sectors(iter); + pick.crc.offset = 0; + pick.crc.live_size = bvec_iter_sectors(iter); + offset_into_extent = 0; + } +get_bio: + if (rbio) { + /* + * promote already allocated bounce rbio: + * promote needs to allocate a bio big enough for uncompressing + * data in the write path, but we're not going to use it all + * here: + */ + EBUG_ON(rbio->bio.bi_iter.bi_size < + pick.crc.compressed_size << 9); + rbio->bio.bi_iter.bi_size = + pick.crc.compressed_size << 9; + } else if (bounce) { + unsigned sectors = pick.crc.compressed_size; + + rbio = rbio_init(bio_alloc_bioset(NULL, + DIV_ROUND_UP(sectors, PAGE_SECTORS), + 0, + GFP_NOFS, + &c->bio_read_split), + orig->opts); + + bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); + rbio->bounce = true; + rbio->split = true; + } else if (flags & BCH_READ_MUST_CLONE) { + /* + * Have to clone if there were any splits, due to error + * reporting issues (if a split errored, and retrying didn't + * work, when it reports the error to its parent (us) we don't + * know if the error was from our bio, and we should retry, or + * from the whole bio, in which case we don't want to retry and + * lose the error) + */ + rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + &c->bio_read_split), + orig->opts); + rbio->bio.bi_iter = iter; + rbio->split = true; + } else { + rbio = orig; + rbio->bio.bi_iter = iter; + 
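+ /*
+  * No bounce buffer or clone needed - the read goes straight into the
+  * pages of the caller's bio:
+  */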
EBUG_ON(bio_flagged(&rbio->bio, BIO_CHAIN)); + } + + EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); + + rbio->c = c; + rbio->submit_time = local_clock(); + if (rbio->split) + rbio->parent = orig; + else + rbio->end_io = orig->bio.bi_end_io; + rbio->bvec_iter = iter; + rbio->offset_into_extent= offset_into_extent; + rbio->flags = flags; + rbio->have_ioref = pick_ret > 0 && bch2_dev_get_ioref(ca, READ); + rbio->narrow_crcs = narrow_crcs; + rbio->hole = 0; + rbio->retry = 0; + rbio->context = 0; + /* XXX: only initialize this if needed */ + rbio->devs_have = bch2_bkey_devs(k); + rbio->pick = pick; + rbio->subvol = orig->subvol; + rbio->read_pos = read_pos; + rbio->data_btree = data_btree; + rbio->data_pos = data_pos; + rbio->version = k.k->version; + rbio->promote = promote; + INIT_WORK(&rbio->work, NULL); + + rbio->bio.bi_opf = orig->bio.bi_opf; + rbio->bio.bi_iter.bi_sector = pick.ptr.offset; + rbio->bio.bi_end_io = bch2_read_endio; + + if (rbio->bounce) + trace_and_count(c, read_bounce, &rbio->bio); + + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); + + /* + * If it's being moved internally, we don't want to flag it as a cache + * hit: + */ + if (pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + bch2_bucket_io_time_reset(trans, pick.ptr.dev, + PTR_BUCKET_NR(ca, &pick.ptr), READ); + + if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + bio_inc_remaining(&orig->bio); + trace_and_count(c, read_split, &orig->bio); + } + + if (!rbio->pick.idx) { + if (!rbio->have_ioref) { + bch_err_inum_offset_ratelimited(c, + read_pos.inode, + read_pos.offset << 9, + "no device to read from"); + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_user], + bio_sectors(&rbio->bio)); + bio_set_dev(&rbio->bio, ca->disk_sb.bdev); + + if (unlikely(c->opts.no_data_io)) { + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } else { + if (likely(!(flags & BCH_READ_IN_RETRY))) + submit_bio(&rbio->bio); + else + submit_bio_wait(&rbio->bio); + } + + /* + * We just submitted IO which may block, we expect relock fail + * events and shouldn't count them: + */ + trans->notrace_relock_fail = true; + } else { + /* Attempting reconstruct read: */ + if (bch2_ec_read_extent(c, rbio)) { + bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + goto out; + } + + if (likely(!(flags & BCH_READ_IN_RETRY))) + bio_endio(&rbio->bio); + } +out: + if (likely(!(flags & BCH_READ_IN_RETRY))) { + return 0; + } else { + int ret; + + rbio->context = RBIO_CONTEXT_UNBOUND; + bch2_read_endio(&rbio->bio); + + ret = rbio->retry; + rbio = bch2_rbio_free(rbio); + + if (ret == READ_RETRY_AVOID) { + bch2_mark_io_failure(failed, &pick); + ret = READ_RETRY; + } + + if (!ret) + goto out_read_done; + + return ret; + } + +err: + if (flags & BCH_READ_IN_RETRY) + return READ_ERR; + + orig->bio.bi_status = BLK_STS_IOERR; + goto out_read_done; + +hole: + /* + * won't normally happen in the BCH_READ_NODECODE + * (bch2_move_extent()) path, but if we retry and the extent we wanted + * to read no longer exists we have to signal that: + */ + if (flags & BCH_READ_NODECODE) + orig->hole = true; + + zero_fill_bio_iter(&orig->bio, iter); +out_read_done: + if (flags & BCH_READ_LAST_FRAGMENT) + bch2_rbio_done(orig); + return 0; +} + +void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures 
*failed, unsigned flags) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_buf sk; + struct bkey_s_c k; + u32 snapshot; + int ret; + + BUG_ON(flags & BCH_READ_NODECODE); + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inum.subvol, &snapshot); + if (ret) + goto err; + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(inum.inum, bvec_iter.bi_sector, snapshot), + BTREE_ITER_SLOTS); + while (1) { + unsigned bytes, sectors, offset_into_extent; + enum btree_id data_btree = BTREE_ID_extents; + + /* + * read_extent -> io_time_reset may cause a transaction restart + * without returning an error, we need to check for that here: + */ + ret = bch2_trans_relock(&trans); + if (ret) + break; + + bch2_btree_iter_set_pos(&iter, + POS(inum.inum, bvec_iter.bi_sector)); + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + break; + + offset_into_extent = iter.pos.offset - + bkey_start_offset(k.k); + sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&sk, c, k); + + ret = bch2_read_indirect_extent(&trans, &data_btree, + &offset_into_extent, &sk); + if (ret) + break; + + k = bkey_i_to_s_c(sk.k); + + /* + * With indirect extents, the amount of data to read is the min + * of the original extent and the indirect extent: + */ + sectors = min(sectors, k.k->size - offset_into_extent); + + bytes = min(sectors, bvec_iter_sectors(bvec_iter)) << 9; + swap(bvec_iter.bi_size, bytes); + + if (bvec_iter.bi_size == bytes) + flags |= BCH_READ_LAST_FRAGMENT; + + ret = __bch2_read_extent(&trans, rbio, bvec_iter, iter.pos, + data_btree, k, + offset_into_extent, failed, flags); + if (ret) + break; + + if (flags & BCH_READ_LAST_FRAGMENT) + break; + + swap(bvec_iter.bi_size, bytes); + bio_advance_iter(&rbio->bio, &bvec_iter, bytes); + + ret = btree_trans_too_many_iters(&trans); + if (ret) + break; + } +err: + bch2_trans_iter_exit(&trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + ret == READ_RETRY || + ret == READ_RETRY_AVOID) + goto retry; + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + if (ret) { + bch_err_inum_offset_ratelimited(c, inum.inum, + bvec_iter.bi_sector << 9, + "read error %i from btree lookup", ret); + rbio->bio.bi_status = BLK_STS_IOERR; + bch2_rbio_done(rbio); + } +} + +void bch2_fs_io_exit(struct bch_fs *c) +{ + if (c->promote_table.tbl) + rhashtable_destroy(&c->promote_table); + mempool_exit(&c->bio_bounce_pages); + bioset_exit(&c->bio_write); + bioset_exit(&c->bio_read_split); + bioset_exit(&c->bio_read); +} + +int bch2_fs_io_init(struct bch_fs *c) +{ + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_init; + + if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_read_split_init; + + if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), + BIOSET_NEED_BVECS)) + return -BCH_ERR_ENOMEM_bio_write_init; + + if (mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0)) + return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + + if (rhashtable_init(&c->promote_table, &bch_promote_params)) + return -BCH_ERR_ENOMEM_promote_table_init; + + return 0; +} diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h new file mode 100644 index 
000000000..1476380d5 --- /dev/null +++ b/fs/bcachefs/io.h @@ -0,0 +1,202 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_H +#define _BCACHEFS_IO_H + +#include "checksum.h" +#include "bkey_buf.h" +#include "io_types.h" + +#define to_wbio(_bio) \ + container_of((_bio), struct bch_write_bio, bio) + +#define to_rbio(_bio) \ + container_of((_bio), struct bch_read_bio, bio) + +void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); +void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, + enum bch_data_type, const struct bkey_i *, bool); + +#define BLK_STS_REMOVED ((__force blk_status_t)128) + +const char *bch2_blk_status_to_str(blk_status_t); + +#define BCH_WRITE_FLAGS() \ + x(ALLOC_NOWAIT) \ + x(CACHED) \ + x(DATA_ENCODED) \ + x(PAGES_STABLE) \ + x(PAGES_OWNED) \ + x(ONLY_SPECIFIED_DEVS) \ + x(WROTE_DATA_INLINE) \ + x(FROM_INTERNAL) \ + x(CHECK_ENOSPC) \ + x(SYNC) \ + x(MOVE) \ + x(IN_WORKER) \ + x(DONE) \ + x(IO_ERROR) \ + x(CONVERT_UNWRITTEN) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = 1U << __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) +{ + return op->watermark == BCH_WATERMARK_copygc + ? op->c->copygc_wq + : op->c->btree_update_wq; +} + +int bch2_sum_sector_overwrites(struct btree_trans *, struct btree_iter *, + struct bkey_i *, bool *, s64 *, s64 *); +int bch2_extent_update(struct btree_trans *, subvol_inum, + struct btree_iter *, struct bkey_i *, + struct disk_reservation *, u64, s64 *, bool); +int bch2_extent_fallocate(struct btree_trans *, subvol_inum, struct btree_iter *, + unsigned, struct bch_io_opts, s64 *, + struct write_point_specifier); + +int bch2_fpunch_at(struct btree_trans *, struct btree_iter *, + subvol_inum, u64, s64 *); +int bch2_fpunch(struct bch_fs *c, subvol_inum, u64, u64, s64 *); + +static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, + struct bch_io_opts opts) +{ + op->c = c; + op->end_io = NULL; + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c, opts); + op->compression_opt = opts.compression; + op->nr_replicas = 0; + op->nr_replicas_required = c->opts.data_replicas_required; + op->watermark = BCH_WATERMARK_normal; + op->incompressible = 0; + op->open_buckets.nr = 0; + op->devs_have.nr = 0; + op->target = 0; + op->opts = opts; + op->subvol = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->new_i_size = U64_MAX; + op->i_sectors_delta = 0; + op->devs_need_flush = NULL; +} + +void bch2_write(struct closure *); + +void bch2_write_point_do_index_updates(struct work_struct *); + +static inline struct bch_write_bio *wbio_init(struct bio *bio) +{ + struct bch_write_bio *wbio = to_wbio(bio); + + memset(&wbio->wbio, 0, sizeof(wbio->wbio)); + return wbio; +} + +void bch2_write_op_to_text(struct printbuf *, struct bch_write_op *); + +struct bch_devs_mask; +struct cache_promote_op; +struct extent_ptr_decoded; + +int __bch2_read_indirect_extent(struct btree_trans *, unsigned *, + struct bkey_buf 
*); + +static inline int bch2_read_indirect_extent(struct btree_trans *trans, + enum btree_id *data_btree, + unsigned *offset_into_extent, + struct bkey_buf *k) +{ + if (k->k->k.type != KEY_TYPE_reflink_p) + return 0; + + *data_btree = BTREE_ID_reflink; + return __bch2_read_indirect_extent(trans, offset_into_extent, k); +} + +enum bch_read_flags { + BCH_READ_RETRY_IF_STALE = 1 << 0, + BCH_READ_MAY_PROMOTE = 1 << 1, + BCH_READ_USER_MAPPED = 1 << 2, + BCH_READ_NODECODE = 1 << 3, + BCH_READ_LAST_FRAGMENT = 1 << 4, + + /* internal: */ + BCH_READ_MUST_BOUNCE = 1 << 5, + BCH_READ_MUST_CLONE = 1 << 6, + BCH_READ_IN_RETRY = 1 << 7, +}; + +int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, + struct bvec_iter, struct bpos, enum btree_id, + struct bkey_s_c, unsigned, + struct bch_io_failures *, unsigned); + +static inline void bch2_read_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, struct bpos read_pos, + enum btree_id data_btree, struct bkey_s_c k, + unsigned offset_into_extent, unsigned flags) +{ + __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags); +} + +void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, struct bch_io_failures *, unsigned flags); + +static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, + subvol_inum inum) +{ + struct bch_io_failures failed = { .nr = 0 }; + + BUG_ON(rbio->_state); + + rbio->c = c; + rbio->start_time = local_clock(); + rbio->subvol = inum.subvol; + + __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, + BCH_READ_RETRY_IF_STALE| + BCH_READ_MAY_PROMOTE| + BCH_READ_USER_MAPPED); +} + +static inline struct bch_read_bio *rbio_init(struct bio *bio, + struct bch_io_opts opts) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->_state = 0; + rbio->promote = NULL; + rbio->opts = opts; + return rbio; +} + +void bch2_fs_io_exit(struct bch_fs *); +int bch2_fs_io_init(struct bch_fs *); + +#endif /* _BCACHEFS_IO_H */ diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h new file mode 100644 index 000000000..737f16d78 --- /dev/null +++ b/fs/bcachefs/io_types.h @@ -0,0 +1,165 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_IO_TYPES_H +#define _BCACHEFS_IO_TYPES_H + +#include "alloc_types.h" +#include "btree_types.h" +#include "buckets_types.h" +#include "extents_types.h" +#include "keylist_types.h" +#include "opts.h" +#include "super_types.h" + +#include +#include + +struct bch_read_bio { + struct bch_fs *c; + u64 start_time; + u64 submit_time; + + /* + * Reads will often have to be split, and if the extent being read from + * was checksummed or compressed we'll also have to allocate bounce + * buffers and copy the data back into the original bio. 
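+ *
+ * (For example, reading a few sectors from the middle of a compressed
+ * extent roughly means reading the whole compressed extent into a bounce
+ * buffer, decompressing it, and copying only the requested range back
+ * into the original bio.)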
+ * + * If we didn't have to split, we have to save and restore the original + * bi_end_io - @split below indicates which: + */ + union { + struct bch_read_bio *parent; + bio_end_io_t *end_io; + }; + + /* + * Saved copy of bio->bi_iter, from submission time - allows us to + * resubmit on IO error, and also to copy data back to the original bio + * when we're bouncing: + */ + struct bvec_iter bvec_iter; + + unsigned offset_into_extent; + + u16 flags; + union { + struct { + u16 bounce:1, + split:1, + kmalloc:1, + have_ioref:1, + narrow_crcs:1, + hole:1, + retry:2, + context:2; + }; + u16 _state; + }; + + struct bch_devs_list devs_have; + + struct extent_ptr_decoded pick; + + /* + * pos we read from - different from data_pos for indirect extents: + */ + u32 subvol; + struct bpos read_pos; + + /* + * start pos of data we read (may not be pos of data we want) - for + * promote, narrow extents paths: + */ + enum btree_id data_btree; + struct bpos data_pos; + struct bversion version; + + struct promote_op *promote; + + struct bch_io_opts opts; + + struct work_struct work; + + struct bio bio; +}; + +struct bch_write_bio { + struct_group(wbio, + struct bch_fs *c; + struct bch_write_bio *parent; + + u64 submit_time; + u64 inode_offset; + + struct bch_devs_list failed; + u8 dev; + + unsigned split:1, + bounce:1, + put_bio:1, + have_ioref:1, + nocow:1, + used_mempool:1, + first_btree_write:1; + ); + + struct bio bio; +}; + +struct bch_write_op { + struct closure cl; + struct bch_fs *c; + void (*end_io)(struct bch_write_op *); + u64 start_time; + + unsigned written; /* sectors */ + u16 flags; + s16 error; /* dio write path expects it to hold -ERESTARTSYS... */ + + unsigned compression_opt:8; + unsigned csum_type:4; + unsigned nr_replicas:4; + unsigned nr_replicas_required:4; + unsigned watermark:3; + unsigned incompressible:1; + unsigned stripe_waited:1; + + struct bch_devs_list devs_have; + u16 target; + u16 nonce; + struct bch_io_opts opts; + + u32 subvol; + struct bpos pos; + struct bversion version; + + /* For BCH_WRITE_DATA_ENCODED: */ + struct bch_extent_crc_unpacked crc; + + struct write_point_specifier write_point; + + struct write_point *wp; + struct list_head wp_list; + + struct disk_reservation res; + + struct open_buckets open_buckets; + + u64 new_i_size; + s64 i_sectors_delta; + + struct bch_devs_mask failed; + + struct keylist insert_keys; + u64 inline_keys[BKEY_EXTENT_U64s_MAX * 2]; + + /* + * Bitmask of devices that have had nocow writes issued to them since + * last flush: + */ + struct bch_devs_mask *devs_need_flush; + + /* Must be last: */ + struct bch_write_bio wbio; +}; + +#endif /* _BCACHEFS_IO_TYPES_H */ diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c new file mode 100644 index 000000000..80a612c05 --- /dev/null +++ b/fs/bcachefs/journal.c @@ -0,0 +1,1438 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs journalling code, for btree insertions + * + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "bkey_methods.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "buckets.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_sb.h" +#include "journal_seq_blacklist.h" +#include "trace.h" + +static const char * const bch2_journal_errors[] = { +#define x(n) #n, + JOURNAL_ERRORS() +#undef x + NULL +}; + +static inline bool journal_seq_unwritten(struct journal *j, u64 seq) +{ + return seq > j->seq_ondisk; +} + +static bool __journal_entry_is_open(union journal_res_state state) +{ + return state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL; +} + +static inline unsigned nr_unwritten_journal_entries(struct journal *j) +{ + return atomic64_read(&j->seq) - j->seq_ondisk; +} + +static bool journal_entry_is_open(struct journal *j) +{ + return __journal_entry_is_open(j->reservations); +} + +static inline struct journal_buf * +journal_seq_to_buf(struct journal *j, u64 seq) +{ + struct journal_buf *buf = NULL; + + EBUG_ON(seq > journal_cur_seq(j)); + + if (journal_seq_unwritten(j, seq)) { + buf = j->buf + (seq & JOURNAL_BUF_MASK); + EBUG_ON(le64_to_cpu(buf->data->seq) != seq); + } + return buf; +} + +static void journal_pin_list_init(struct journal_entry_pin_list *p, int count) +{ + unsigned i; + for (i = 0; i < ARRAY_SIZE(p->list); i++) + INIT_LIST_HEAD(&p->list[i]); + INIT_LIST_HEAD(&p->flushed); + atomic_set(&p->count, count); + p->devs.nr = 0; +} + +/* + * Detect stuck journal conditions and trigger shutdown. Technically the journal + * can end up stuck for a variety of reasons, such as a blocked I/O, journal + * reservation lockup, etc. Since this is a fatal error with potentially + * unpredictable characteristics, we want to be fairly conservative before we + * decide to shut things down. + * + * Consider the journal stuck when it appears full with no ability to commit + * btree transactions, to discard journal buckets, nor acquire priority + * (reserved watermark) reservation. + */ +static inline bool +journal_error_check_stuck(struct journal *j, int error, unsigned flags) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool stuck = false; + struct printbuf buf = PRINTBUF; + + if (!(error == JOURNAL_ERR_journal_full || + error == JOURNAL_ERR_journal_pin_full) || + nr_unwritten_journal_entries(j) || + (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) + return stuck; + + spin_lock(&j->lock); + + if (j->can_discard) { + spin_unlock(&j->lock); + return stuck; + } + + stuck = true; + + /* + * The journal shutdown path will set ->err_seq, but do it here first to + * serialize against concurrent failures and avoid duplicate error + * reports. + */ + if (j->err_seq) { + spin_unlock(&j->lock); + return stuck; + } + j->err_seq = journal_cur_seq(j); + spin_unlock(&j->lock); + + bch_err(c, "Journal stuck! 
Hava a pre-reservation but journal full (error %s)", + bch2_journal_errors[error]); + bch2_journal_debug_to_text(&buf, j); + bch_err(c, "%s", buf.buf); + + printbuf_reset(&buf); + bch2_journal_pins_to_text(&buf, j); + bch_err(c, "Journal pins:\n%s", buf.buf); + printbuf_exit(&buf); + + bch2_fatal_error(c); + dump_stack(); + + return stuck; +} + +/* journal entry close/open: */ + +void __bch2_journal_buf_put(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); +} + +/* + * Returns true if journal entry is now closed: + * + * We don't close a journal_buf until the next journal_buf is finished writing, + * and can be opened again - this also initializes the next journal_buf: + */ +static void __journal_entry_close(struct journal *j, unsigned closed_val) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = journal_cur_buf(j); + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + unsigned sectors; + + BUG_ON(closed_val != JOURNAL_ENTRY_CLOSED_VAL && + closed_val != JOURNAL_ENTRY_ERROR_VAL); + + lockdep_assert_held(&j->lock); + + do { + old.v = new.v = v; + new.cur_entry_offset = closed_val; + + if (old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL || + old.cur_entry_offset == new.cur_entry_offset) + return; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + if (!__journal_entry_is_open(old)) + return; + + /* Close out old buffer: */ + buf->data->u64s = cpu_to_le32(old.cur_entry_offset); + + sectors = vstruct_blocks_plus(buf->data, c->block_bits, + buf->u64s_reserved) << c->block_bits; + BUG_ON(sectors > buf->sectors); + buf->sectors = sectors; + + /* + * We have to set last_seq here, _before_ opening a new journal entry: + * + * A threads may replace an old pin with a new pin on their current + * journal reservation - the expectation being that the journal will + * contain either what the old pin protected or what the new pin + * protects. + * + * After the old pin is dropped journal_last_seq() won't include the old + * pin, so we can only write the updated last_seq on the entry that + * contains whatever the new pin protects. + * + * Restated, we can _not_ update last_seq for a given entry if there + * could be a newer entry open with reservations/pins that have been + * taken against it. 
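+ *
+ * (Illustrative example: suppose entry 10 is being closed and entry 11 is
+ * open. A thread holding a reservation on entry 11 may drop its pin on
+ * entry 7 and re-take it against entry 11; journal_last_seq() can then
+ * move past 7, but only entry 11 - the entry containing the replacement
+ * pin - may record that newer last_seq, not entry 10.)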
+ * + * Hence, we want update/set last_seq on the current journal entry right + * before we open a new one: + */ + buf->last_seq = journal_last_seq(j); + buf->data->last_seq = cpu_to_le64(buf->last_seq); + BUG_ON(buf->last_seq > le64_to_cpu(buf->data->seq)); + + __bch2_journal_pin_put(j, le64_to_cpu(buf->data->seq)); + + cancel_delayed_work(&j->write_work); + + bch2_journal_space_available(j); + + bch2_journal_buf_put(j, old.idx); +} + +void bch2_journal_halt(struct journal *j) +{ + spin_lock(&j->lock); + __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL); + if (!j->err_seq) + j->err_seq = journal_cur_seq(j); + journal_wake(j); + spin_unlock(&j->lock); +} + +static bool journal_entry_want_write(struct journal *j) +{ + bool ret = !journal_entry_is_open(j) || + journal_cur_seq(j) == journal_last_unwritten_seq(j); + + /* Don't close it yet if we already have a write in flight: */ + if (ret) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + else if (nr_unwritten_journal_entries(j)) { + struct journal_buf *buf = journal_cur_buf(j); + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + } + + return ret; +} + +static bool journal_entry_close(struct journal *j) +{ + bool ret; + + spin_lock(&j->lock); + ret = journal_entry_want_write(j); + spin_unlock(&j->lock); + + return ret; +} + +/* + * should _only_ called from journal_res_get() - when we actually want a + * journal reservation - journal entry is open means journal is dirty: + */ +static int journal_entry_open(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf = j->buf + + ((journal_cur_seq(j) + 1) & JOURNAL_BUF_MASK); + union journal_res_state old, new; + int u64s; + u64 v; + + lockdep_assert_held(&j->lock); + BUG_ON(journal_entry_is_open(j)); + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + if (j->blocked) + return JOURNAL_ERR_blocked; + + if (j->cur_entry_error) + return j->cur_entry_error; + + if (bch2_journal_error(j)) + return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + + if (!fifo_free(&j->pin)) + return JOURNAL_ERR_journal_pin_full; + + if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) + return JOURNAL_ERR_max_in_flight; + + BUG_ON(!j->cur_entry_sectors); + + buf->expires = + (journal_cur_seq(j) == j->flushed_seq_ondisk + ? 
jiffies + : j->last_flush_write) + + msecs_to_jiffies(c->opts.journal_flush_delay); + + buf->u64s_reserved = j->entry_u64s_reserved; + buf->disk_sectors = j->cur_entry_sectors; + buf->sectors = min(buf->disk_sectors, buf->buf_size >> 9); + + u64s = (int) (buf->sectors << 9) / sizeof(u64) - + journal_entry_overhead(j); + u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); + + if (u64s <= (ssize_t) j->early_journal_entries.nr) + return JOURNAL_ERR_journal_full; + + if (fifo_empty(&j->pin) && j->reclaim_thread) + wake_up_process(j->reclaim_thread); + + /* + * The fifo_push() needs to happen at the same time as j->seq is + * incremented for journal_last_seq() to be calculated correctly + */ + atomic64_inc(&j->seq); + journal_pin_list_init(fifo_push_ref(&j->pin), 1); + + BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); + + bkey_extent_init(&buf->key); + buf->noflush = false; + buf->must_flush = false; + buf->separate_flush = false; + buf->flush_time = 0; + + memset(buf->data, 0, sizeof(*buf->data)); + buf->data->seq = cpu_to_le64(journal_cur_seq(j)); + buf->data->u64s = 0; + + if (j->early_journal_entries.nr) { + memcpy(buf->data->_data, j->early_journal_entries.data, + j->early_journal_entries.nr * sizeof(u64)); + le32_add_cpu(&buf->data->u64s, j->early_journal_entries.nr); + } + + /* + * Must be set before marking the journal entry as open: + */ + j->cur_entry_u64s = u64s; + + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + + BUG_ON(old.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL); + + new.idx++; + BUG_ON(journal_state_count(new, new.idx)); + BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); + + journal_state_inc(&new); + + /* Handle any already added entries */ + new.cur_entry_offset = le32_to_cpu(buf->data->u64s); + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + if (j->res_get_blocked_start) + bch2_time_stats_update(j->blocked_time, + j->res_get_blocked_start); + j->res_get_blocked_start = 0; + + mod_delayed_work(c->io_complete_wq, + &j->write_work, + msecs_to_jiffies(c->opts.journal_flush_delay)); + journal_wake(j); + + if (j->early_journal_entries.nr) + darray_exit(&j->early_journal_entries); + return 0; +} + +static bool journal_quiesced(struct journal *j) +{ + bool ret = atomic64_read(&j->seq) == j->seq_ondisk; + + if (!ret) + journal_entry_close(j); + return ret; +} + +static void journal_quiesce(struct journal *j) +{ + wait_event(j->wait, journal_quiesced(j)); +} + +static void journal_write_work(struct work_struct *work) +{ + struct journal *j = container_of(work, struct journal, write_work.work); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + long delta; + + spin_lock(&j->lock); + if (!__journal_entry_is_open(j->reservations)) + goto unlock; + + delta = journal_cur_buf(j)->expires - jiffies; + + if (delta > 0) + mod_delayed_work(c->io_complete_wq, &j->write_work, delta); + else + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); +unlock: + spin_unlock(&j->lock); +} + +static int __journal_res_get(struct journal *j, struct journal_res *res, + unsigned flags) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *buf; + bool can_discard; + int ret; +retry: + if (journal_res_get_fast(j, res, flags)) + return 0; + + if (bch2_journal_error(j)) + return -BCH_ERR_erofs_journal_err; + + spin_lock(&j->lock); + + /* check once more in case somebody else shut things down... 
*/ + if (bch2_journal_error(j)) { + spin_unlock(&j->lock); + return -BCH_ERR_erofs_journal_err; + } + + /* + * Recheck after taking the lock, so we don't race with another thread + * that just did journal_entry_open() and call journal_entry_close() + * unnecessarily + */ + if (journal_res_get_fast(j, res, flags)) { + spin_unlock(&j->lock); + return 0; + } + + if ((flags & BCH_WATERMARK_MASK) < j->watermark) { + /* + * Don't want to close current journal entry, just need to + * invoke reclaim: + */ + ret = JOURNAL_ERR_journal_full; + goto unlock; + } + + /* + * If we couldn't get a reservation because the current buf filled up, + * and we had room for a bigger entry on disk, signal that we want to + * realloc the journal bufs: + */ + buf = journal_cur_buf(j); + if (journal_entry_is_open(j) && + buf->buf_size >> 9 < buf->disk_sectors && + buf->buf_size < JOURNAL_ENTRY_SIZE_MAX) + j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); + + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + ret = journal_entry_open(j); + + if (ret == JOURNAL_ERR_max_in_flight) + trace_and_count(c, journal_entry_full, c); +unlock: + if ((ret && ret != JOURNAL_ERR_insufficient_devices) && + !j->res_get_blocked_start) { + j->res_get_blocked_start = local_clock() ?: 1; + trace_and_count(c, journal_full, c); + } + + can_discard = j->can_discard; + spin_unlock(&j->lock); + + if (!ret) + goto retry; + if (journal_error_check_stuck(j, ret, flags)) + ret = -BCH_ERR_journal_res_get_blocked; + + /* + * Journal is full - can't rely on reclaim from work item due to + * freezing: + */ + if ((ret == JOURNAL_ERR_journal_full || + ret == JOURNAL_ERR_journal_pin_full) && + !(flags & JOURNAL_RES_GET_NONBLOCK)) { + if (can_discard) { + bch2_journal_do_discards(j); + goto retry; + } + + if (mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } + } + + return ret == JOURNAL_ERR_insufficient_devices + ? -BCH_ERR_erofs_journal_err + : -BCH_ERR_journal_res_get_blocked; +} + +/* + * Essentially the entry function to the journaling code. When bcachefs is doing + * a btree insert, it calls this function to get the current journal write. + * Journal write is the structure used set up journal writes. The calling + * function will then add its keys to the structure, queuing them for the next + * write. + * + * To ensure forward progress, the current task must not be holding any + * btree node write locks. 
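+ *
+ * Rough usage sketch (illustration only - simplified, not the actual
+ * btree update path):
+ *
+ *	struct journal_res res = { 0 };
+ *
+ *	ret = bch2_journal_res_get(j, &res, jset_u64s(u64s), 0);
+ *	if (ret)
+ *		return ret;
+ *	... copy keys into the reserved space in the journal buffer ...
+ *	bch2_journal_res_put(j, &res);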
+ */ +int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, + unsigned flags) +{ + int ret; + + closure_wait_event(&j->async_wait, + (ret = __journal_res_get(j, res, flags)) != + -BCH_ERR_journal_res_get_blocked|| + (flags & JOURNAL_RES_GET_NONBLOCK)); + return ret; +} + +/* journal_preres: */ + +static bool journal_preres_available(struct journal *j, + struct journal_preres *res, + unsigned new_u64s, + unsigned flags) +{ + bool ret = bch2_journal_preres_get_fast(j, res, new_u64s, flags, true); + + if (!ret && mutex_trylock(&j->reclaim_lock)) { + bch2_journal_reclaim(j); + mutex_unlock(&j->reclaim_lock); + } + + return ret; +} + +int __bch2_journal_preres_get(struct journal *j, + struct journal_preres *res, + unsigned new_u64s, + unsigned flags) +{ + int ret; + + closure_wait_event(&j->preres_wait, + (ret = bch2_journal_error(j)) || + journal_preres_available(j, res, new_u64s, flags)); + return ret; +} + +/* journal_entry_res: */ + +void bch2_journal_entry_res_resize(struct journal *j, + struct journal_entry_res *res, + unsigned new_u64s) +{ + union journal_res_state state; + int d = new_u64s - res->u64s; + + spin_lock(&j->lock); + + j->entry_u64s_reserved += d; + if (d <= 0) + goto out; + + j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); + smp_mb(); + state = READ_ONCE(j->reservations); + + if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && + state.cur_entry_offset > j->cur_entry_u64s) { + j->cur_entry_u64s += d; + /* + * Not enough room in current journal entry, have to flush it: + */ + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + } else { + journal_cur_buf(j)->u64s_reserved += d; + } +out: + spin_unlock(&j->lock); + res->u64s += d; +} + +/* journal flushing: */ + +/** + * bch2_journal_flush_seq_async - wait for a journal entry to be written + * + * like bch2_journal_wait_on_seq, except that it triggers a write immediately if + * necessary + */ +int bch2_journal_flush_seq_async(struct journal *j, u64 seq, + struct closure *parent) +{ + struct journal_buf *buf; + int ret = 0; + + if (seq <= j->flushed_seq_ondisk) + return 1; + + spin_lock(&j->lock); + + if (WARN_ONCE(seq > journal_cur_seq(j), + "requested to flush journal seq %llu, but currently at %llu", + seq, journal_cur_seq(j))) + goto out; + + /* Recheck under lock: */ + if (j->err_seq && seq >= j->err_seq) { + ret = -EIO; + goto out; + } + + if (seq <= j->flushed_seq_ondisk) { + ret = 1; + goto out; + } + + /* if seq was written, but not flushed - flush a newer one instead */ + seq = max(seq, journal_last_unwritten_seq(j)); + +recheck_need_open: + if (seq > journal_cur_seq(j)) { + struct journal_res res = { 0 }; + + if (journal_entry_is_open(j)) + __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL); + + spin_unlock(&j->lock); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; + + seq = res.seq; + buf = j->buf + (seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); + + bch2_journal_res_put(j, &res); + + spin_lock(&j->lock); + goto want_write; + } + + /* + * if write was kicked off without a flush, flush the next sequence + * number instead + */ + buf = journal_seq_to_buf(j, seq); + if (buf->noflush) { + seq++; + goto recheck_need_open; + } + + buf->must_flush = true; + + if (parent && !closure_wait(&buf->wait, parent)) + BUG(); +want_write: + if (seq == journal_cur_seq(j)) + 
journal_entry_want_write(j); +out: + spin_unlock(&j->lock); + return ret; +} + +int bch2_journal_flush_seq(struct journal *j, u64 seq) +{ + u64 start_time = local_clock(); + int ret, ret2; + + /* + * Don't update time_stats when @seq is already flushed: + */ + if (seq <= j->flushed_seq_ondisk) + return 0; + + ret = wait_event_interruptible(j->wait, (ret2 = bch2_journal_flush_seq_async(j, seq, NULL))); + + if (!ret) + bch2_time_stats_update(j->flush_seq_time, start_time); + + return ret ?: ret2 < 0 ? ret2 : 0; +} + +/* + * bch2_journal_flush_async - if there is an open journal entry, or a journal + * still being written, write it and wait for the write to complete + */ +void bch2_journal_flush_async(struct journal *j, struct closure *parent) +{ + bch2_journal_flush_seq_async(j, atomic64_read(&j->seq), parent); +} + +int bch2_journal_flush(struct journal *j) +{ + return bch2_journal_flush_seq(j, atomic64_read(&j->seq)); +} + +/* + * bch2_journal_noflush_seq - tell the journal not to issue any flushes before + * @seq + */ +bool bch2_journal_noflush_seq(struct journal *j, u64 seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + u64 unwritten_seq; + bool ret = false; + + if (!(c->sb.features & (1ULL << BCH_FEATURE_journal_no_flush))) + return false; + + if (seq <= c->journal.flushed_seq_ondisk) + return false; + + spin_lock(&j->lock); + if (seq <= c->journal.flushed_seq_ondisk) + goto out; + + for (unwritten_seq = journal_last_unwritten_seq(j); + unwritten_seq < seq; + unwritten_seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, unwritten_seq); + + /* journal write is already in flight, and was a flush write: */ + if (unwritten_seq == journal_last_unwritten_seq(j) && !buf->noflush) + goto out; + + buf->noflush = true; + } + + ret = true; +out: + spin_unlock(&j->lock); + return ret; +} + +int bch2_journal_meta(struct journal *j) +{ + struct journal_buf *buf; + struct journal_res res; + int ret; + + memset(&res, 0, sizeof(res)); + + ret = bch2_journal_res_get(j, &res, jset_u64s(0), 0); + if (ret) + return ret; + + buf = j->buf + (res.seq & JOURNAL_BUF_MASK); + buf->must_flush = true; + + if (!buf->flush_time) { + buf->flush_time = local_clock() ?: 1; + buf->expires = jiffies; + } + + bch2_journal_res_put(j, &res); + + return bch2_journal_flush_seq(j, res.seq); +} + +/* block/unlock the journal: */ + +void bch2_journal_unblock(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked--; + spin_unlock(&j->lock); + + journal_wake(j); +} + +void bch2_journal_block(struct journal *j) +{ + spin_lock(&j->lock); + j->blocked++; + spin_unlock(&j->lock); + + journal_quiesce(j); +} + +/* allocate journal on a device: */ + +static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, + bool new_fs, struct closure *cl) +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + u64 *new_bucket_seq = NULL, *new_buckets = NULL; + struct open_bucket **ob = NULL; + long *bu = NULL; + unsigned i, pos, nr_got = 0, nr_want = nr - ja->nr; + int ret = 0; + + BUG_ON(nr <= ja->nr); + + bu = kcalloc(nr_want, sizeof(*bu), GFP_KERNEL); + ob = kcalloc(nr_want, sizeof(*ob), GFP_KERNEL); + new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); + new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); + if (!bu || !ob || !new_buckets || !new_bucket_seq) { + ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + goto err_free; + } + + for (nr_got = 0; nr_got < nr_want; nr_got++) { + if (new_fs) { + bu[nr_got] = bch2_bucket_alloc_new_fs(ca); + if (bu[nr_got] < 0) { + ret = 
-BCH_ERR_ENOSPC_bucket_alloc; + break; + } + } else { + ob[nr_got] = bch2_bucket_alloc(c, ca, BCH_WATERMARK_normal, cl); + ret = PTR_ERR_OR_ZERO(ob[nr_got]); + if (ret) + break; + + ret = bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + ob[nr_got]->bucket, BCH_DATA_journal, + ca->mi.bucket_size)); + if (ret) { + bch2_open_bucket_put(c, ob[nr_got]); + bch_err(c, "error marking new journal buckets: %s", bch2_err_str(ret)); + break; + } + + bu[nr_got] = ob[nr_got]->bucket; + } + } + + if (!nr_got) + goto err_free; + + /* Don't return an error if we successfully allocated some buckets: */ + ret = 0; + + if (c) { + bch2_journal_flush_all_pins(&c->journal); + bch2_journal_block(&c->journal); + mutex_lock(&c->sb_lock); + } + + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); + + BUG_ON(ja->discard_idx > ja->nr); + + pos = ja->discard_idx ?: ja->nr; + + memmove(new_buckets + pos + nr_got, + new_buckets + pos, + sizeof(new_buckets[0]) * (ja->nr - pos)); + memmove(new_bucket_seq + pos + nr_got, + new_bucket_seq + pos, + sizeof(new_bucket_seq[0]) * (ja->nr - pos)); + + for (i = 0; i < nr_got; i++) { + new_buckets[pos + i] = bu[i]; + new_bucket_seq[pos + i] = 0; + } + + nr = ja->nr + nr_got; + + ret = bch2_journal_buckets_to_sb(c, ca, new_buckets, nr); + if (ret) + goto err_unblock; + + if (!new_fs) + bch2_write_super(c); + + /* Commit: */ + if (c) + spin_lock(&c->journal.lock); + + swap(new_buckets, ja->buckets); + swap(new_bucket_seq, ja->bucket_seq); + ja->nr = nr; + + if (pos <= ja->discard_idx) + ja->discard_idx = (ja->discard_idx + nr_got) % ja->nr; + if (pos <= ja->dirty_idx_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + nr_got) % ja->nr; + if (pos <= ja->dirty_idx) + ja->dirty_idx = (ja->dirty_idx + nr_got) % ja->nr; + if (pos <= ja->cur_idx) + ja->cur_idx = (ja->cur_idx + nr_got) % ja->nr; + + if (c) + spin_unlock(&c->journal.lock); +err_unblock: + if (c) { + bch2_journal_unblock(&c->journal); + mutex_unlock(&c->sb_lock); + } + + if (ret && !new_fs) + for (i = 0; i < nr_got; i++) + bch2_trans_run(c, + bch2_trans_mark_metadata_bucket(&trans, ca, + bu[i], BCH_DATA_free, 0)); +err_free: + if (!new_fs) + for (i = 0; i < nr_got; i++) + bch2_open_bucket_put(c, ob[i]); + + kfree(new_bucket_seq); + kfree(new_buckets); + kfree(ob); + kfree(bu); + return ret; +} + +/* + * Allocate more journal space at runtime - not currently making use if it, but + * the code works: + */ +int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) +{ + struct journal_device *ja = &ca->journal; + struct closure cl; + int ret = 0; + + closure_init_stack(&cl); + + down_write(&c->state_lock); + + /* don't handle reducing nr of buckets yet: */ + if (nr < ja->nr) + goto unlock; + + while (ja->nr < nr) { + struct disk_reservation disk_res = { 0, 0 }; + + /* + * note: journal buckets aren't really counted as _sectors_ used yet, so + * we don't need the disk reservation to avoid the BUG_ON() in buckets.c + * when space used goes up without a reservation - but we do need the + * reservation to ensure we'll actually be able to allocate: + * + * XXX: that's not right, disk reservations only ensure a + * filesystem-wide allocation will succeed, this is a device + * specific allocation - we can hang here: + */ + + ret = bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0); + if (ret) + break; + + ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); + + 
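+ /*
+  * Roughly: if the allocator couldn't give us a bucket yet
+  * (-BCH_ERR_bucket_alloc_blocked), closure_sync() below waits for one
+  * to become available and we go around the loop again; any other error
+  * ends the loop.
+  */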
bch2_disk_reservation_put(c, &disk_res); + + closure_sync(&cl); + + if (ret && ret != -BCH_ERR_bucket_alloc_blocked) + break; + } + + if (ret) + bch_err_fn(c, ret); +unlock: + up_write(&c->state_lock); + return ret; +} + +int bch2_dev_journal_alloc(struct bch_dev *ca) +{ + unsigned nr; + int ret; + + if (dynamic_fault("bcachefs:add:journal_alloc")) { + ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + goto err; + } + + /* 1/128th of the device by default: */ + nr = ca->mi.nbuckets >> 7; + + /* + * clamp journal size to 8192 buckets or 8GB (in sectors), whichever + * is smaller: + */ + nr = clamp_t(unsigned, nr, + BCH_JOURNAL_BUCKETS_MIN, + min(1 << 13, + (1 << 24) / ca->mi.bucket_size)); + + ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL); +err: + if (ret) + bch_err_fn(ca, ret); + return ret; +} + +/* startup/shutdown: */ + +static bool bch2_journal_writing_to_device(struct journal *j, unsigned dev_idx) +{ + bool ret = false; + u64 seq; + + spin_lock(&j->lock); + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j) && !ret; + seq++) { + struct journal_buf *buf = journal_seq_to_buf(j, seq); + + if (bch2_bkey_has_device_c(bkey_i_to_s_c(&buf->key), dev_idx)) + ret = true; + } + spin_unlock(&j->lock); + + return ret; +} + +void bch2_dev_journal_stop(struct journal *j, struct bch_dev *ca) +{ + wait_event(j->wait, !bch2_journal_writing_to_device(j, ca->dev_idx)); +} + +void bch2_fs_journal_stop(struct journal *j) +{ + bch2_journal_reclaim_stop(j); + bch2_journal_flush_all_pins(j); + + wait_event(j->wait, journal_entry_close(j)); + + /* + * Always write a new journal entry, to make sure the clock hands are up + * to date (and match the superblock) + */ + bch2_journal_meta(j); + + journal_quiesce(j); + + BUG_ON(!bch2_journal_error(j) && + test_bit(JOURNAL_REPLAY_DONE, &j->flags) && + j->last_empty_seq != journal_cur_seq(j)); + + cancel_delayed_work_sync(&j->write_work); +} + +int bch2_fs_journal_start(struct journal *j, u64 cur_seq) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin_list *p; + struct journal_replay *i, **_i; + struct genradix_iter iter; + bool had_entries = false; + unsigned ptr; + u64 last_seq = cur_seq, nr, seq; + + genradix_for_each_reverse(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + last_seq = le64_to_cpu(i->j.last_seq); + break; + } + + nr = cur_seq - last_seq; + + if (nr + 1 > j->pin.size) { + free_fifo(&j->pin); + init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -BCH_ERR_ENOMEM_journal_pin_fifo; + } + } + + j->replay_journal_seq = last_seq; + j->replay_journal_seq_end = cur_seq; + j->last_seq_ondisk = last_seq; + j->flushed_seq_ondisk = cur_seq - 1; + j->seq_ondisk = cur_seq - 1; + j->pin.front = last_seq; + j->pin.back = cur_seq; + atomic64_set(&j->seq, cur_seq - 1); + + fifo_for_each_entry_ptr(p, &j->pin, seq) + journal_pin_list_init(p, 1); + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + seq = le64_to_cpu(i->j.seq); + BUG_ON(seq >= cur_seq); + + if (seq < last_seq) + continue; + + if (journal_entry_empty(&i->j)) + j->last_empty_seq = le64_to_cpu(i->j.seq); + + p = journal_seq_pin(j, seq); + + p->devs.nr = 0; + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + bch2_dev_list_add_dev(&p->devs, i->ptrs[ptr].dev); + + had_entries = true; + } + + if (!had_entries) + j->last_empty_seq = cur_seq; + + 
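+ /*
+  * Roughly: the pin FIFO now covers [last_seq, cur_seq) with one
+  * reference per entry; what follows just publishes that state under
+  * j->lock and starts journal reclaim.
+  */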
spin_lock(&j->lock); + + set_bit(JOURNAL_STARTED, &j->flags); + j->last_flush_write = jiffies; + + j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); + j->reservations.unwritten_idx++; + + c->last_bucket_seq_cleanup = journal_cur_seq(j); + + bch2_journal_space_available(j); + spin_unlock(&j->lock); + + return bch2_journal_reclaim_start(j); +} + +/* init/exit: */ + +void bch2_dev_journal_exit(struct bch_dev *ca) +{ + kfree(ca->journal.bio); + kfree(ca->journal.buckets); + kfree(ca->journal.bucket_seq); + + ca->journal.bio = NULL; + ca->journal.buckets = NULL; + ca->journal.bucket_seq = NULL; +} + +int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) +{ + struct journal_device *ja = &ca->journal; + struct bch_sb_field_journal *journal_buckets = + bch2_sb_get_journal(sb); + struct bch_sb_field_journal_v2 *journal_buckets_v2 = + bch2_sb_get_journal_v2(sb); + unsigned i, nr_bvecs; + + ja->nr = 0; + + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + + for (i = 0; i < nr; i++) + ja->nr += le64_to_cpu(journal_buckets_v2->d[i].nr); + } else if (journal_buckets) { + ja->nr = bch2_nr_journal_buckets(journal_buckets); + } + + ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->bucket_seq) + return -BCH_ERR_ENOMEM_dev_journal_init; + + nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); + + ca->journal.bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!ca->journal.bio) + return -BCH_ERR_ENOMEM_dev_journal_init; + + bio_init(ca->journal.bio, NULL, ca->journal.bio->bi_inline_vecs, nr_bvecs, 0); + + ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); + if (!ja->buckets) + return -BCH_ERR_ENOMEM_dev_journal_init; + + if (journal_buckets_v2) { + unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); + unsigned j, dst = 0; + + for (i = 0; i < nr; i++) + for (j = 0; j < le64_to_cpu(journal_buckets_v2->d[i].nr); j++) + ja->buckets[dst++] = + le64_to_cpu(journal_buckets_v2->d[i].start) + j; + } else if (journal_buckets) { + for (i = 0; i < ja->nr; i++) + ja->buckets[i] = le64_to_cpu(journal_buckets->buckets[i]); + } + + return 0; +} + +void bch2_fs_journal_exit(struct journal *j) +{ + unsigned i; + + darray_exit(&j->early_journal_entries); + + for (i = 0; i < ARRAY_SIZE(j->buf); i++) + kvpfree(j->buf[i].data, j->buf[i].buf_size); + free_fifo(&j->pin); +} + +int bch2_fs_journal_init(struct journal *j) +{ + static struct lock_class_key res_key; + unsigned i; + + spin_lock_init(&j->lock); + spin_lock_init(&j->err_lock); + init_waitqueue_head(&j->wait); + INIT_DELAYED_WORK(&j->write_work, journal_write_work); + init_waitqueue_head(&j->reclaim_wait); + init_waitqueue_head(&j->pin_flush_wait); + mutex_init(&j->reclaim_lock); + mutex_init(&j->discard_lock); + + lockdep_init_map(&j->res_map, "journal res", &res_key, 0); + + atomic64_set(&j->reservations.counter, + ((union journal_res_state) + { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); + + if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) + return -BCH_ERR_ENOMEM_journal_pin_fifo; + + for (i = 0; i < ARRAY_SIZE(j->buf); i++) { + j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; + j->buf[i].data = kvpmalloc(j->buf[i].buf_size, GFP_KERNEL); + if (!j->buf[i].data) + return -BCH_ERR_ENOMEM_journal_buf; + } + + j->pin.front = j->pin.back = 1; + return 0; +} + +/* debug: */ + +void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + union journal_res_state 
s; + struct bch_dev *ca; + unsigned long now = jiffies; + u64 seq; + unsigned i; + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 24); + out->atomic++; + + rcu_read_lock(); + s = READ_ONCE(j->reservations); + + prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size); + prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j)); + prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk); + prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j)); + prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk); + prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk); + prt_printf(out, "prereserved:\t\t%u/%u\n", j->prereserved.reserved, j->prereserved.remaining); + prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]); + prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved); + prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes); + prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes); + prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); + prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); + prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked); + prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now) + ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); + prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); + prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry:\t\t"); + + switch (s.cur_entry_offset) { + case JOURNAL_ENTRY_ERROR_VAL: + prt_printf(out, "error"); + break; + case JOURNAL_ENTRY_CLOSED_VAL: + prt_printf(out, "closed"); + break; + default: + prt_printf(out, "%u/%u", s.cur_entry_offset, j->cur_entry_u64s); + break; + } + + prt_newline(out); + + for (seq = journal_cur_seq(j); + seq >= journal_last_unwritten_seq(j); + --seq) { + i = seq & JOURNAL_BUF_MASK; + + prt_printf(out, "unwritten entry:"); + prt_tab(out); + prt_printf(out, "%llu", seq); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "refcount:"); + prt_tab(out); + prt_printf(out, "%u", journal_state_count(s, i)); + prt_newline(out); + + prt_printf(out, "sectors:"); + prt_tab(out); + prt_printf(out, "%u", j->buf[i].sectors); + prt_newline(out); + + prt_printf(out, "expires"); + prt_tab(out); + prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies); + prt_newline(out); + + printbuf_indent_sub(out, 2); + } + + prt_printf(out, + "replay done:\t\t%i\n", + test_bit(JOURNAL_REPLAY_DONE, &j->flags)); + + prt_printf(out, "space:\n"); + prt_printf(out, "\tdiscarded\t%u:%u\n", + j->space[journal_space_discarded].next_entry, + j->space[journal_space_discarded].total); + prt_printf(out, "\tclean ondisk\t%u:%u\n", + j->space[journal_space_clean_ondisk].next_entry, + j->space[journal_space_clean_ondisk].total); + prt_printf(out, "\tclean\t\t%u:%u\n", + j->space[journal_space_clean].next_entry, + j->space[journal_space_clean].total); + prt_printf(out, "\ttotal\t\t%u:%u\n", + j->space[journal_space_total].next_entry, + j->space[journal_space_total].total); + + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + struct journal_device *ja = &ca->journal; + + if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d)) + continue; + + if (!ja->nr) + continue; + + prt_printf(out, "dev %u:\n", i); + prt_printf(out, "\tnr\t\t%u\n", ja->nr); + prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size); + prt_printf(out, 
"\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free); + prt_printf(out, "\tdiscard_idx\t%u\n", ja->discard_idx); + prt_printf(out, "\tdirty_ondisk\t%u (seq %llu)\n", ja->dirty_idx_ondisk, ja->bucket_seq[ja->dirty_idx_ondisk]); + prt_printf(out, "\tdirty_idx\t%u (seq %llu)\n", ja->dirty_idx, ja->bucket_seq[ja->dirty_idx]); + prt_printf(out, "\tcur_idx\t\t%u (seq %llu)\n", ja->cur_idx, ja->bucket_seq[ja->cur_idx]); + } + + rcu_read_unlock(); + + --out->atomic; +} + +void bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) +{ + spin_lock(&j->lock); + __bch2_journal_debug_to_text(out, j); + spin_unlock(&j->lock); +} + +bool bch2_journal_seq_pins_to_text(struct printbuf *out, struct journal *j, u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *pin; + unsigned i; + + spin_lock(&j->lock); + *seq = max(*seq, j->pin.front); + + if (*seq >= j->pin.back) { + spin_unlock(&j->lock); + return true; + } + + out->atomic++; + + pin_list = journal_seq_pin(j, *seq); + + prt_printf(out, "%llu: count %u", *seq, atomic_read(&pin_list->count)); + prt_newline(out); + printbuf_indent_add(out, 2); + + for (i = 0; i < ARRAY_SIZE(pin_list->list); i++) + list_for_each_entry(pin, &pin_list->list[i], list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); + } + + if (!list_empty(&pin_list->flushed)) { + prt_printf(out, "flushed:"); + prt_newline(out); + } + + list_for_each_entry(pin, &pin_list->flushed, list) { + prt_printf(out, "\t%px %ps", pin, pin->flush); + prt_newline(out); + } + + printbuf_indent_sub(out, 2); + + --out->atomic; + spin_unlock(&j->lock); + + return false; +} + +void bch2_journal_pins_to_text(struct printbuf *out, struct journal *j) +{ + u64 seq = 0; + + while (!bch2_journal_seq_pins_to_text(out, j, &seq)) + seq++; +} diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h new file mode 100644 index 000000000..008a2e25a --- /dev/null +++ b/fs/bcachefs/journal.h @@ -0,0 +1,526 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_H +#define _BCACHEFS_JOURNAL_H + +/* + * THE JOURNAL: + * + * The primary purpose of the journal is to log updates (insertions) to the + * b-tree, to avoid having to do synchronous updates to the b-tree on disk. + * + * Without the journal, the b-tree is always internally consistent on + * disk - and in fact, in the earliest incarnations bcache didn't have a journal + * but did handle unclean shutdowns by doing all index updates synchronously + * (with coalescing). + * + * Updates to interior nodes still happen synchronously and without the journal + * (for simplicity) - this may change eventually but updates to interior nodes + * are rare enough it's not a huge priority. + * + * This means the journal is relatively separate from the b-tree; it consists of + * just a list of keys and journal replay consists of just redoing those + * insertions in same order that they appear in the journal. + * + * PERSISTENCE: + * + * For synchronous updates (where we're waiting on the index update to hit + * disk), the journal entry will be written out immediately (or as soon as + * possible, if the write for the previous journal entry was still in flight). + * + * Synchronous updates are specified by passing a closure (@flush_cl) to + * bch2_btree_insert() or bch_btree_insert_node(), which then pass that parameter + * down to the journalling code. That closure will wait on the journal write to + * complete (via closure_wait()). 
+ * + * If the index update wasn't synchronous, the journal entry will be + * written out after 10 ms have elapsed, by default (the delay_ms field + * in struct journal). + * + * JOURNAL ENTRIES: + * + * A journal entry is variable size (struct jset), it's got a fixed length + * header and then a variable number of struct jset_entry entries. + * + * Journal entries are identified by monotonically increasing 64 bit sequence + * numbers - jset->seq; other places in the code refer to this sequence number. + * + * A jset_entry entry contains one or more bkeys (which is what gets inserted + * into the b-tree). We need a container to indicate which b-tree the key is + * for; also, the roots of the various b-trees are stored in jset_entry entries + * (one for each b-tree) - this lets us add new b-tree types without changing + * the on disk format. + * + * We also keep some things in the journal header that are logically part of the + * superblock - all the things that are frequently updated. This is for future + * bcache on raw flash support; the superblock (which will become another + * journal) can't be moved or wear leveled, so it contains just enough + * information to find the main journal, and the superblock only has to be + * rewritten when we want to move/wear level the main journal. + * + * JOURNAL LAYOUT ON DISK: + * + * The journal is written to a ringbuffer of buckets (which is kept in the + * superblock); the individual buckets are not necessarily contiguous on disk + * which means that journal entries are not allowed to span buckets, but also + * that we can resize the journal at runtime if desired (unimplemented). + * + * The journal buckets exist in the same pool as all the other buckets that are + * managed by the allocator and garbage collection - garbage collection marks + * the journal buckets as metadata buckets. + * + * OPEN/DIRTY JOURNAL ENTRIES: + * + * Open/dirty journal entries are journal entries that contain b-tree updates + * that have not yet been written out to the b-tree on disk. We have to track + * which journal entries are dirty, and we also have to avoid wrapping around + * the journal and overwriting old but still dirty journal entries with new + * journal entries. + * + * On disk, this is represented with the "last_seq" field of struct jset; + * last_seq is the first sequence number that journal replay has to replay. + * + * To avoid overwriting dirty journal entries on disk, we keep a mapping (in + * journal_device->seq) of for each journal bucket, the highest sequence number + * any journal entry it contains. Then, by comparing that against last_seq we + * can determine whether that journal bucket contains dirty journal entries or + * not. + * + * To track which journal entries are dirty, we maintain a fifo of refcounts + * (where each entry corresponds to a specific sequence number) - when a ref + * goes to 0, that journal entry is no longer dirty. + * + * Journalling of index updates is done at the same time as the b-tree itself is + * being modified (see btree_insert_key()); when we add the key to the journal + * the pending b-tree write takes a ref on the journal entry the key was added + * to. If a pending b-tree write would need to take refs on multiple dirty + * journal entries, it only keeps the ref on the oldest one (since a newer + * journal entry will still be replayed if an older entry was dirty). 
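+ *
+ * For example (illustrative numbers): if last_seq is 100 and the newest
+ * entry on disk is 110, entries 100-110 may still be needed by journal
+ * replay and must not be overwritten. A journal bucket whose highest
+ * contained sequence number is 95 holds only clean entries and may be
+ * reused; a bucket that contains sequence number 105 may not.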
+ * + * JOURNAL FILLING UP: + * + * There are two ways the journal could fill up; either we could run out of + * space to write to, or we could have too many open journal entries and run out + * of room in the fifo of refcounts. Since those refcounts are decremented + * without any locking we can't safely resize that fifo, so we handle it the + * same way. + * + * If the journal fills up, we start flushing dirty btree nodes until we can + * allocate space for a journal write again - preferentially flushing btree + * nodes that are pinning the oldest journal entries first. + */ + +#include + +#include "journal_types.h" + +struct bch_fs; + +static inline void journal_wake(struct journal *j) +{ + wake_up(&j->wait); + closure_wake_up(&j->async_wait); + closure_wake_up(&j->preres_wait); +} + +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + return j->buf + j->reservations.idx; +} + +/* Sequence number of oldest dirty journal entry */ + +static inline u64 journal_last_seq(struct journal *j) +{ + return j->pin.front; +} + +static inline u64 journal_cur_seq(struct journal *j) +{ + EBUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); + + return j->pin.back - 1; +} + +static inline u64 journal_last_unwritten_seq(struct journal *j) +{ + return j->seq_ondisk + 1; +} + +static inline int journal_state_count(union journal_res_state s, int idx) +{ + switch (idx) { + case 0: return s.buf0_count; + case 1: return s.buf1_count; + case 2: return s.buf2_count; + case 3: return s.buf3_count; + } + BUG(); +} + +static inline void journal_state_inc(union journal_res_state *s) +{ + s->buf0_count += s->idx == 0; + s->buf1_count += s->idx == 1; + s->buf2_count += s->idx == 2; + s->buf3_count += s->idx == 3; +} + +/* + * Amount of space that will be taken up by some keys in the journal (i.e. 
+ * including the jset header) + */ +static inline unsigned jset_u64s(unsigned u64s) +{ + return u64s + sizeof(struct jset_entry) / sizeof(u64); +} + +static inline int journal_entry_overhead(struct journal *j) +{ + return sizeof(struct jset) / sizeof(u64) + j->entry_u64s_reserved; +} + +static inline struct jset_entry * +bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) +{ + struct jset *jset = buf->data; + struct jset_entry *entry = vstruct_idx(jset, le32_to_cpu(jset->u64s)); + + memset(entry, 0, sizeof(*entry)); + entry->u64s = cpu_to_le16(u64s); + + le32_add_cpu(&jset->u64s, jset_u64s(u64s)); + + return entry; +} + +static inline struct jset_entry * +journal_res_entry(struct journal *j, struct journal_res *res) +{ + return vstruct_idx(j->buf[res->idx].data, res->offset); +} + +static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, + enum btree_id id, unsigned level, + unsigned u64s) +{ + entry->u64s = cpu_to_le16(u64s); + entry->btree_id = id; + entry->level = level; + entry->type = type; + entry->pad[0] = 0; + entry->pad[1] = 0; + entry->pad[2] = 0; + return jset_u64s(u64s); +} + +static inline unsigned journal_entry_set(struct jset_entry *entry, unsigned type, + enum btree_id id, unsigned level, + const void *data, unsigned u64s) +{ + unsigned ret = journal_entry_init(entry, type, id, level, u64s); + + memcpy_u64s_small(entry->_data, data, u64s); + return ret; +} + +static inline struct jset_entry * +bch2_journal_add_entry(struct journal *j, struct journal_res *res, + unsigned type, enum btree_id id, + unsigned level, unsigned u64s) +{ + struct jset_entry *entry = journal_res_entry(j, res); + unsigned actual = journal_entry_init(entry, type, id, level, u64s); + + EBUG_ON(!res->ref); + EBUG_ON(actual > res->u64s); + + res->offset += actual; + res->u64s -= actual; + return entry; +} + +static inline bool journal_entry_empty(struct jset *j) +{ + struct jset_entry *i; + + if (j->seq != j->last_seq) + return false; + + vstruct_for_each(j, i) + if (i->type == BCH_JSET_ENTRY_btree_keys && i->u64s) + return false; + return true; +} + +void __bch2_journal_buf_put(struct journal *); + +static inline void bch2_journal_buf_put(struct journal *j, unsigned idx) +{ + union journal_res_state s; + + s.v = atomic64_sub_return(((union journal_res_state) { + .buf0_count = idx == 0, + .buf1_count = idx == 1, + .buf2_count = idx == 2, + .buf3_count = idx == 3, + }).v, &j->reservations.counter); + + if (!journal_state_count(s, idx) && idx == s.unwritten_idx) + __bch2_journal_buf_put(j); +} + +/* + * This function releases the journal write structure so other threads can + * then proceed to add their keys as well. 
+ */ +static inline void bch2_journal_res_put(struct journal *j, + struct journal_res *res) +{ + if (!res->ref) + return; + + lock_release(&j->res_map, _THIS_IP_); + + while (res->u64s) + bch2_journal_add_entry(j, res, + BCH_JSET_ENTRY_btree_keys, + 0, 0, 0); + + bch2_journal_buf_put(j, res->idx); + + res->ref = 0; +} + +int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *, + unsigned); + +/* First bits for BCH_WATERMARK: */ +enum journal_res_flags { + __JOURNAL_RES_GET_NONBLOCK = BCH_WATERMARK_BITS, + __JOURNAL_RES_GET_CHECK, +}; + +#define JOURNAL_RES_GET_NONBLOCK (1 << __JOURNAL_RES_GET_NONBLOCK) +#define JOURNAL_RES_GET_CHECK (1 << __JOURNAL_RES_GET_CHECK) + +static inline int journal_res_get_fast(struct journal *j, + struct journal_res *res, + unsigned flags) +{ + union journal_res_state old, new; + u64 v = atomic64_read(&j->reservations.counter); + + do { + old.v = new.v = v; + + /* + * Check if there is still room in the current journal + * entry: + */ + if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) + return 0; + + EBUG_ON(!journal_state_count(new, new.idx)); + + if ((flags & BCH_WATERMARK_MASK) < j->watermark) + return 0; + + new.cur_entry_offset += res->u64s; + journal_state_inc(&new); + + /* + * If the refcount would overflow, we have to wait: + * XXX - tracepoint this: + */ + if (!journal_state_count(new, new.idx)) + return 0; + + if (flags & JOURNAL_RES_GET_CHECK) + return 1; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + res->ref = true; + res->idx = old.idx; + res->offset = old.cur_entry_offset; + res->seq = le64_to_cpu(j->buf[old.idx].data->seq); + return 1; +} + +static inline int bch2_journal_res_get(struct journal *j, struct journal_res *res, + unsigned u64s, unsigned flags) +{ + int ret; + + EBUG_ON(res->ref); + EBUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + + res->u64s = u64s; + + if (journal_res_get_fast(j, res, flags)) + goto out; + + ret = bch2_journal_res_get_slowpath(j, res, flags); + if (ret) + return ret; +out: + if (!(flags & JOURNAL_RES_GET_CHECK)) { + lock_acquire_shared(&j->res_map, 0, + (flags & JOURNAL_RES_GET_NONBLOCK) != 0, + NULL, _THIS_IP_); + EBUG_ON(!res->ref); + } + return 0; +} + +/* journal_preres: */ + +static inline void journal_set_watermark(struct journal *j) +{ + union journal_preres_state s = READ_ONCE(j->prereserved); + unsigned watermark = BCH_WATERMARK_stripe; + + if (fifo_free(&j->pin) < j->pin.size / 4) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); + if (fifo_free(&j->pin) < j->pin.size / 8) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + + if (s.reserved > s.remaining) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_copygc); + if (!s.remaining) + watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim); + + if (watermark == j->watermark) + return; + + swap(watermark, j->watermark); + if (watermark > j->watermark) + journal_wake(j); +} + +static inline void bch2_journal_preres_put(struct journal *j, + struct journal_preres *res) +{ + union journal_preres_state s = { .reserved = res->u64s }; + + if (!res->u64s) + return; + + s.v = atomic64_sub_return(s.v, &j->prereserved.counter); + res->u64s = 0; + + if (unlikely(s.waiting)) { + clear_bit(ilog2((((union journal_preres_state) { .waiting = 1 }).v)), + (unsigned long *) &j->prereserved.v); + closure_wake_up(&j->preres_wait); + } + + if (s.reserved <= s.remaining && j->watermark) + journal_set_watermark(j); +} + +int __bch2_journal_preres_get(struct journal *, + 
struct journal_preres *, unsigned, unsigned); + +static inline int bch2_journal_preres_get_fast(struct journal *j, + struct journal_preres *res, + unsigned new_u64s, + unsigned flags, + bool set_waiting) +{ + int d = new_u64s - res->u64s; + union journal_preres_state old, new; + u64 v = atomic64_read(&j->prereserved.counter); + enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; + int ret; + + do { + old.v = new.v = v; + ret = 0; + + if (watermark == BCH_WATERMARK_reclaim || + new.reserved + d < new.remaining) { + new.reserved += d; + ret = 1; + } else if (set_waiting && !new.waiting) + new.waiting = true; + else + return 0; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); + + if (ret) + res->u64s += d; + return ret; +} + +static inline int bch2_journal_preres_get(struct journal *j, + struct journal_preres *res, + unsigned new_u64s, + unsigned flags) +{ + if (new_u64s <= res->u64s) + return 0; + + if (bch2_journal_preres_get_fast(j, res, new_u64s, flags, false)) + return 0; + + if (flags & JOURNAL_RES_GET_NONBLOCK) + return -BCH_ERR_journal_preres_get_blocked; + + return __bch2_journal_preres_get(j, res, new_u64s, flags); +} + +/* journal_entry_res: */ + +void bch2_journal_entry_res_resize(struct journal *, + struct journal_entry_res *, + unsigned); + +int bch2_journal_flush_seq_async(struct journal *, u64, struct closure *); +void bch2_journal_flush_async(struct journal *, struct closure *); + +int bch2_journal_flush_seq(struct journal *, u64); +int bch2_journal_flush(struct journal *); +bool bch2_journal_noflush_seq(struct journal *, u64); +int bch2_journal_meta(struct journal *); + +void bch2_journal_halt(struct journal *); + +static inline int bch2_journal_error(struct journal *j) +{ + return j->reservations.cur_entry_offset == JOURNAL_ENTRY_ERROR_VAL + ? 
-EIO : 0; +} + +struct bch_dev; + +static inline void bch2_journal_set_replay_done(struct journal *j) +{ + BUG_ON(!test_bit(JOURNAL_STARTED, &j->flags)); + set_bit(JOURNAL_REPLAY_DONE, &j->flags); +} + +void bch2_journal_unblock(struct journal *); +void bch2_journal_block(struct journal *); + +void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_debug_to_text(struct printbuf *, struct journal *); +void bch2_journal_pins_to_text(struct printbuf *, struct journal *); +bool bch2_journal_seq_pins_to_text(struct printbuf *, struct journal *, u64 *); + +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, + unsigned nr); +int bch2_dev_journal_alloc(struct bch_dev *); + +void bch2_dev_journal_stop(struct journal *, struct bch_dev *); + +void bch2_fs_journal_stop(struct journal *); +int bch2_fs_journal_start(struct journal *, u64); + +void bch2_dev_journal_exit(struct bch_dev *); +int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); +void bch2_fs_journal_exit(struct journal *); +int bch2_fs_journal_init(struct journal *); + +#endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c new file mode 100644 index 000000000..f861ae2f1 --- /dev/null +++ b/fs/bcachefs/journal_io.c @@ -0,0 +1,1863 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_io.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "checksum.h" +#include "disk_groups.h" +#include "error.h" +#include "io.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "replicas.h" +#include "trace.h" + +static struct nonce journal_nonce(const struct jset *jset) +{ + return (struct nonce) {{ + [0] = 0, + [1] = ((__le32 *) &jset->seq)[0], + [2] = ((__le32 *) &jset->seq)[1], + [3] = BCH_NONCE_JOURNAL, + }}; +} + +static bool jset_csum_good(struct bch_fs *c, struct jset *j) +{ + return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) && + !bch2_crc_cmp(j->csum, + csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j)); +} + +static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq) +{ + return (seq - c->journal_entries_base_seq) & (~0U >> 1); +} + +static void __journal_replay_free(struct bch_fs *c, + struct journal_replay *i) +{ + struct journal_replay **p = + genradix_ptr(&c->journal_entries, + journal_entry_radix_idx(c, le64_to_cpu(i->j.seq))); + + BUG_ON(*p != i); + *p = NULL; + kvpfree(i, offsetof(struct journal_replay, j) + + vstruct_bytes(&i->j)); +} + +static void journal_replay_free(struct bch_fs *c, struct journal_replay *i) +{ + i->ignore = true; + + if (!c->opts.read_entire_journal) + __journal_replay_free(c, i); +} + +struct journal_list { + struct closure cl; + u64 last_seq; + struct mutex lock; + int ret; +}; + +#define JOURNAL_ENTRY_ADD_OK 0 +#define JOURNAL_ENTRY_ADD_OUT_OF_RANGE 5 + +/* + * Given a journal entry we just read, add it to the list of journal entries to + * be replayed: + */ +static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, + struct journal_ptr entry_ptr, + struct journal_list *jlist, struct jset *j) +{ + struct genradix_iter iter; + struct journal_replay **_i, *i, *dup; + struct journal_ptr *ptr; + size_t bytes = vstruct_bytes(j); + u64 last_seq = !JSET_NO_FLUSH(j) ? le64_to_cpu(j->last_seq) : 0; + int ret = JOURNAL_ENTRY_ADD_OK; + + /* Is this entry older than the range we need? 
*/ + if (!c->opts.read_entire_journal && + le64_to_cpu(j->seq) < jlist->last_seq) + return JOURNAL_ENTRY_ADD_OUT_OF_RANGE; + + /* + * genradixes are indexed by a ulong, not a u64, so we can't index them + * by sequence number directly: Assume instead that they will all fall + * within the range of +-2billion of the filrst one we find. + */ + if (!c->journal_entries_base_seq) + c->journal_entries_base_seq = max_t(s64, 1, le64_to_cpu(j->seq) - S32_MAX); + + /* Drop entries we don't need anymore */ + if (last_seq > jlist->last_seq && !c->opts.read_entire_journal) { + genradix_for_each_from(&c->journal_entries, iter, _i, + journal_entry_radix_idx(c, jlist->last_seq)) { + i = *_i; + + if (!i || i->ignore) + continue; + + if (le64_to_cpu(i->j.seq) >= last_seq) + break; + journal_replay_free(c, i); + } + } + + jlist->last_seq = max(jlist->last_seq, last_seq); + + _i = genradix_ptr_alloc(&c->journal_entries, + journal_entry_radix_idx(c, le64_to_cpu(j->seq)), + GFP_KERNEL); + if (!_i) + return -BCH_ERR_ENOMEM_journal_entry_add; + + /* + * Duplicate journal entries? If so we want the one that didn't have a + * checksum error: + */ + dup = *_i; + if (dup) { + if (bytes == vstruct_bytes(&dup->j) && + !memcmp(j, &dup->j, bytes)) { + i = dup; + goto found; + } + + if (!entry_ptr.csum_good) { + i = dup; + goto found; + } + + if (!dup->csum_good) + goto replace; + + fsck_err(c, "found duplicate but non identical journal entries (seq %llu)", + le64_to_cpu(j->seq)); + i = dup; + goto found; + } +replace: + i = kvpmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); + if (!i) + return -BCH_ERR_ENOMEM_journal_entry_add; + + i->nr_ptrs = 0; + i->csum_good = entry_ptr.csum_good; + i->ignore = false; + unsafe_memcpy(&i->j, j, bytes, "embedded variable length struct"); + i->ptrs[i->nr_ptrs++] = entry_ptr; + + if (dup) { + if (dup->nr_ptrs >= ARRAY_SIZE(dup->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + dup->nr_ptrs = ARRAY_SIZE(dup->ptrs) - 1; + } + + /* The first ptr should represent the jset we kept: */ + memcpy(i->ptrs + i->nr_ptrs, + dup->ptrs, + sizeof(dup->ptrs[0]) * dup->nr_ptrs); + i->nr_ptrs += dup->nr_ptrs; + __journal_replay_free(c, dup); + } + + *_i = i; + return 0; +found: + for (ptr = i->ptrs; ptr < i->ptrs + i->nr_ptrs; ptr++) { + if (ptr->dev == ca->dev_idx) { + bch_err(c, "duplicate journal entry %llu on same device", + le64_to_cpu(i->j.seq)); + goto out; + } + } + + if (i->nr_ptrs >= ARRAY_SIZE(i->ptrs)) { + bch_err(c, "found too many copies of journal entry %llu", + le64_to_cpu(i->j.seq)); + goto out; + } + + i->ptrs[i->nr_ptrs++] = entry_ptr; +out: +fsck_err: + return ret; +} + +/* this fills in a range with empty jset_entries: */ +static void journal_entry_null_range(void *start, void *end) +{ + struct jset_entry *entry; + + for (entry = start; entry != end; entry = vstruct_next(entry)) + memset(entry, 0, sizeof(*entry)); +} + +#define JOURNAL_ENTRY_REREAD 5 +#define JOURNAL_ENTRY_NONE 6 +#define JOURNAL_ENTRY_BAD 7 + +static void journal_entry_err_msg(struct printbuf *out, + struct jset *jset, + struct jset_entry *entry) +{ + prt_str(out, "invalid journal entry "); + if (entry) + prt_printf(out, "%s ", bch2_jset_entry_types[entry->type]); + + if (!jset) + prt_printf(out, "in superblock"); + else if (!entry) + prt_printf(out, "at seq %llu", le64_to_cpu(jset->seq)); + else + prt_printf(out, "at offset %zi/%u seq %llu", + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + le64_to_cpu(jset->seq)); + prt_str(out, ": "); +} + 
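A minimal standalone sketch (not part of the patch; values are illustrative) of the sequence-number-to-index mapping done by journal_entry_radix_idx() above: the offset from the base sequence number is masked to 31 bits, which is why all entries are assumed to fall within roughly +-2 billion of the first one found.

#include <stdint.h>
#include <stdio.h>

/*
 * Same arithmetic as journal_entry_radix_idx(): genradixes are indexed by
 * ulong, not u64, so use the offset from a base seq, masked to 31 bits.
 */
static uint32_t radix_idx(uint64_t base_seq, uint64_t seq)
{
	return (seq - base_seq) & (~0U >> 1);
}

int main(void)
{
	uint64_t base = 5000;	/* stands in for c->journal_entries_base_seq */

	printf("%u %u %u\n",
	       radix_idx(base, 5000),		/* -> 0 */
	       radix_idx(base, 5001),		/* -> 1 */
	       radix_idx(base, 5000 + 2500));	/* -> 2500 */
	return 0;
}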
+#define journal_entry_err(c, jset, entry, msg, ...) \ +({ \ + struct printbuf buf = PRINTBUF; \ + \ + journal_entry_err_msg(&buf, jset, entry); \ + prt_printf(&buf, msg, ##__VA_ARGS__); \ + \ + switch (write) { \ + case READ: \ + mustfix_fsck_err(c, "%s", buf.buf); \ + break; \ + case WRITE: \ + bch_err(c, "corrupt metadata before write: %s\n", buf.buf);\ + if (bch2_fs_inconsistent(c)) { \ + ret = -BCH_ERR_fsck_errors_not_fixed; \ + goto fsck_err; \ + } \ + break; \ + } \ + \ + printbuf_exit(&buf); \ + true; \ +}) + +#define journal_entry_err_on(cond, c, jset, entry, msg, ...) \ + ((cond) ? journal_entry_err(c, jset, entry, msg, ##__VA_ARGS__) : false) + +#define FSCK_DELETED_KEY 5 + +static int journal_validate_key(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned level, enum btree_id btree_id, + struct bkey_i *k, + unsigned version, int big_endian, int write) +{ + void *next = vstruct_next(entry); + struct printbuf buf = PRINTBUF; + int ret = 0; + + if (journal_entry_err_on(!k->k.u64s, c, jset, entry, "k->u64s 0")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return FSCK_DELETED_KEY; + } + + if (journal_entry_err_on((void *) bkey_next(k) > + (void *) vstruct_next(entry), + c, jset, entry, + "extends past end of journal entry")) { + entry->u64s = cpu_to_le16((u64 *) k - entry->_data); + journal_entry_null_range(vstruct_next(entry), next); + return FSCK_DELETED_KEY; + } + + if (journal_entry_err_on(k->k.format != KEY_FORMAT_CURRENT, + c, jset, entry, + "bad format %u", k->k.format)) { + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + return FSCK_DELETED_KEY; + } + + if (!write) + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); + + if (bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write, &buf)) { + printbuf_reset(&buf); + prt_printf(&buf, "invalid journal entry %s at offset %zi/%u seq %llu:", + bch2_jset_entry_types[entry->type], + (u64 *) entry - jset->_data, + le32_to_cpu(jset->u64s), + le64_to_cpu(jset->seq)); + prt_newline(&buf); + printbuf_indent_add(&buf, 2); + + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + prt_newline(&buf); + bch2_bkey_invalid(c, bkey_i_to_s_c(k), + __btree_node_type(level, btree_id), write, &buf); + + mustfix_fsck_err(c, "%s", buf.buf); + + le16_add_cpu(&entry->u64s, -((u16) k->k.u64s)); + memmove(k, bkey_next(k), next - (void *) bkey_next(k)); + journal_entry_null_range(vstruct_next(entry), next); + + printbuf_exit(&buf); + return FSCK_DELETED_KEY; + } + + if (write) + bch2_bkey_compat(level, btree_id, version, big_endian, + write, NULL, bkey_to_packed(k)); +fsck_err: + printbuf_exit(&buf); + return ret; +} + +static int journal_entry_btree_keys_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + struct bkey_i *k = entry->start; + + while (k != vstruct_last(entry)) { + int ret = journal_validate_key(c, jset, entry, + entry->level, + entry->btree_id, + k, version, big_endian, + write|BKEY_INVALID_JOURNAL); + if (ret == FSCK_DELETED_KEY) + continue; + + k = bkey_next(k); + } + + return 0; +} + +static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct bkey_i *k; + bool first = true; + + jset_entry_for_each_key(entry, k) { + if (!first) { 
+ prt_newline(out); + prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + } + prt_printf(out, "btree=%s l=%u ", bch2_btree_ids[entry->btree_id], entry->level); + bch2_bkey_val_to_text(out, c, bkey_i_to_s_c(k)); + first = false; + } +} + +static int journal_entry_btree_root_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + struct bkey_i *k = entry->start; + int ret = 0; + + if (journal_entry_err_on(!entry->u64s || + le16_to_cpu(entry->u64s) != k->k.u64s, + c, jset, entry, + "invalid btree root journal entry: wrong number of keys")) { + void *next = vstruct_next(entry); + /* + * we don't want to null out this jset_entry, + * just the contents, so that later we can tell + * we were _supposed_ to have a btree root + */ + entry->u64s = 0; + journal_entry_null_range(vstruct_next(entry), next); + return 0; + } + + return journal_validate_key(c, jset, entry, 1, entry->btree_id, k, + version, big_endian, write); +fsck_err: + return ret; +} + +static void journal_entry_btree_root_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + +static int journal_entry_prio_ptrs_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + /* obsolete, don't care: */ + return 0; +} + +static void journal_entry_prio_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ +} + +static int journal_entry_blacklist_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 1, + c, jset, entry, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } +fsck_err: + return ret; +} + +static void journal_entry_blacklist_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_blacklist *bl = + container_of(entry, struct jset_entry_blacklist, entry); + + prt_printf(out, "seq=%llu", le64_to_cpu(bl->seq)); +} + +static int journal_entry_blacklist_v2_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + struct jset_entry_blacklist_v2 *bl_entry; + int ret = 0; + + if (journal_entry_err_on(le16_to_cpu(entry->u64s) != 2, + c, jset, entry, + "invalid journal seq blacklist entry: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + goto out; + } + + bl_entry = container_of(entry, struct jset_entry_blacklist_v2, entry); + + if (journal_entry_err_on(le64_to_cpu(bl_entry->start) > + le64_to_cpu(bl_entry->end), + c, jset, entry, + "invalid journal seq blacklist entry: start > end")) { + journal_entry_null_range(entry, vstruct_next(entry)); + } +out: +fsck_err: + return ret; +} + +static void journal_entry_blacklist_v2_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_blacklist_v2 *bl = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + prt_printf(out, "start=%llu end=%llu", + le64_to_cpu(bl->start), + le64_to_cpu(bl->end)); +} + +static int journal_entry_usage_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + unsigned bytes = 
jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u), + c, jset, entry, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static void journal_entry_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + prt_printf(out, "type=%s v=%llu", + bch2_fs_usage_types[u->entry.btree_id], + le64_to_cpu(u->v)); +} + +static int journal_entry_data_usage_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes < sizeof(*u) || + bytes < sizeof(*u) + u->r.nr_devs, + c, jset, entry, + "invalid journal entry usage: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static void journal_entry_data_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + + bch2_replicas_entry_to_text(out, &u->r); + prt_printf(out, "=%llu", le64_to_cpu(u->v)); +} + +static int journal_entry_clock_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), + c, jset, entry, "bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(clock->rw > 1, + c, jset, entry, "bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static void journal_entry_clock_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + prt_printf(out, "%s=%llu", clock->rw ? 
"write" : "read", le64_to_cpu(clock->time)); +} + +static int journal_entry_dev_usage_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + unsigned expected = sizeof(*u); + unsigned dev; + int ret = 0; + + if (journal_entry_err_on(bytes < expected, + c, jset, entry, "bad size (%u < %u)", + bytes, expected)) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + dev = le32_to_cpu(u->dev); + + if (journal_entry_err_on(!bch2_dev_exists2(c, dev), + c, jset, entry, "bad dev")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if (journal_entry_err_on(u->pad, + c, jset, entry, "bad pad")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + +static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + prt_printf(out, "dev=%u", le32_to_cpu(u->dev)); + + for (i = 0; i < nr_types; i++) { + if (i < BCH_DATA_NR) + prt_printf(out, " %s", bch2_data_types[i]); + else + prt_printf(out, " (unknown data type %u)", i); + prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu", + le64_to_cpu(u->d[i].buckets), + le64_to_cpu(u->d[i].sectors), + le64_to_cpu(u->d[i].fragmented)); + } + + prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec)); +} + +static int journal_entry_log_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + return 0; +} + +static void journal_entry_log_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); + unsigned bytes = vstruct_bytes(entry) - offsetof(struct jset_entry_log, d); + + prt_printf(out, "%.*s", bytes, l->d); +} + +static int journal_entry_overwrite_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, READ); +} + +static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + +struct jset_entry_ops { + int (*validate)(struct bch_fs *, struct jset *, + struct jset_entry *, unsigned, int, int); + void (*to_text)(struct printbuf *, struct bch_fs *, struct jset_entry *); +}; + +static const struct jset_entry_ops bch2_jset_entry_ops[] = { +#define x(f, nr) \ + [BCH_JSET_ENTRY_##f] = (struct jset_entry_ops) { \ + .validate = journal_entry_##f##_validate, \ + .to_text = journal_entry_##f##_to_text, \ + }, + BCH_JSET_ENTRY_TYPES() +#undef x +}; + +int bch2_journal_entry_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, int write) +{ + return entry->type < BCH_JSET_ENTRY_NR + ? 
bch2_jset_entry_ops[entry->type].validate(c, jset, entry, + version, big_endian, write) + : 0; +} + +void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + if (entry->type < BCH_JSET_ENTRY_NR) { + prt_printf(out, "%s: ", bch2_jset_entry_types[entry->type]); + bch2_jset_entry_ops[entry->type].to_text(out, c, entry); + } else { + prt_printf(out, "(unknown type %u)", entry->type); + } +} + +static int jset_validate_entries(struct bch_fs *c, struct jset *jset, + int write) +{ + struct jset_entry *entry; + int ret = 0; + + vstruct_for_each(jset, entry) { + if (journal_entry_err_on(vstruct_next(entry) > + vstruct_last(jset), c, jset, entry, + "journal entry extends past end of jset")) { + jset->u64s = cpu_to_le32((u64 *) entry - jset->_data); + break; + } + + ret = bch2_journal_entry_validate(c, jset, entry, + le32_to_cpu(jset->version), + JSET_BIG_ENDIAN(jset), write); + if (ret) + break; + } +fsck_err: + return ret; +} + +static int jset_validate(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, + int write) +{ + unsigned version; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + "%s sector %llu seq %llu: incompatible journal entry version %u.%u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version))) { + /* don't try to continue: */ + return -EINVAL; + } + + if (journal_entry_err_on(!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(jset)), + c, jset, NULL, + "%s sector %llu seq %llu: journal entry with unknown csum type %llu", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), + JSET_CSUM_TYPE(jset))) + ret = JOURNAL_ENTRY_BAD; + + /* last_seq is ignored when JSET_NO_FLUSH is true */ + if (journal_entry_err_on(!JSET_NO_FLUSH(jset) && + le64_to_cpu(jset->last_seq) > le64_to_cpu(jset->seq), + c, jset, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(jset->last_seq), + le64_to_cpu(jset->seq))) { + jset->last_seq = jset->seq; + return JOURNAL_ENTRY_BAD; + } + + ret = jset_validate_entries(c, jset, write); +fsck_err: + return ret; +} + +static int jset_validate_early(struct bch_fs *c, + struct bch_dev *ca, + struct jset *jset, u64 sector, + unsigned bucket_sectors_left, + unsigned sectors_read) +{ + size_t bytes = vstruct_bytes(jset); + unsigned version; + int write = READ; + int ret = 0; + + if (le64_to_cpu(jset->magic) != jset_magic(c)) + return JOURNAL_ENTRY_NONE; + + version = le32_to_cpu(jset->version); + if (journal_entry_err_on(!bch2_version_compatible(version), c, jset, NULL, + "%s sector %llu seq %llu: unknown journal entry version %u.%u", + ca ? ca->name : c->name, + sector, le64_to_cpu(jset->seq), + BCH_VERSION_MAJOR(version), + BCH_VERSION_MINOR(version))) { + /* don't try to continue: */ + return -EINVAL; + } + + if (bytes > (sectors_read << 9) && + sectors_read < bucket_sectors_left) + return JOURNAL_ENTRY_REREAD; + + if (journal_entry_err_on(bytes > bucket_sectors_left << 9, + c, jset, NULL, + "%s sector %llu seq %llu: journal entry too big (%zu bytes)", + ca ? 
ca->name : c->name, + sector, le64_to_cpu(jset->seq), bytes)) + le32_add_cpu(&jset->u64s, + -((bytes - (bucket_sectors_left << 9)) / 8)); +fsck_err: + return ret; +} + +struct journal_read_buf { + void *data; + size_t size; +}; + +static int journal_read_buf_realloc(struct journal_read_buf *b, + size_t new_size) +{ + void *n; + + /* the bios are sized for this many pages, max: */ + if (new_size > JOURNAL_ENTRY_SIZE_MAX) + return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + + new_size = roundup_pow_of_two(new_size); + n = kvpmalloc(new_size, GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + + kvpfree(b->data, b->size); + b->data = n; + b->size = new_size; + return 0; +} + +static int journal_read_bucket(struct bch_dev *ca, + struct journal_read_buf *buf, + struct journal_list *jlist, + unsigned bucket) +{ + struct bch_fs *c = ca->fs; + struct journal_device *ja = &ca->journal; + struct jset *j = NULL; + unsigned sectors, sectors_read = 0; + u64 offset = bucket_to_sector(ca, ja->buckets[bucket]), + end = offset + ca->mi.bucket_size; + bool saw_bad = false, csum_good; + int ret = 0; + + pr_debug("reading %u", bucket); + + while (offset < end) { + if (!sectors_read) { + struct bio *bio; + unsigned nr_bvecs; +reread: + sectors_read = min_t(unsigned, + end - offset, buf->size >> 9); + nr_bvecs = buf_pages(buf->data, sectors_read << 9); + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); + + bio->bi_iter.bi_sector = offset; + bch2_bio_map(bio, buf->data, sectors_read << 9); + + ret = submit_bio_wait(bio); + kfree(bio); + + if (bch2_dev_io_err_on(ret, ca, + "journal read error: sector %llu", + offset) || + bch2_meta_read_fault("journal")) { + /* + * We don't error out of the recovery process + * here, since the relevant journal entry may be + * found on a different device, and missing or + * no journal entries will be handled later + */ + return 0; + } + + j = buf->data; + } + + ret = jset_validate_early(c, ca, j, offset, + end - offset, sectors_read); + switch (ret) { + case 0: + sectors = vstruct_sectors(j, c->block_bits); + break; + case JOURNAL_ENTRY_REREAD: + if (vstruct_bytes(j) > buf->size) { + ret = journal_read_buf_realloc(buf, + vstruct_bytes(j)); + if (ret) + return ret; + } + goto reread; + case JOURNAL_ENTRY_NONE: + if (!saw_bad) + return 0; + /* + * On checksum error we don't really trust the size + * field of the journal entry we read, so try reading + * again at next block boundary: + */ + sectors = block_sectors(c); + goto next_block; + default: + return ret; + } + + /* + * This happens sometimes if we don't have discards on - + * when we've partially overwritten a bucket with new + * journal entries. 
We don't need the rest of the + * bucket: + */ + if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) + return 0; + + ja->bucket_seq[bucket] = le64_to_cpu(j->seq); + + csum_good = jset_csum_good(c, j); + if (!csum_good) + saw_bad = true; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), + j->encrypted_start, + vstruct_end(j) - (void *) j->encrypted_start); + bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret); + + mutex_lock(&jlist->lock); + ret = journal_entry_add(c, ca, (struct journal_ptr) { + .csum_good = csum_good, + .dev = ca->dev_idx, + .bucket = bucket, + .bucket_offset = offset - + bucket_to_sector(ca, ja->buckets[bucket]), + .sector = offset, + }, jlist, j); + mutex_unlock(&jlist->lock); + + switch (ret) { + case JOURNAL_ENTRY_ADD_OK: + break; + case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: + break; + default: + return ret; + } +next_block: + pr_debug("next"); + offset += sectors; + sectors_read -= sectors; + j = ((void *) j) + (sectors << 9); + } + + return 0; +} + +static void bch2_journal_read_device(struct closure *cl) +{ + struct journal_device *ja = + container_of(cl, struct journal_device, read); + struct bch_dev *ca = container_of(ja, struct bch_dev, journal); + struct bch_fs *c = ca->fs; + struct journal_list *jlist = + container_of(cl->parent, struct journal_list, cl); + struct journal_replay *r, **_r; + struct genradix_iter iter; + struct journal_read_buf buf = { NULL, 0 }; + unsigned i; + int ret = 0; + + if (!ja->nr) + goto out; + + ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + if (ret) + goto err; + + pr_debug("%u journal buckets", ja->nr); + + for (i = 0; i < ja->nr; i++) { + ret = journal_read_bucket(ca, &buf, jlist, i); + if (ret) + goto err; + } + + ja->sectors_free = ca->mi.bucket_size; + + mutex_lock(&jlist->lock); + genradix_for_each_reverse(&c->journal_entries, iter, _r) { + r = *_r; + + if (!r) + continue; + + for (i = 0; i < r->nr_ptrs; i++) { + if (r->ptrs[i].dev == ca->dev_idx) { + unsigned wrote = bucket_remainder(ca, r->ptrs[i].sector) + + vstruct_sectors(&r->j, c->block_bits); + + ja->cur_idx = r->ptrs[i].bucket; + ja->sectors_free = ca->mi.bucket_size - wrote; + goto found; + } + } + } +found: + mutex_unlock(&jlist->lock); + + if (ja->bucket_seq[ja->cur_idx] && + ja->sectors_free == ca->mi.bucket_size) { + bch_err(c, "ja->sectors_free == ca->mi.bucket_size"); + bch_err(c, "cur_idx %u/%u", ja->cur_idx, ja->nr); + for (i = 0; i < 3; i++) { + unsigned idx = (ja->cur_idx + ja->nr - 1 + i) % ja->nr; + bch_err(c, "bucket_seq[%u] = %llu", idx, ja->bucket_seq[idx]); + } + ja->sectors_free = 0; + } + + /* + * Set dirty_idx to indicate the entire journal is full and needs to be + * reclaimed - journal reclaim will immediately reclaim whatever isn't + * pinned when it first runs: + */ + ja->discard_idx = ja->dirty_idx_ondisk = + ja->dirty_idx = (ja->cur_idx + 1) % ja->nr; +out: + bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); + kvpfree(buf.data, buf.size); + percpu_ref_put(&ca->io_ref); + closure_return(cl); + return; +err: + mutex_lock(&jlist->lock); + jlist->ret = ret; + mutex_unlock(&jlist->lock); + goto out; +} + +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + unsigned i; + + for (i = 0; i < j->nr_ptrs; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, j->ptrs[i].dev); + u64 offset; + + div64_u64_rem(j->ptrs[i].sector, ca->mi.bucket_size, &offset); + + if (i) + prt_printf(out, " "); + prt_printf(out, "%u:%u:%u (sector %llu)", + j->ptrs[i].dev, + 
j->ptrs[i].bucket, + j->ptrs[i].bucket_offset, + j->ptrs[i].sector); + } +} + +int bch2_journal_read(struct bch_fs *c, + u64 *last_seq, + u64 *blacklist_seq, + u64 *start_seq) +{ + struct journal_list jlist; + struct journal_replay *i, **_i, *prev = NULL; + struct genradix_iter radix_iter; + struct bch_dev *ca; + unsigned iter; + struct printbuf buf = PRINTBUF; + bool degraded = false, last_write_torn = false; + u64 seq; + int ret = 0; + + closure_init_stack(&jlist.cl); + mutex_init(&jlist.lock); + jlist.last_seq = 0; + jlist.ret = 0; + + for_each_member_device(ca, c, iter) { + if (!c->opts.fsck && + !(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal))) + continue; + + if ((ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro) && + percpu_ref_tryget(&ca->io_ref)) + closure_call(&ca->journal.read, + bch2_journal_read_device, + system_unbound_wq, + &jlist.cl); + else + degraded = true; + } + + closure_sync(&jlist.cl); + + if (jlist.ret) + return jlist.ret; + + *last_seq = 0; + *start_seq = 0; + *blacklist_seq = 0; + + /* + * Find most recent flush entry, and ignore newer non flush entries - + * those entries will be blacklisted: + */ + genradix_for_each_reverse(&c->journal_entries, radix_iter, _i) { + int write = READ; + + i = *_i; + + if (!i || i->ignore) + continue; + + if (!*start_seq) + *blacklist_seq = *start_seq = le64_to_cpu(i->j.seq) + 1; + + if (JSET_NO_FLUSH(&i->j)) { + i->ignore = true; + continue; + } + + if (!last_write_torn && !i->csum_good) { + last_write_torn = true; + i->ignore = true; + continue; + } + + if (journal_entry_err_on(le64_to_cpu(i->j.last_seq) > le64_to_cpu(i->j.seq), + c, &i->j, NULL, + "invalid journal entry: last_seq > seq (%llu > %llu)", + le64_to_cpu(i->j.last_seq), + le64_to_cpu(i->j.seq))) + i->j.last_seq = i->j.seq; + + *last_seq = le64_to_cpu(i->j.last_seq); + *blacklist_seq = le64_to_cpu(i->j.seq) + 1; + break; + } + + if (!*start_seq) { + bch_info(c, "journal read done, but no entries found"); + return 0; + } + + if (!*last_seq) { + fsck_err(c, "journal read done, but no entries found after dropping non-flushes"); + return 0; + } + + bch_info(c, "journal read done, replaying entries %llu-%llu", + *last_seq, *blacklist_seq - 1); + + if (*start_seq != *blacklist_seq) + bch_info(c, "dropped unflushed entries %llu-%llu", + *blacklist_seq, *start_seq - 1); + + /* Drop blacklisted entries and entries older than last_seq: */ + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + seq = le64_to_cpu(i->j.seq); + if (seq < *last_seq) { + journal_replay_free(c, i); + continue; + } + + if (bch2_journal_seq_is_blacklisted(c, seq, true)) { + fsck_err_on(!JSET_NO_FLUSH(&i->j), c, + "found blacklisted journal entry %llu", seq); + i->ignore = true; + } + } + + /* Check for missing entries: */ + seq = *last_seq; + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + u64 missing_start, missing_end; + struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; + + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (prev) { + bch2_journal_ptrs_to_text(&buf1, c, prev); + prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); + } 
else + prt_printf(&buf1, "(none)"); + bch2_journal_ptrs_to_text(&buf2, c, i); + + missing_end = seq - 1; + fsck_err(c, "journal entries %llu-%llu missing! (replaying %llu-%llu)\n" + " prev at %s\n" + " next at %s", + missing_start, missing_end, + *last_seq, *blacklist_seq - 1, + buf1.buf, buf2.buf); + + printbuf_exit(&buf1); + printbuf_exit(&buf2); + } + + prev = i; + seq++; + } + + genradix_for_each(&c->journal_entries, radix_iter, _i) { + struct bch_replicas_padded replicas = { + .e.data_type = BCH_DATA_journal, + .e.nr_required = 1, + }; + unsigned ptr; + + i = *_i; + if (!i || i->ignore) + continue; + + for (ptr = 0; ptr < i->nr_ptrs; ptr++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev); + + if (!i->ptrs[ptr].csum_good) + bch_err_dev_offset(ca, i->ptrs[ptr].sector, + "invalid journal checksum, seq %llu%s", + le64_to_cpu(i->j.seq), + i->csum_good ? " (had good copy on another device)" : ""); + } + + ret = jset_validate(c, + bch_dev_bkey_exists(c, i->ptrs[0].dev), + &i->j, + i->ptrs[0].sector, + READ); + if (ret) + goto err; + + for (ptr = 0; ptr < i->nr_ptrs; ptr++) + replicas.e.devs[replicas.e.nr_devs++] = i->ptrs[ptr].dev; + + bch2_replicas_entry_sort(&replicas.e); + + /* + * If we're mounting in degraded mode - if we didn't read all + * the devices - this is wrong: + */ + + printbuf_reset(&buf); + bch2_replicas_entry_to_text(&buf, &replicas.e); + + if (!degraded && + fsck_err_on(!bch2_replicas_marked(c, &replicas.e), c, + "superblock not marked as containing replicas %s", + buf.buf)) { + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) + goto err; + } + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* journal write: */ + +static void __journal_write_alloc(struct journal *j, + struct journal_buf *w, + struct dev_alloc_list *devs_sorted, + unsigned sectors, + unsigned *replicas, + unsigned replicas_want) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_device *ja; + struct bch_dev *ca; + unsigned i; + + if (*replicas >= replicas_want) + return; + + for (i = 0; i < devs_sorted->nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted->devs[i]]); + if (!ca) + continue; + + ja = &ca->journal; + + /* + * Check that we can use this device, and aren't already using + * it: + */ + if (!ca->mi.durability || + ca->mi.state != BCH_MEMBER_STATE_rw || + !ja->nr || + bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || + sectors > ja->sectors_free) + continue; + + bch2_dev_stripe_increment(ca, &j->wp.stripe); + + bch2_bkey_append_ptr(&w->key, + (struct bch_extent_ptr) { + .offset = bucket_to_sector(ca, + ja->buckets[ja->cur_idx]) + + ca->mi.bucket_size - + ja->sectors_free, + .dev = ca->dev_idx, + }); + + ja->sectors_free -= sectors; + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + + *replicas += ca->mi.durability; + + if (*replicas >= replicas_want) + break; + } +} + +/** + * journal_next_bucket - move on to the next journal bucket if possible + */ +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned sectors) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_devs_mask devs; + struct journal_device *ja; + struct bch_dev *ca; + struct dev_alloc_list devs_sorted; + unsigned target = c->opts.metadata_target ?: + c->opts.foreground_target; + unsigned i, replicas = 0, replicas_want = + READ_ONCE(c->opts.metadata_replicas); + + rcu_read_lock(); +retry: + devs = target_rw_devs(c, BCH_DATA_journal, target); + + devs_sorted = bch2_dev_alloc_list(c, 
&j->wp.stripe, &devs); + + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); + + if (replicas >= replicas_want) + goto done; + + for (i = 0; i < devs_sorted.nr; i++) { + ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); + if (!ca) + continue; + + ja = &ca->journal; + + if (sectors > ja->sectors_free && + sectors <= ca->mi.bucket_size && + bch2_journal_dev_buckets_available(j, ja, + journal_space_discarded)) { + ja->cur_idx = (ja->cur_idx + 1) % ja->nr; + ja->sectors_free = ca->mi.bucket_size; + + /* + * ja->bucket_seq[ja->cur_idx] must always have + * something sensible: + */ + ja->bucket_seq[ja->cur_idx] = le64_to_cpu(w->data->seq); + } + } + + __journal_write_alloc(j, w, &devs_sorted, + sectors, &replicas, replicas_want); + + if (replicas < replicas_want && target) { + /* Retry from all devices: */ + target = 0; + goto retry; + } +done: + rcu_read_unlock(); + + BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); + + return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS; +} + +static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) +{ + /* we aren't holding j->lock: */ + unsigned new_size = READ_ONCE(j->buf_size_want); + void *new_buf; + + if (buf->buf_size >= new_size) + return; + + new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN); + if (!new_buf) + return; + + memcpy(new_buf, buf->data, buf->buf_size); + + spin_lock(&j->lock); + swap(buf->data, new_buf); + swap(buf->buf_size, new_size); + spin_unlock(&j->lock); + + kvpfree(new_buf, new_size); +} + +static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) +{ + return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); +} + +static void journal_write_done(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_buf *w = journal_last_unwritten_buf(j); + union journal_res_state old, new; + u64 v, seq; + int err = 0; + + bch2_time_stats_update(!JSET_NO_FLUSH(w->data) + ? 
j->flush_write_time + : j->noflush_write_time, j->write_start_time); + + if (!w->devs_written.nr) { + bch_err(c, "unable to write journal to sufficient devices"); + err = -EIO; + } + if (err) + bch2_fatal_error(c); + + spin_lock(&j->lock); + seq = le64_to_cpu(w->data->seq); + + if (seq >= j->pin.front) + journal_seq_pin(j, seq)->devs = w->devs_written; + + if (!err) { + if (!JSET_NO_FLUSH(w->data)) { + j->flushed_seq_ondisk = seq; + j->last_seq_ondisk = w->last_seq; + + bch2_do_discards(c); + closure_wake_up(&c->freelist_wait); + + bch2_reset_alloc_cursors(c); + } + } else if (!j->err_seq || seq < j->err_seq) + j->err_seq = seq; + + j->seq_ondisk = seq; + + /* + * Updating last_seq_ondisk may let bch2_journal_reclaim_work() discard + * more buckets: + * + * Must come before signaling write completion, for + * bch2_fs_journal_stop(): + */ + if (j->watermark != BCH_WATERMARK_stripe) + journal_reclaim_kick(&c->journal); + + /* also must come before signalling write completion: */ + closure_debug_destroy(cl); + + v = atomic64_read(&j->reservations.counter); + do { + old.v = new.v = v; + BUG_ON(journal_state_count(new, new.unwritten_idx)); + + new.unwritten_idx++; + } while ((v = atomic64_cmpxchg(&j->reservations.counter, + old.v, new.v)) != old.v); + + bch2_journal_space_available(j); + + closure_wake_up(&w->wait); + journal_wake(j); + + if (!journal_state_count(new, new.unwritten_idx) && + journal_last_unwritten_seq(j) <= journal_cur_seq(j)) { + closure_call(&j->io, bch2_journal_write, c->io_complete_wq, NULL); + } else if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && + new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + struct journal_buf *buf = journal_cur_buf(j); + long delta = buf->expires - jiffies; + + /* + * We don't close a journal entry to write it while there's + * previous entries still in flight - the current journal entry + * might want to be written now: + */ + + mod_delayed_work(c->io_complete_wq, &j->write_work, max(0L, delta)); + } + + spin_unlock(&j->lock); +} + +static void journal_write_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + struct journal *j = &ca->fs->journal; + struct journal_buf *w = journal_last_unwritten_buf(j); + unsigned long flags; + + if (bch2_dev_io_err_on(bio->bi_status, ca, "error writing journal entry %llu: %s", + le64_to_cpu(w->data->seq), + bch2_blk_status_to_str(bio->bi_status)) || + bch2_meta_write_fault("journal")) { + spin_lock_irqsave(&j->err_lock, flags); + bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); + spin_unlock_irqrestore(&j->err_lock, flags); + } + + closure_put(&j->io); + percpu_ref_put(&ca->io_ref); +} + +static void do_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_extent_ptr *ptr; + struct bio *bio; + unsigned sectors = vstruct_sectors(w->data, c->block_bits); + + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + ca = bch_dev_bkey_exists(c, ptr->dev); + if (!percpu_ref_tryget(&ca->io_ref)) { + /* XXX: fix this */ + bch_err(c, "missing device for journal write\n"); + continue; + } + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], + sectors); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = ptr->offset; + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + + BUG_ON(bio->bi_iter.bi_sector == 
ca->prev_journal_sector); + ca->prev_journal_sector = bio->bi_iter.bi_sector; + + if (!JSET_NO_FLUSH(w->data)) + bio->bi_opf |= REQ_FUA; + if (!JSET_NO_FLUSH(w->data) && !w->separate_flush) + bio->bi_opf |= REQ_PREFLUSH; + + bch2_bio_map(bio, w->data, sectors << 9); + + trace_and_count(c, journal_write, bio); + closure_bio_submit(bio, cl); + + ca->journal.bucket_seq[ca->journal.cur_idx] = + le64_to_cpu(w->data->seq); + } + + continue_at(cl, journal_write_done, c->io_complete_wq); + return; +} + +static void bch2_journal_entries_postprocess(struct bch_fs *c, struct jset *jset) +{ + struct jset_entry *i, *next, *prev = NULL; + + /* + * Simple compaction, dropping empty jset_entries (from journal + * reservations that weren't fully used) and merging jset_entries that + * can be. + * + * If we wanted to be really fancy here, we could sort all the keys in + * the jset and drop keys that were overwritten - probably not worth it: + */ + vstruct_for_each_safe(jset, i, next) { + unsigned u64s = le16_to_cpu(i->u64s); + + /* Empty entry: */ + if (!u64s) + continue; + + if (i->type == BCH_JSET_ENTRY_btree_root) + bch2_journal_entry_to_btree_root(c, i); + + /* Can we merge with previous entry? */ + if (prev && + i->btree_id == prev->btree_id && + i->level == prev->level && + i->type == prev->type && + i->type == BCH_JSET_ENTRY_btree_keys && + le16_to_cpu(prev->u64s) + u64s <= U16_MAX) { + memmove_u64s_down(vstruct_next(prev), + i->_data, + u64s); + le16_add_cpu(&prev->u64s, u64s); + continue; + } + + /* Couldn't merge, move i into new position (after prev): */ + prev = prev ? vstruct_next(prev) : jset->start; + if (i != prev) + memmove_u64s_down(prev, i, jset_u64s(u64s)); + } + + prev = prev ? vstruct_next(prev) : jset->start; + jset->u64s = cpu_to_le32((u64 *) prev - jset->_data); +} + +void bch2_journal_write(struct closure *cl) +{ + struct journal *j = container_of(cl, struct journal, io); + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + struct journal_buf *w = journal_last_unwritten_buf(j); + struct bch_replicas_padded replicas; + struct jset_entry *start, *end; + struct jset *jset; + struct bio *bio; + struct printbuf journal_debug_buf = PRINTBUF; + bool validate_before_checksum = false; + unsigned i, sectors, bytes, u64s, nr_rw_members = 0; + int ret; + + BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); + + journal_buf_realloc(j, w); + jset = w->data; + + j->write_start_time = local_clock(); + + spin_lock(&j->lock); + + /* + * If the journal is in an error state - we did an emergency shutdown - + * we prefer to continue doing journal writes. We just mark them as + * noflush so they'll never be used, but they'll still be visible by the + * list_journal tool - this helps in debugging. + * + * There's a caveat: the first journal write after marking the + * superblock dirty must always be a flush write, because on startup + * from a clean shutdown we didn't necessarily read the journal and the + * new journal write might overwrite whatever was in the journal + * previously - we can't leave the journal without any flush writes in + * it. + * + * So if we're in an error state, and we're still starting up, we don't + * write anything at all. 
+ */ + if (!test_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags) && + (bch2_journal_error(j) || + w->noflush || + (!w->must_flush && + (jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) && + test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags)))) { + w->noflush = true; + SET_JSET_NO_FLUSH(jset, true); + jset->last_seq = 0; + w->last_seq = 0; + + j->nr_noflush_writes++; + } else if (!bch2_journal_error(j)) { + j->last_flush_write = jiffies; + j->nr_flush_writes++; + clear_bit(JOURNAL_NEED_FLUSH_WRITE, &j->flags); + } else { + spin_unlock(&j->lock); + goto err; + } + spin_unlock(&j->lock); + + /* + * New btree roots are set by journalling them; when the journal entry + * gets written we have to propagate them to c->btree_roots + * + * But, every journal entry we write has to contain all the btree roots + * (at least for now); so after we copy btree roots to c->btree_roots we + * have to get any missing btree roots and add them to this journal + * entry: + */ + + bch2_journal_entries_postprocess(c, jset); + + start = end = vstruct_last(jset); + + end = bch2_btree_roots_to_journal_entries(c, jset->start, end); + + bch2_journal_super_entries_add_common(c, &end, + le64_to_cpu(jset->seq)); + u64s = (u64 *) end - (u64 *) start; + BUG_ON(u64s > j->entry_u64s_reserved); + + le32_add_cpu(&jset->u64s, u64s); + + sectors = vstruct_sectors(jset, c->block_bits); + bytes = vstruct_bytes(jset); + + if (sectors > w->sectors) { + bch2_fs_fatal_error(c, "aieeee! journal write overran available space, %zu > %u (extra %u reserved %u/%u)", + vstruct_bytes(jset), w->sectors << 9, + u64s, w->u64s_reserved, j->entry_u64s_reserved); + goto err; + } + + jset->magic = cpu_to_le64(jset_magic(c)); + jset->version = cpu_to_le32(c->sb.version); + + SET_JSET_BIG_ENDIAN(jset, CPU_BIG_ENDIAN); + SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c)); + + if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset)) + j->last_empty_seq = le64_to_cpu(jset->seq); + + if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset))) + validate_before_checksum = true; + + if (le32_to_cpu(jset->version) < bcachefs_metadata_version_current) + validate_before_checksum = true; + + if (validate_before_checksum && + jset_validate(c, NULL, jset, 0, WRITE)) + goto err; + + ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), + jset->encrypted_start, + vstruct_end(jset) - (void *) jset->encrypted_start); + if (bch2_fs_fatal_err_on(ret, c, + "error decrypting journal entry: %i", ret)) + goto err; + + jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), + journal_nonce(jset), jset); + + if (!validate_before_checksum && + jset_validate(c, NULL, jset, 0, WRITE)) + goto err; + + memset((void *) jset + bytes, 0, (sectors << 9) - bytes); + +retry_alloc: + spin_lock(&j->lock); + ret = journal_write_alloc(j, w, sectors); + + if (ret && j->can_discard) { + spin_unlock(&j->lock); + bch2_journal_do_discards(j); + goto retry_alloc; + } + + if (ret) + __bch2_journal_debug_to_text(&journal_debug_buf, j); + + /* + * write is allocated, no longer need to account for it in + * bch2_journal_space_available(): + */ + w->sectors = 0; + + /* + * journal entry has been compacted and allocated, recalculate space + * available: + */ + bch2_journal_space_available(j); + spin_unlock(&j->lock); + + if (ret) { + bch_err(c, "Unable to allocate journal write:\n%s", + journal_debug_buf.buf); + printbuf_exit(&journal_debug_buf); + goto err; + } + + w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); + + if (c->opts.nochanges) + goto no_io; + + 
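/*
 * Why separate_flush below (annotation, inferred from the surrounding code):
 * with more than one rw member, a REQ_PREFLUSH on the journal data bio would
 * only flush the write cache of the device that bio is submitted to.  A flush
 * journal entry needs everything written before it to be durable on every
 * member, so in that case the write is instead preceded by explicit
 * REQ_OP_FLUSH bios to each rw device (see the !JSET_NO_FLUSH branch below,
 * and the corresponding REQ_PREFLUSH logic in do_journal_write()).
 */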
for_each_rw_member(ca, c, i) + nr_rw_members++; + + if (nr_rw_members > 1) + w->separate_flush = true; + + /* + * Mark journal replicas before we submit the write to guarantee + * recovery will find the journal entries after a crash. + */ + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + w->devs_written); + ret = bch2_mark_replicas(c, &replicas.e); + if (ret) + goto err; + + if (!JSET_NO_FLUSH(jset) && w->separate_flush) { + for_each_rw_member(ca, c, i) { + percpu_ref_get(&ca->io_ref); + + bio = ca->journal.bio; + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH); + bio->bi_end_io = journal_write_endio; + bio->bi_private = ca; + closure_bio_submit(bio, cl); + } + } + + continue_at(cl, do_journal_write, c->io_complete_wq); + return; +no_io: + continue_at(cl, journal_write_done, c->io_complete_wq); + return; +err: + bch2_fatal_error(c); + continue_at(cl, journal_write_done, c->io_complete_wq); +} diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h new file mode 100644 index 000000000..8801e9810 --- /dev/null +++ b/fs/bcachefs/journal_io.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_IO_H +#define _BCACHEFS_JOURNAL_IO_H + +/* + * Only used for holding the journal entries we read in btree_journal_read() + * during cache_registration + */ +struct journal_replay { + struct journal_ptr { + bool csum_good; + u8 dev; + u32 bucket; + u32 bucket_offset; + u64 sector; + } ptrs[BCH_REPLICAS_MAX]; + unsigned nr_ptrs; + + bool csum_good; + bool ignore; + /* must be last: */ + struct jset j; +}; + +static inline struct jset_entry *__jset_entry_type_next(struct jset *jset, + struct jset_entry *entry, unsigned type) +{ + while (entry < vstruct_last(jset)) { + if (entry->type == type) + return entry; + + entry = vstruct_next(entry); + } + + return NULL; +} + +#define for_each_jset_entry_type(entry, jset, type) \ + for (entry = (jset)->start; \ + (entry = __jset_entry_type_next(jset, entry, type)); \ + entry = vstruct_next(entry)) + +#define jset_entry_for_each_key(_e, _k) \ + for (_k = (_e)->start; \ + _k < vstruct_last(_e); \ + _k = bkey_next(_k)) + +#define for_each_jset_key(k, entry, jset) \ + for_each_jset_entry_type(entry, jset, BCH_JSET_ENTRY_btree_keys)\ + jset_entry_for_each_key(entry, k) + +int bch2_journal_entry_validate(struct bch_fs *, struct jset *, + struct jset_entry *, unsigned, int, int); +void bch2_journal_entry_to_text(struct printbuf *, struct bch_fs *, + struct jset_entry *); + +void bch2_journal_ptrs_to_text(struct printbuf *, struct bch_fs *, + struct journal_replay *); + +int bch2_journal_read(struct bch_fs *, u64 *, u64 *, u64 *); + +void bch2_journal_write(struct closure *); + +#endif /* _BCACHEFS_JOURNAL_IO_H */ diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c new file mode 100644 index 000000000..8de83e103 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.c @@ -0,0 +1,873 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "errcode.h" +#include "error.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "replicas.h" +#include "super.h" +#include "trace.h" + +#include +#include + +/* Free space calculations: */ + +static unsigned journal_space_from(struct journal_device *ja, + enum journal_space_from from) +{ + switch (from) { + case journal_space_discarded: + return ja->discard_idx; + case journal_space_clean_ondisk: + return ja->dirty_idx_ondisk; + case journal_space_clean: + return 
ja->dirty_idx; + default: + BUG(); + } +} + +unsigned bch2_journal_dev_buckets_available(struct journal *j, + struct journal_device *ja, + enum journal_space_from from) +{ + unsigned available = (journal_space_from(ja, from) - + ja->cur_idx - 1 + ja->nr) % ja->nr; + + /* + * Don't use the last bucket unless writing the new last_seq + * will make another bucket available: + */ + if (available && ja->dirty_idx_ondisk == ja->dirty_idx) + --available; + + return available; +} + +static void journal_set_remaining(struct journal *j, unsigned u64s_remaining) +{ + union journal_preres_state old, new; + u64 v = atomic64_read(&j->prereserved.counter); + + do { + old.v = new.v = v; + new.remaining = u64s_remaining; + } while ((v = atomic64_cmpxchg(&j->prereserved.counter, + old.v, new.v)) != old.v); +} + +static struct journal_space +journal_dev_space_available(struct journal *j, struct bch_dev *ca, + enum journal_space_from from) +{ + struct journal_device *ja = &ca->journal; + unsigned sectors, buckets, unwritten; + u64 seq; + + if (from == journal_space_total) + return (struct journal_space) { + .next_entry = ca->mi.bucket_size, + .total = ca->mi.bucket_size * ja->nr, + }; + + buckets = bch2_journal_dev_buckets_available(j, ja, from); + sectors = ja->sectors_free; + + /* + * We that we don't allocate the space for a journal entry + * until we write it out - thus, account for it here: + */ + for (seq = journal_last_unwritten_seq(j); + seq <= journal_cur_seq(j); + seq++) { + unwritten = j->buf[seq & JOURNAL_BUF_MASK].sectors; + + if (!unwritten) + continue; + + /* entry won't fit on this device, skip: */ + if (unwritten > ca->mi.bucket_size) + continue; + + if (unwritten >= sectors) { + if (!buckets) { + sectors = 0; + break; + } + + buckets--; + sectors = ca->mi.bucket_size; + } + + sectors -= unwritten; + } + + if (sectors < ca->mi.bucket_size && buckets) { + buckets--; + sectors = ca->mi.bucket_size; + } + + return (struct journal_space) { + .next_entry = sectors, + .total = sectors + buckets * ca->mi.bucket_size, + }; +} + +static struct journal_space __journal_space_available(struct journal *j, unsigned nr_devs_want, + enum journal_space_from from) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned i, pos, nr_devs = 0; + struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX]; + + BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + if (!ca->journal.nr) + continue; + + space = journal_dev_space_available(j, ca, from); + if (!space.next_entry) + continue; + + for (pos = 0; pos < nr_devs; pos++) + if (space.total > dev_space[pos].total) + break; + + array_insert_item(dev_space, nr_devs, pos, space); + } + rcu_read_unlock(); + + if (nr_devs < nr_devs_want) + return (struct journal_space) { 0, 0 }; + + /* + * We sorted largest to smallest, and we want the smallest out of the + * @nr_devs_want largest devices: + */ + return dev_space[nr_devs_want - 1]; +} + +void bch2_journal_space_available(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned clean, clean_ondisk, total; + s64 u64s_remaining = 0; + unsigned max_entry_size = min(j->buf[0].buf_size >> 9, + j->buf[1].buf_size >> 9); + unsigned i, nr_online = 0, nr_devs_want; + bool can_discard = false; + int ret = 0; + + lockdep_assert_held(&j->lock); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, + &c->rw_devs[BCH_DATA_journal]) { + struct 
journal_device *ja = &ca->journal; + + if (!ja->nr) + continue; + + while (ja->dirty_idx != ja->cur_idx && + ja->bucket_seq[ja->dirty_idx] < journal_last_seq(j)) + ja->dirty_idx = (ja->dirty_idx + 1) % ja->nr; + + while (ja->dirty_idx_ondisk != ja->dirty_idx && + ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) + ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; + + if (ja->discard_idx != ja->dirty_idx_ondisk) + can_discard = true; + + max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); + nr_online++; + } + rcu_read_unlock(); + + j->can_discard = can_discard; + + if (nr_online < c->opts.metadata_replicas_required) { + ret = JOURNAL_ERR_insufficient_devices; + goto out; + } + + nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas); + + for (i = 0; i < journal_space_nr; i++) + j->space[i] = __journal_space_available(j, nr_devs_want, i); + + clean_ondisk = j->space[journal_space_clean_ondisk].total; + clean = j->space[journal_space_clean].total; + total = j->space[journal_space_total].total; + + if (!j->space[journal_space_discarded].next_entry) + ret = JOURNAL_ERR_journal_full; + + if ((j->space[journal_space_clean_ondisk].next_entry < + j->space[journal_space_clean_ondisk].total) && + (clean - clean_ondisk <= total / 8) && + (clean_ondisk * 2 > clean)) + set_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + else + clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags); + + u64s_remaining = (u64) clean << 6; + u64s_remaining -= (u64) total << 3; + u64s_remaining = max(0LL, u64s_remaining); + u64s_remaining /= 4; + u64s_remaining = min_t(u64, u64s_remaining, U32_MAX); +out: + j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_error = ret; + journal_set_remaining(j, u64s_remaining); + journal_set_watermark(j); + + if (!ret) + journal_wake(j); +} + +/* Discards - last part of journal reclaim: */ + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ + bool ret; + + spin_lock(&j->lock); + ret = ja->discard_idx != ja->dirty_idx_ondisk; + spin_unlock(&j->lock); + + return ret; +} + +/* + * Advance ja->discard_idx as long as it points to buckets that are no longer + * dirty, issuing discards if necessary: + */ +void bch2_journal_do_discards(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + unsigned iter; + + mutex_lock(&j->discard_lock); + + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + + while (should_discard_bucket(j, ja)) { + if (!c->opts.nochanges && + ca->mi.discard && + bdev_max_discard_sectors(ca->disk_sb.bdev)) + blkdev_issue_discard(ca->disk_sb.bdev, + bucket_to_sector(ca, + ja->buckets[ja->discard_idx]), + ca->mi.bucket_size, GFP_NOFS); + + spin_lock(&j->lock); + ja->discard_idx = (ja->discard_idx + 1) % ja->nr; + + bch2_journal_space_available(j); + spin_unlock(&j->lock); + } + } + + mutex_unlock(&j->discard_lock); +} + +/* + * Journal entry pinning - machinery for holding a reference on a given journal + * entry, holding it open to ensure it gets replayed during recovery: + */ + +static void bch2_journal_reclaim_fast(struct journal *j) +{ + struct journal_entry_pin_list temp; + bool popped = false; + + lockdep_assert_held(&j->lock); + + /* + * Unpin journal entries whose reference counts reached zero, meaning + * all btree nodes got written out + */ + while (!fifo_empty(&j->pin) && + !atomic_read(&fifo_peek_front(&j->pin).count)) { + fifo_pop(&j->pin, temp); + popped = true; + } + + if 
(popped) + bch2_journal_space_available(j); +} + +void __bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (atomic_dec_and_test(&pin_list->count)) + bch2_journal_reclaim_fast(j); +} + +void bch2_journal_pin_put(struct journal *j, u64 seq) +{ + struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq); + + if (atomic_dec_and_test(&pin_list->count)) { + spin_lock(&j->lock); + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); + } +} + +static inline bool __journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + struct journal_entry_pin_list *pin_list; + + if (!journal_pin_active(pin)) + return false; + + if (j->flush_in_progress == pin) + j->flush_in_progress_dropped = true; + + pin_list = journal_seq_pin(j, pin->seq); + pin->seq = 0; + list_del_init(&pin->list); + + /* + * Unpinning a journal entry make make journal_next_bucket() succeed, if + * writing a new last_seq will now make another bucket available: + */ + return atomic_dec_and_test(&pin_list->count) && + pin_list == &fifo_peek_front(&j->pin); +} + +void bch2_journal_pin_drop(struct journal *j, + struct journal_entry_pin *pin) +{ + spin_lock(&j->lock); + if (__journal_pin_drop(j, pin)) + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); +} + +static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn) +{ + if (fn == bch2_btree_node_flush0 || + fn == bch2_btree_node_flush1) + return JOURNAL_PIN_btree; + else if (fn == bch2_btree_key_cache_journal_flush) + return JOURNAL_PIN_key_cache; + else + return JOURNAL_PIN_other; +} + +void bch2_journal_pin_set(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + struct journal_entry_pin_list *pin_list; + bool reclaim; + + spin_lock(&j->lock); + + if (seq < journal_last_seq(j)) { + /* + * bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on + * the src pin - with the pin dropped, the entry to pin might no + * longer to exist, but that means there's no longer anything to + * copy and we can bail out here: + */ + spin_unlock(&j->lock); + return; + } + + pin_list = journal_seq_pin(j, seq); + + reclaim = __journal_pin_drop(j, pin); + + atomic_inc(&pin_list->count); + pin->seq = seq; + pin->flush = flush_fn; + + if (flush_fn) + list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]); + else + list_add(&pin->list, &pin_list->flushed); + + if (reclaim) + bch2_journal_reclaim_fast(j); + spin_unlock(&j->lock); + + /* + * If the journal is currently full, we might want to call flush_fn + * immediately: + */ + journal_wake(j); +} + +/** + * bch2_journal_pin_flush: ensure journal pin callback is no longer running + */ +void bch2_journal_pin_flush(struct journal *j, struct journal_entry_pin *pin) +{ + BUG_ON(journal_pin_active(pin)); + + wait_event(j->pin_flush_wait, j->flush_in_progress != pin); +} + +/* + * Journal reclaim: flush references to open journal entries to reclaim space in + * the journal + * + * May be done by the journal code in the background as needed to free up space + * for more journal entries, or as part of doing a clean shutdown, or to migrate + * data off of a specific device: + */ + +static struct journal_entry_pin * +journal_get_next_pin(struct journal *j, + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, + u64 *seq) +{ + struct journal_entry_pin_list *pin_list; + struct journal_entry_pin *ret = NULL; + unsigned i; + + fifo_for_each_entry_ptr(pin_list, &j->pin, *seq) 
{ + if (*seq > seq_to_flush && !allowed_above_seq) + break; + + for (i = 0; i < JOURNAL_PIN_NR; i++) + if ((((1U << i) & allowed_below_seq) && *seq <= seq_to_flush) || + ((1U << i) & allowed_above_seq)) { + ret = list_first_entry_or_null(&pin_list->list[i], + struct journal_entry_pin, list); + if (ret) + return ret; + } + } + + return NULL; +} + +/* returns true if we did work */ +static size_t journal_flush_pins(struct journal *j, + u64 seq_to_flush, + unsigned allowed_below_seq, + unsigned allowed_above_seq, + unsigned min_any, + unsigned min_key_cache) +{ + struct journal_entry_pin *pin; + size_t nr_flushed = 0; + journal_pin_flush_fn flush_fn; + u64 seq; + int err; + + lockdep_assert_held(&j->reclaim_lock); + + while (1) { + unsigned allowed_above = allowed_above_seq; + unsigned allowed_below = allowed_below_seq; + + if (min_any) { + allowed_above |= ~0; + allowed_below |= ~0; + } + + if (min_key_cache) { + allowed_above |= 1U << JOURNAL_PIN_key_cache; + allowed_below |= 1U << JOURNAL_PIN_key_cache; + } + + cond_resched(); + + j->last_flushed = jiffies; + + spin_lock(&j->lock); + pin = journal_get_next_pin(j, seq_to_flush, allowed_below, allowed_above, &seq); + if (pin) { + BUG_ON(j->flush_in_progress); + j->flush_in_progress = pin; + j->flush_in_progress_dropped = false; + flush_fn = pin->flush; + } + spin_unlock(&j->lock); + + if (!pin) + break; + + if (min_key_cache && pin->flush == bch2_btree_key_cache_journal_flush) + min_key_cache--; + + if (min_any) + min_any--; + + err = flush_fn(j, pin, seq); + + spin_lock(&j->lock); + /* Pin might have been dropped or rearmed: */ + if (likely(!err && !j->flush_in_progress_dropped)) + list_move(&pin->list, &journal_seq_pin(j, seq)->flushed); + j->flush_in_progress = NULL; + j->flush_in_progress_dropped = false; + spin_unlock(&j->lock); + + wake_up(&j->pin_flush_wait); + + if (err) + break; + + nr_flushed++; + } + + return nr_flushed; +} + +static u64 journal_seq_to_flush(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct bch_dev *ca; + u64 seq_to_flush = 0; + unsigned iter; + + spin_lock(&j->lock); + + for_each_rw_member(ca, c, iter) { + struct journal_device *ja = &ca->journal; + unsigned nr_buckets, bucket_to_flush; + + if (!ja->nr) + continue; + + /* Try to keep the journal at most half full: */ + nr_buckets = ja->nr / 2; + + /* And include pre-reservations: */ + nr_buckets += DIV_ROUND_UP(j->prereserved.reserved, + (ca->mi.bucket_size << 6) - + journal_entry_overhead(j)); + + nr_buckets = min(nr_buckets, ja->nr); + + bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; + seq_to_flush = max(seq_to_flush, + ja->bucket_seq[bucket_to_flush]); + } + + /* Also flush if the pin fifo is more than half full */ + seq_to_flush = max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); + spin_unlock(&j->lock); + + return seq_to_flush; +} + +/** + * bch2_journal_reclaim - free up journal buckets + * + * Background journal reclaim writes out btree nodes. It should be run + * early enough so that we never completely run out of journal buckets. 
+ * + * High watermarks for triggering background reclaim: + * - FIFO has fewer than 512 entries left + * - fewer than 25% journal buckets free + * + * Background reclaim runs until low watermarks are reached: + * - FIFO has more than 1024 entries left + * - more than 50% journal buckets free + * + * As long as a reclaim can complete in the time it takes to fill up + * 512 journal entries or 25% of all journal buckets, then + * journal_next_bucket() should not stall. + */ +static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + bool kthread = (current->flags & PF_KTHREAD) != 0; + u64 seq_to_flush; + size_t min_nr, min_key_cache, nr_flushed; + unsigned flags; + int ret = 0; + + /* + * We can't invoke memory reclaim while holding the reclaim_lock - + * journal reclaim is required to make progress for memory reclaim + * (cleaning the caches), so we can't get stuck in memory reclaim while + * we're holding the reclaim lock: + */ + lockdep_assert_held(&j->reclaim_lock); + flags = memalloc_noreclaim_save(); + + do { + if (kthread && kthread_should_stop()) + break; + + if (bch2_journal_error(j)) { + ret = -EIO; + break; + } + + bch2_journal_do_discards(j); + + seq_to_flush = journal_seq_to_flush(j); + min_nr = 0; + + /* + * If it's been longer than j->reclaim_delay_ms since we last flushed, + * make sure to flush at least one journal pin: + */ + if (time_after(jiffies, j->last_flushed + + msecs_to_jiffies(c->opts.journal_reclaim_delay))) + min_nr = 1; + + if (j->prereserved.reserved * 4 > j->prereserved.remaining) + min_nr = 1; + + if (fifo_free(&j->pin) <= 32) + min_nr = 1; + + if (atomic_read(&c->btree_cache.dirty) * 2 > c->btree_cache.used) + min_nr = 1; + + min_key_cache = min(bch2_nr_btree_keys_need_flush(c), (size_t) 128); + + trace_and_count(c, journal_reclaim_start, c, + direct, kicked, + min_nr, min_key_cache, + j->prereserved.reserved, + j->prereserved.remaining, + atomic_read(&c->btree_cache.dirty), + c->btree_cache.used, + atomic_long_read(&c->btree_key_cache.nr_dirty), + atomic_long_read(&c->btree_key_cache.nr_keys)); + + nr_flushed = journal_flush_pins(j, seq_to_flush, + ~0, 0, + min_nr, min_key_cache); + + if (direct) + j->nr_direct_reclaim += nr_flushed; + else + j->nr_background_reclaim += nr_flushed; + trace_and_count(c, journal_reclaim_finish, c, nr_flushed); + + if (nr_flushed) + wake_up(&j->reclaim_wait); + } while ((min_nr || min_key_cache) && nr_flushed && !direct); + + memalloc_noreclaim_restore(flags); + + return ret; +} + +int bch2_journal_reclaim(struct journal *j) +{ + return __bch2_journal_reclaim(j, true, true); +} + +static int bch2_journal_reclaim_thread(void *arg) +{ + struct journal *j = arg; + struct bch_fs *c = container_of(j, struct bch_fs, journal); + unsigned long delay, now; + bool journal_empty; + int ret = 0; + + set_freezable(); + + j->last_flushed = jiffies; + + while (!ret && !kthread_should_stop()) { + bool kicked = j->reclaim_kicked; + + j->reclaim_kicked = false; + + mutex_lock(&j->reclaim_lock); + ret = __bch2_journal_reclaim(j, false, kicked); + mutex_unlock(&j->reclaim_lock); + + now = jiffies; + delay = msecs_to_jiffies(c->opts.journal_reclaim_delay); + j->next_reclaim = j->last_flushed + delay; + + if (!time_in_range(j->next_reclaim, now, now + delay)) + j->next_reclaim = now + delay; + + while (1) { + set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE); + if (kthread_should_stop()) + break; + if (j->reclaim_kicked) + break; + + spin_lock(&j->lock); + 
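/*
 * Sample the pin fifo under j->lock: if it's empty there is nothing left to
 * reclaim, so sleep until explicitly kicked; otherwise sleep only until the
 * next scheduled reclaim time (j->next_reclaim) and then loop back around
 * for another reclaim pass.
 */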
journal_empty = fifo_empty(&j->pin); + spin_unlock(&j->lock); + + if (journal_empty) + schedule(); + else if (time_after(j->next_reclaim, jiffies)) + schedule_timeout(j->next_reclaim - jiffies); + else + break; + } + __set_current_state(TASK_RUNNING); + } + + return 0; +} + +void bch2_journal_reclaim_stop(struct journal *j) +{ + struct task_struct *p = j->reclaim_thread; + + j->reclaim_thread = NULL; + + if (p) { + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_journal_reclaim_start(struct journal *j) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct task_struct *p; + int ret; + + if (j->reclaim_thread) + return 0; + + p = kthread_create(bch2_journal_reclaim_thread, j, + "bch-reclaim/%s", c->name); + ret = PTR_ERR_OR_ZERO(p); + if (ret) { + bch_err(c, "error creating journal reclaim thread: %s", bch2_err_str(ret)); + return ret; + } + + get_task_struct(p); + j->reclaim_thread = p; + wake_up_process(p); + return 0; +} + +static int journal_flush_done(struct journal *j, u64 seq_to_flush, + bool *did_work) +{ + int ret; + + ret = bch2_journal_error(j); + if (ret) + return ret; + + mutex_lock(&j->reclaim_lock); + + if (journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_key_cache)| + (1U << JOURNAL_PIN_other), 0, 0, 0) || + journal_flush_pins(j, seq_to_flush, + (1U << JOURNAL_PIN_btree), 0, 0, 0)) + *did_work = true; + + spin_lock(&j->lock); + /* + * If journal replay hasn't completed, the unreplayed journal entries + * hold refs on their corresponding sequence numbers + */ + ret = !test_bit(JOURNAL_REPLAY_DONE, &j->flags) || + journal_last_seq(j) > seq_to_flush || + !fifo_used(&j->pin); + + spin_unlock(&j->lock); + mutex_unlock(&j->reclaim_lock); + + return ret; +} + +bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush) +{ + bool did_work = false; + + if (!test_bit(JOURNAL_STARTED, &j->flags)) + return false; + + closure_wait_event(&j->async_wait, + journal_flush_done(j, seq_to_flush, &did_work)); + + return did_work; +} + +int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct journal_entry_pin_list *p; + u64 iter, seq = 0; + int ret = 0; + + spin_lock(&j->lock); + fifo_for_each_entry_ptr(p, &j->pin, iter) + if (dev_idx >= 0 + ? bch2_dev_list_has_dev(p->devs, dev_idx) + : p->devs.nr < c->opts.metadata_replicas) + seq = iter; + spin_unlock(&j->lock); + + bch2_journal_flush_pins(j, seq); + + ret = bch2_journal_error(j); + if (ret) + return ret; + + mutex_lock(&c->replicas_gc_lock); + bch2_replicas_gc_start(c, 1 << BCH_DATA_journal); + + /* + * Now that we've populated replicas_gc, write to the journal to mark + * active journal devices. This handles the case where the journal might + * be empty. Otherwise we could clear all journal replicas and + * temporarily put the fs into an unrecoverable state. Journal recovery + * expects to find devices marked for journal data on unclean mount. 
+ */ + ret = bch2_journal_meta(&c->journal); + if (ret) + goto err; + + seq = 0; + spin_lock(&j->lock); + while (!ret) { + struct bch_replicas_padded replicas; + + seq = max(seq, journal_last_seq(j)); + if (seq >= j->pin.back) + break; + bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, + journal_seq_pin(j, seq)->devs); + seq++; + + spin_unlock(&j->lock); + ret = bch2_mark_replicas(c, &replicas.e); + spin_lock(&j->lock); + } + spin_unlock(&j->lock); +err: + ret = bch2_replicas_gc_end(c, ret); + mutex_unlock(&c->replicas_gc_lock); + + return ret; +} diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h new file mode 100644 index 000000000..0fd1af120 --- /dev/null +++ b/fs/bcachefs/journal_reclaim.h @@ -0,0 +1,86 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_RECLAIM_H +#define _BCACHEFS_JOURNAL_RECLAIM_H + +#define JOURNAL_PIN (32 * 1024) + +static inline void journal_reclaim_kick(struct journal *j) +{ + struct task_struct *p = READ_ONCE(j->reclaim_thread); + + j->reclaim_kicked = true; + if (p) + wake_up_process(p); +} + +unsigned bch2_journal_dev_buckets_available(struct journal *, + struct journal_device *, + enum journal_space_from); +void bch2_journal_space_available(struct journal *); + +static inline bool journal_pin_active(struct journal_entry_pin *pin) +{ + return pin->seq != 0; +} + +static inline struct journal_entry_pin_list * +journal_seq_pin(struct journal *j, u64 seq) +{ + EBUG_ON(seq < j->pin.front || seq >= j->pin.back); + + return &j->pin.data[seq & j->pin.mask]; +} + +void __bch2_journal_pin_put(struct journal *, u64); +void bch2_journal_pin_put(struct journal *, u64); +void bch2_journal_pin_drop(struct journal *, struct journal_entry_pin *); + +void bch2_journal_pin_set(struct journal *, u64, struct journal_entry_pin *, + journal_pin_flush_fn); + +static inline void bch2_journal_pin_add(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (unlikely(!journal_pin_active(pin) || pin->seq > seq)) + bch2_journal_pin_set(j, seq, pin, flush_fn); +} + +static inline void bch2_journal_pin_copy(struct journal *j, + struct journal_entry_pin *dst, + struct journal_entry_pin *src, + journal_pin_flush_fn flush_fn) +{ + /* Guard against racing with journal_pin_drop(src): */ + u64 seq = READ_ONCE(src->seq); + + if (seq) + bch2_journal_pin_add(j, seq, dst, flush_fn); +} + +static inline void bch2_journal_pin_update(struct journal *j, u64 seq, + struct journal_entry_pin *pin, + journal_pin_flush_fn flush_fn) +{ + if (unlikely(!journal_pin_active(pin) || pin->seq < seq)) + bch2_journal_pin_set(j, seq, pin, flush_fn); +} + +void bch2_journal_pin_flush(struct journal *, struct journal_entry_pin *); + +void bch2_journal_do_discards(struct journal *); +int bch2_journal_reclaim(struct journal *); + +void bch2_journal_reclaim_stop(struct journal *); +int bch2_journal_reclaim_start(struct journal *); + +bool bch2_journal_flush_pins(struct journal *, u64); + +static inline bool bch2_journal_flush_all_pins(struct journal *j) +{ + return bch2_journal_flush_pins(j, U64_MAX); +} + +int bch2_journal_flush_device_pins(struct journal *, int); + +#endif /* _BCACHEFS_JOURNAL_RECLAIM_H */ diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c new file mode 100644 index 000000000..cc41bff86 --- /dev/null +++ b/fs/bcachefs/journal_sb.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "journal_sb.h" +#include "darray.h" + +#include + +/* BCH_SB_FIELD_journal: */ 
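/*
 * Two on-disk encodings exist for a device's journal buckets: the original
 * BCH_SB_FIELD_journal is a flat list of bucket numbers, while
 * BCH_SB_FIELD_journal_v2 stores run-length encoded ranges (start, nr).
 * For example, buckets 10, 11, 12, 40, 41 become the two v2 entries
 * { .start = 10, .nr = 3 } and { .start = 40, .nr = 2 }; see
 * bch2_journal_buckets_to_sb() below, which writes the v2 form and deletes
 * the old field.
 */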
+ +static int u64_cmp(const void *_l, const void *_r) +{ + const u64 *l = _l; + const u64 *r = _r; + + return cmp_int(*l, *r); +} + +static int bch2_sb_journal_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + int ret = -BCH_ERR_invalid_sb_journal; + unsigned nr; + unsigned i; + u64 *b; + + nr = bch2_nr_journal_buckets(journal); + if (!nr) + return 0; + + b = kmalloc_array(nr, sizeof(u64), GFP_KERNEL); + if (!b) + return -BCH_ERR_ENOMEM_sb_journal_validate; + + for (i = 0; i < nr; i++) + b[i] = le64_to_cpu(journal->buckets[i]); + + sort(b, nr, sizeof(u64), u64_cmp, NULL); + + if (!b[0]) { + prt_printf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0] < le16_to_cpu(m->first_bucket)) { + prt_printf(err, "journal bucket %llu before first bucket %u", + b[0], le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1] >= le64_to_cpu(m->nbuckets)) { + prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1], le64_to_cpu(m->nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) + if (b[i] == b[i + 1]) { + prt_printf(err, "duplicate journal buckets %llu", b[i]); + goto err; + } + + ret = 0; +err: + kfree(b); + return ret; +} + +static void bch2_sb_journal_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal *journal = field_to_type(f, journal); + unsigned i, nr = bch2_nr_journal_buckets(journal); + + prt_printf(out, "Buckets: "); + for (i = 0; i < nr; i++) + prt_printf(out, " %llu", le64_to_cpu(journal->buckets[i])); + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal = { + .validate = bch2_sb_journal_validate, + .to_text = bch2_sb_journal_to_text, +}; + +struct u64_range { + u64 start; + u64 end; +}; + +static int u64_range_cmp(const void *_l, const void *_r) +{ + const struct u64_range *l = _l; + const struct u64_range *r = _r; + + return cmp_int(l->start, r->start); +} + +static int bch2_sb_journal_v2_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + struct bch_member *m = bch2_sb_get_members(sb)->members + sb->dev_idx; + int ret = -BCH_ERR_invalid_sb_journal; + unsigned nr; + unsigned i; + struct u64_range *b; + + nr = bch2_sb_field_journal_v2_nr_entries(journal); + if (!nr) + return 0; + + b = kmalloc_array(nr, sizeof(*b), GFP_KERNEL); + if (!b) + return -BCH_ERR_ENOMEM_sb_journal_v2_validate; + + for (i = 0; i < nr; i++) { + b[i].start = le64_to_cpu(journal->d[i].start); + b[i].end = b[i].start + le64_to_cpu(journal->d[i].nr); + } + + sort(b, nr, sizeof(*b), u64_range_cmp, NULL); + + if (!b[0].start) { + prt_printf(err, "journal bucket at sector 0"); + goto err; + } + + if (b[0].start < le16_to_cpu(m->first_bucket)) { + prt_printf(err, "journal bucket %llu before first bucket %u", + b[0].start, le16_to_cpu(m->first_bucket)); + goto err; + } + + if (b[nr - 1].end > le64_to_cpu(m->nbuckets)) { + prt_printf(err, "journal bucket %llu past end of device (nbuckets %llu)", + b[nr - 1].end - 1, le64_to_cpu(m->nbuckets)); + goto err; + } + + for (i = 0; i + 1 < nr; i++) { + if (b[i].end > b[i + 1].start) { + prt_printf(err, "duplicate journal buckets in ranges %llu-%llu, %llu-%llu", + b[i].start, b[i].end, b[i + 1].start, b[i + 1].end); + goto err; + } + } + + ret = 0; +err: + kfree(b); + 
return ret; +} + +static void bch2_sb_journal_v2_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_v2 *journal = field_to_type(f, journal_v2); + unsigned i, nr = bch2_sb_field_journal_v2_nr_entries(journal); + + prt_printf(out, "Buckets: "); + for (i = 0; i < nr; i++) + prt_printf(out, " %llu-%llu", + le64_to_cpu(journal->d[i].start), + le64_to_cpu(journal->d[i].start) + le64_to_cpu(journal->d[i].nr)); + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal_v2 = { + .validate = bch2_sb_journal_v2_validate, + .to_text = bch2_sb_journal_v2_to_text, +}; + +int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, + u64 *buckets, unsigned nr) +{ + struct bch_sb_field_journal_v2 *j; + unsigned i, dst = 0, nr_compacted = 1; + + if (c) + lockdep_assert_held(&c->sb_lock); + + if (!nr) { + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal_v2); + return 0; + } + + for (i = 0; i + 1 < nr; i++) + if (buckets[i] + 1 != buckets[i + 1]) + nr_compacted++; + + j = bch2_sb_resize_journal_v2(&ca->disk_sb, + (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); + if (!j) + return -BCH_ERR_ENOSPC_sb_journal; + + bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); + + j->d[dst].start = cpu_to_le64(buckets[0]); + j->d[dst].nr = cpu_to_le64(1); + + for (i = 1; i < nr; i++) { + if (buckets[i] == buckets[i - 1] + 1) { + le64_add_cpu(&j->d[dst].nr, 1); + } else { + dst++; + j->d[dst].start = cpu_to_le64(buckets[i]); + j->d[dst].nr = cpu_to_le64(1); + } + } + + BUG_ON(dst + 1 != nr_compacted); + return 0; +} diff --git a/fs/bcachefs/journal_sb.h b/fs/bcachefs/journal_sb.h new file mode 100644 index 000000000..ba40a7e8d --- /dev/null +++ b/fs/bcachefs/journal_sb.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include "super-io.h" +#include "vstructs.h" + +static inline unsigned bch2_nr_journal_buckets(struct bch_sb_field_journal *j) +{ + return j + ? (__le64 *) vstruct_end(&j->field) - j->buckets + : 0; +} + +static inline unsigned bch2_sb_field_journal_v2_nr_entries(struct bch_sb_field_journal_v2 *j) +{ + if (!j) + return 0; + + return (struct bch_sb_field_journal_v2_entry *) vstruct_end(&j->field) - &j->d[0]; +} + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal; +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_v2; + +int bch2_journal_buckets_to_sb(struct bch_fs *, struct bch_dev *, u64 *, unsigned); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c new file mode 100644 index 000000000..d6b9f2cdf --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_iter.h" +#include "eytzinger.h" +#include "journal_seq_blacklist.h" +#include "super-io.h" + +/* + * journal_seq_blacklist machinery: + * + * To guarantee order of btree updates after a crash, we need to detect when a + * btree node entry (bset) is newer than the newest journal entry that was + * successfully written, and ignore it - effectively ignoring any btree updates + * that didn't make it into the journal. + * + * If we didn't do this, we might have two btree nodes, a and b, both with + * updates that weren't written to the journal yet: if b was updated after a, + * but b was flushed and not a - oops; on recovery we'll find that the updates + * to b happened, but not the updates to a that happened before it. 
+ * + * Ignoring bsets that are newer than the newest journal entry is always safe, + * because everything they contain will also have been journalled - and must + * still be present in the journal on disk until a journal entry has been + * written _after_ that bset was written. + * + * To accomplish this, bsets record the newest journal sequence number they + * contain updates for; then, on startup, the btree code queries the journal + * code to ask "Is this sequence number newer than the newest journal entry? If + * so, ignore it." + * + * When this happens, we must blacklist that journal sequence number: the + * journal must not write any entries with that sequence number, and it must + * record that it was blacklisted so that a) on recovery we don't think we have + * missing journal entries and b) so that the btree code continues to ignore + * that bset, until that btree node is rewritten. + */ + +static unsigned sb_blacklist_u64s(unsigned nr) +{ + struct bch_sb_field_journal_seq_blacklist *bl; + + return (sizeof(*bl) + sizeof(bl->start[0]) * nr) / sizeof(u64); +} + +static struct bch_sb_field_journal_seq_blacklist * +blacklist_entry_try_merge(struct bch_fs *c, + struct bch_sb_field_journal_seq_blacklist *bl, + unsigned i) +{ + unsigned nr = blacklist_nr_entries(bl); + + if (le64_to_cpu(bl->start[i].end) >= + le64_to_cpu(bl->start[i + 1].start)) { + bl->start[i].end = bl->start[i + 1].end; + --nr; + memmove(&bl->start[i], + &bl->start[i + 1], + sizeof(bl->start[0]) * (nr - i)); + + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr)); + BUG_ON(!bl); + } + + return bl; +} + +static bool bl_entry_contig_or_overlaps(struct journal_seq_blacklist_entry *e, + u64 start, u64 end) +{ + return !(end < le64_to_cpu(e->start) || le64_to_cpu(e->end) < start); +} + +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) +{ + struct bch_sb_field_journal_seq_blacklist *bl; + unsigned i, nr; + int ret = 0; + + mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + nr = blacklist_nr_entries(bl); + + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = + bl->start + i; + + if (bl_entry_contig_or_overlaps(e, start, end)) { + e->start = cpu_to_le64(min(start, le64_to_cpu(e->start))); + e->end = cpu_to_le64(max(end, le64_to_cpu(e->end))); + + if (i + 1 < nr) + bl = blacklist_entry_try_merge(c, + bl, i); + if (i) + bl = blacklist_entry_try_merge(c, + bl, i - 1); + goto out_write_sb; + } + } + + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + sb_blacklist_u64s(nr + 1)); + if (!bl) { + ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; + goto out; + } + + bl->start[nr].start = cpu_to_le64(start); + bl->start[nr].end = cpu_to_le64(end); +out_write_sb: + c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << BCH_FEATURE_journal_seq_blacklist_v3); + + ret = bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); + + return ret ?: bch2_blacklist_table_initialize(c); +} + +static int journal_seq_blacklist_table_cmp(const void *_l, + const void *_r, size_t size) +{ + const struct journal_seq_blacklist_table_entry *l = _l; + const struct journal_seq_blacklist_table_entry *r = _r; + + return cmp_int(l->start, r->start); +} + +bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, + bool dirty) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + struct journal_seq_blacklist_table_entry search = { .start = seq }; + int idx; + + if (!t) + return false; + + idx = eytzinger0_find_le(t->entries, 
t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + &search); + if (idx < 0) + return false; + + BUG_ON(t->entries[idx].start > seq); + + if (seq >= t->entries[idx].end) + return false; + + if (dirty) + t->entries[idx].dirty = true; + return true; +} + +int bch2_blacklist_table_initialize(struct bch_fs *c) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + struct journal_seq_blacklist_table *t; + unsigned i, nr = blacklist_nr_entries(bl); + + if (!bl) + return 0; + + t = kzalloc(sizeof(*t) + sizeof(t->entries[0]) * nr, + GFP_KERNEL); + if (!t) + return -BCH_ERR_ENOMEM_blacklist_table_init; + + t->nr = nr; + + for (i = 0; i < nr; i++) { + t->entries[i].start = le64_to_cpu(bl->start[i].start); + t->entries[i].end = le64_to_cpu(bl->start[i].end); + } + + eytzinger0_sort(t->entries, + t->nr, + sizeof(t->entries[0]), + journal_seq_blacklist_table_cmp, + NULL); + + kfree(c->journal_seq_blacklist_table); + c->journal_seq_blacklist_table = t; + return 0; +} + +static int bch2_sb_journal_seq_blacklist_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + unsigned i, nr = blacklist_nr_entries(bl); + + for (i = 0; i < nr; i++) { + struct journal_seq_blacklist_entry *e = bl->start + i; + + if (le64_to_cpu(e->start) >= + le64_to_cpu(e->end)) { + prt_printf(err, "entry %u start >= end (%llu >= %llu)", + i, le64_to_cpu(e->start), le64_to_cpu(e->end)); + return -BCH_ERR_invalid_sb_journal_seq_blacklist; + } + + if (i + 1 < nr && + le64_to_cpu(e[0].end) > + le64_to_cpu(e[1].start)) { + prt_printf(err, "entry %u out of order with next entry (%llu > %llu)", + i + 1, le64_to_cpu(e[0].end), le64_to_cpu(e[1].start)); + return -BCH_ERR_invalid_sb_journal_seq_blacklist; + } + } + + return 0; +} + +static void bch2_sb_journal_seq_blacklist_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_journal_seq_blacklist *bl = + field_to_type(f, journal_seq_blacklist); + struct journal_seq_blacklist_entry *i; + unsigned nr = blacklist_nr_entries(bl); + + for (i = bl->start; i < bl->start + nr; i++) { + if (i != bl->start) + prt_printf(out, " "); + + prt_printf(out, "%llu-%llu", + le64_to_cpu(i->start), + le64_to_cpu(i->end)); + } + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist = { + .validate = bch2_sb_journal_seq_blacklist_validate, + .to_text = bch2_sb_journal_seq_blacklist_to_text +}; + +void bch2_blacklist_entries_gc(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + journal_seq_blacklist_gc_work); + struct journal_seq_blacklist_table *t; + struct bch_sb_field_journal_seq_blacklist *bl; + struct journal_seq_blacklist_entry *src, *dst; + struct btree_trans trans; + unsigned i, nr, new_nr; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_iter iter; + struct btree *b; + + bch2_trans_node_iter_init(&trans, &iter, i, POS_MIN, + 0, 0, BTREE_ITER_PREFETCH); +retry: + bch2_trans_begin(&trans); + + b = bch2_btree_iter_peek_node(&iter); + + while (!(ret = PTR_ERR_OR_ZERO(b)) && + b && + !test_bit(BCH_FS_STOPPING, &c->flags)) + b = bch2_btree_iter_next_node(&iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); + } + + bch2_trans_exit(&trans); + if (ret) + return; + + 
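/*
 * The walk above touched every btree node; as nodes are read, bsets that
 * still reference a blacklisted sequence number mark the corresponding
 * table entry dirty via bch2_journal_seq_is_blacklisted(c, seq, true).
 * Below, entries that stayed clean are no longer needed by any node and
 * are dropped from the superblock field.
 */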
mutex_lock(&c->sb_lock); + bl = bch2_sb_get_journal_seq_blacklist(c->disk_sb.sb); + if (!bl) + goto out; + + nr = blacklist_nr_entries(bl); + dst = bl->start; + + t = c->journal_seq_blacklist_table; + BUG_ON(nr != t->nr); + + for (src = bl->start, i = eytzinger0_first(t->nr); + src < bl->start + nr; + src++, i = eytzinger0_next(i, nr)) { + BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); + BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); + + if (t->entries[i].dirty) + *dst++ = *src; + } + + new_nr = dst - bl->start; + + bch_info(c, "nr blacklist entries was %u, now %u", nr, new_nr); + + if (new_nr != nr) { + bl = bch2_sb_resize_journal_seq_blacklist(&c->disk_sb, + new_nr ? sb_blacklist_u64s(new_nr) : 0); + BUG_ON(new_nr && !bl); + + if (!new_nr) + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_journal_seq_blacklist_v3)); + + bch2_write_super(c); + } +out: + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h new file mode 100644 index 000000000..afb886ec8 --- /dev/null +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H +#define _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H + +static inline unsigned +blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) +{ + return bl + ? ((vstruct_end(&bl->field) - (void *) &bl->start[0]) / + sizeof(struct journal_seq_blacklist_entry)) + : 0; +} + +bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); +int bch2_blacklist_table_initialize(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_journal_seq_blacklist; + +void bch2_blacklist_entries_gc(struct work_struct *); + +#endif /* _BCACHEFS_JOURNAL_SEQ_BLACKLIST_H */ diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h new file mode 100644 index 000000000..42504e16a --- /dev/null +++ b/fs/bcachefs/journal_types.h @@ -0,0 +1,345 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_JOURNAL_TYPES_H +#define _BCACHEFS_JOURNAL_TYPES_H + +#include +#include + +#include "alloc_types.h" +#include "super_types.h" +#include "fifo.h" + +#define JOURNAL_BUF_BITS 2 +#define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) +#define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) + +/* + * We put JOURNAL_BUF_NR of these in struct journal; we used them for writes to + * the journal that are being staged or in flight. + */ +struct journal_buf { + struct jset *data; + + __BKEY_PADDED(key, BCH_REPLICAS_MAX); + struct bch_devs_list devs_written; + + struct closure_waitlist wait; + u64 last_seq; /* copy of data->last_seq */ + long expires; + u64 flush_time; + + unsigned buf_size; /* size in bytes of @data */ + unsigned sectors; /* maximum size for current entry */ + unsigned disk_sectors; /* maximum size entry could have been, if + buf_size was bigger */ + unsigned u64s_reserved; + bool noflush; /* write has already been kicked off, and was noflush */ + bool must_flush; /* something wants a flush */ + bool separate_flush; +}; + +/* + * Something that makes a journal entry dirty - i.e. 
a btree node that has to be + * flushed: + */ + +enum journal_pin_type { + JOURNAL_PIN_btree, + JOURNAL_PIN_key_cache, + JOURNAL_PIN_other, + JOURNAL_PIN_NR, +}; + +struct journal_entry_pin_list { + struct list_head list[JOURNAL_PIN_NR]; + struct list_head flushed; + atomic_t count; + struct bch_devs_list devs; +}; + +struct journal; +struct journal_entry_pin; +typedef int (*journal_pin_flush_fn)(struct journal *j, + struct journal_entry_pin *, u64); + +struct journal_entry_pin { + struct list_head list; + journal_pin_flush_fn flush; + u64 seq; +}; + +struct journal_res { + bool ref; + u8 idx; + u16 u64s; + u32 offset; + u64 seq; +}; + +/* + * For reserving space in the journal prior to getting a reservation on a + * particular journal entry: + */ +struct journal_preres { + unsigned u64s; +}; + +union journal_res_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 cur_entry_offset:20, + idx:2, + unwritten_idx:2, + buf0_count:10, + buf1_count:10, + buf2_count:10, + buf3_count:10; + }; +}; + +union journal_preres_state { + struct { + atomic64_t counter; + }; + + struct { + u64 v; + }; + + struct { + u64 waiting:1, + reserved:31, + remaining:32; + }; +}; + +/* bytes: */ +#define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ +#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ + +/* + * We stash some journal state as sentinal values in cur_entry_offset: + * note - cur_entry_offset is in units of u64s + */ +#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) + +#define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) +#define JOURNAL_ENTRY_ERROR_VAL (JOURNAL_ENTRY_OFFSET_MAX) + +struct journal_space { + /* Units of 512 bytes sectors: */ + unsigned next_entry; /* How big the next journal entry can be */ + unsigned total; +}; + +enum journal_space_from { + journal_space_discarded, + journal_space_clean_ondisk, + journal_space_clean, + journal_space_total, + journal_space_nr, +}; + +enum journal_flags { + JOURNAL_REPLAY_DONE, + JOURNAL_STARTED, + JOURNAL_MAY_SKIP_FLUSH, + JOURNAL_NEED_FLUSH_WRITE, +}; + +/* Reasons we may fail to get a journal reservation: */ +#define JOURNAL_ERRORS() \ + x(ok) \ + x(blocked) \ + x(max_in_flight) \ + x(journal_full) \ + x(journal_pin_full) \ + x(journal_stuck) \ + x(insufficient_devices) + +enum journal_errors { +#define x(n) JOURNAL_ERR_##n, + JOURNAL_ERRORS() +#undef x +}; + +typedef DARRAY(u64) darray_u64; + +/* Embedded in struct bch_fs */ +struct journal { + /* Fastpath stuff up front: */ + struct { + + union journal_res_state reservations; + enum bch_watermark watermark; + + union journal_preres_state prereserved; + + } __aligned(SMP_CACHE_BYTES); + + unsigned long flags; + + /* Max size of current journal entry */ + unsigned cur_entry_u64s; + unsigned cur_entry_sectors; + + /* Reserved space in journal entry to be used just prior to write */ + unsigned entry_u64s_reserved; + + + /* + * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if + * insufficient devices: + */ + enum journal_errors cur_entry_error; + + unsigned buf_size_want; + /* + * We may queue up some things to be journalled (log messages) before + * the journal has actually started - stash them here: + */ + darray_u64 early_journal_entries; + + /* + * Two journal entries -- one is currently open for new entries, the + * other is possibly being written out. 
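+ * (struct journal holds JOURNAL_BUF_NR of these, so a new entry can be opened
+ * while older ones are still being staged or written out.)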
+ */ + struct journal_buf buf[JOURNAL_BUF_NR]; + + spinlock_t lock; + + /* if nonzero, we may not open a new journal entry: */ + unsigned blocked; + + /* Used when waiting because the journal was full */ + wait_queue_head_t wait; + struct closure_waitlist async_wait; + struct closure_waitlist preres_wait; + + struct closure io; + struct delayed_work write_work; + + /* Sequence number of most recent journal entry (last entry in @pin) */ + atomic64_t seq; + + /* seq, last_seq from the most recent journal entry successfully written */ + u64 seq_ondisk; + u64 flushed_seq_ondisk; + u64 last_seq_ondisk; + u64 err_seq; + u64 last_empty_seq; + + /* + * FIFO of journal entries whose btree updates have not yet been + * written out. + * + * Each entry is a reference count. The position in the FIFO is the + * entry's sequence number relative to @seq. + * + * The journal entry itself holds a reference count, put when the + * journal entry is written out. Each btree node modified by the journal + * entry also holds a reference count, put when the btree node is + * written. + * + * When a reference count reaches zero, the journal entry is no longer + * needed. When all journal entries in the oldest journal bucket are no + * longer needed, the bucket can be discarded and reused. + */ + struct { + u64 front, back, size, mask; + struct journal_entry_pin_list *data; + } pin; + + struct journal_space space[journal_space_nr]; + + u64 replay_journal_seq; + u64 replay_journal_seq_end; + + struct write_point wp; + spinlock_t err_lock; + + struct mutex reclaim_lock; + /* + * Used for waiting until journal reclaim has freed up space in the + * journal: + */ + wait_queue_head_t reclaim_wait; + struct task_struct *reclaim_thread; + bool reclaim_kicked; + unsigned long next_reclaim; + u64 nr_direct_reclaim; + u64 nr_background_reclaim; + + unsigned long last_flushed; + struct journal_entry_pin *flush_in_progress; + bool flush_in_progress_dropped; + wait_queue_head_t pin_flush_wait; + + /* protects advancing ja->discard_idx: */ + struct mutex discard_lock; + bool can_discard; + + unsigned long last_flush_write; + + u64 res_get_blocked_start; + u64 write_start_time; + + u64 nr_flush_writes; + u64 nr_noflush_writes; + + struct bch2_time_stats *flush_write_time; + struct bch2_time_stats *noflush_write_time; + struct bch2_time_stats *blocked_time; + struct bch2_time_stats *flush_seq_time; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map res_map; +#endif +} __aligned(SMP_CACHE_BYTES); + +/* + * Embedded in struct bch_dev. First three fields refer to the array of journal + * buckets, in bch_sb. + */ +struct journal_device { + /* + * For each journal bucket, contains the max sequence number of the + * journal writes it contains - so we know when a bucket can be reused. 
+ */ + u64 *bucket_seq; + + unsigned sectors_free; + + /* + * discard_idx <= dirty_idx_ondisk <= dirty_idx <= cur_idx: + */ + unsigned discard_idx; /* Next bucket to discard */ + unsigned dirty_idx_ondisk; + unsigned dirty_idx; + unsigned cur_idx; /* Journal bucket we're currently writing to */ + unsigned nr; + + u64 *buckets; + + /* Bio for journal reads/writes to this device */ + struct bio *bio; + + /* for bch_journal_read_device */ + struct closure read; +}; + +/* + * journal_entry_res - reserve space in every journal entry: + */ +struct journal_entry_res { + unsigned u64s; +}; + +#endif /* _BCACHEFS_JOURNAL_TYPES_H */ diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c new file mode 100644 index 000000000..5699cd487 --- /dev/null +++ b/fs/bcachefs/keylist.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey.h" +#include "keylist.h" + +int bch2_keylist_realloc(struct keylist *l, u64 *inline_u64s, + size_t nr_inline_u64s, size_t new_u64s) +{ + size_t oldsize = bch2_keylist_u64s(l); + size_t newsize = oldsize + new_u64s; + u64 *old_buf = l->keys_p == inline_u64s ? NULL : l->keys_p; + u64 *new_keys; + + newsize = roundup_pow_of_two(newsize); + + if (newsize <= nr_inline_u64s || + (old_buf && roundup_pow_of_two(oldsize) == newsize)) + return 0; + + new_keys = krealloc(old_buf, sizeof(u64) * newsize, GFP_NOFS); + if (!new_keys) + return -ENOMEM; + + if (!old_buf) + memcpy_u64s(new_keys, inline_u64s, oldsize); + + l->keys_p = new_keys; + l->top_p = new_keys + oldsize; + + return 0; +} + +void bch2_keylist_pop_front(struct keylist *l) +{ + l->top_p -= bch2_keylist_front(l)->k.u64s; + + memmove_u64s_down(l->keys, + bkey_next(l->keys), + bch2_keylist_u64s(l)); +} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_verify_keylist_sorted(struct keylist *l) +{ + struct bkey_i *k; + + for_each_keylist_key(l, k) + BUG_ON(bkey_next(k) != l->top && + bpos_ge(k->k.p, bkey_next(k)->k.p)); +} +#endif diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h new file mode 100644 index 000000000..fe759c703 --- /dev/null +++ b/fs/bcachefs/keylist.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_KEYLIST_H +#define _BCACHEFS_KEYLIST_H + +#include "keylist_types.h" + +int bch2_keylist_realloc(struct keylist *, u64 *, size_t, size_t); +void bch2_keylist_pop_front(struct keylist *); + +static inline void bch2_keylist_init(struct keylist *l, u64 *inline_keys) +{ + l->top_p = l->keys_p = inline_keys; +} + +static inline void bch2_keylist_free(struct keylist *l, u64 *inline_keys) +{ + if (l->keys_p != inline_keys) + kfree(l->keys_p); +} + +static inline void bch2_keylist_push(struct keylist *l) +{ + l->top = bkey_next(l->top); +} + +static inline void bch2_keylist_add(struct keylist *l, const struct bkey_i *k) +{ + bkey_copy(l->top, k); + bch2_keylist_push(l); +} + +static inline bool bch2_keylist_empty(struct keylist *l) +{ + return l->top == l->keys; +} + +static inline size_t bch2_keylist_u64s(struct keylist *l) +{ + return l->top_p - l->keys_p; +} + +static inline size_t bch2_keylist_bytes(struct keylist *l) +{ + return bch2_keylist_u64s(l) * sizeof(u64); +} + +static inline struct bkey_i *bch2_keylist_front(struct keylist *l) +{ + return l->keys; +} + +#define for_each_keylist_key(_keylist, _k) \ + for (_k = (_keylist)->keys; \ + _k != (_keylist)->top; \ + _k = bkey_next(_k)) + +static inline u64 keylist_sectors(struct keylist *keys) +{ + struct bkey_i *k; + u64 ret = 0; + + for_each_keylist_key(keys, k) + ret += k->k.size; + + return ret; 
+} + +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_verify_keylist_sorted(struct keylist *); +#else +static inline void bch2_verify_keylist_sorted(struct keylist *l) {} +#endif + +#endif /* _BCACHEFS_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h new file mode 100644 index 000000000..4b3ff7d8a --- /dev/null +++ b/fs/bcachefs/keylist_types.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_KEYLIST_TYPES_H +#define _BCACHEFS_KEYLIST_TYPES_H + +struct keylist { + union { + struct bkey_i *keys; + u64 *keys_p; + }; + union { + struct bkey_i *top; + u64 *top_p; + }; +}; + +#endif /* _BCACHEFS_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c new file mode 100644 index 000000000..07d192953 --- /dev/null +++ b/fs/bcachefs/lru.c @@ -0,0 +1,178 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "error.h" +#include "lru.h" +#include "recovery.h" + +/* KEY_TYPE_lru is obsolete: */ +int bch2_lru_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (!lru_pos_time(k.k->p)) { + prt_printf(err, "lru entry at time=0"); + return -BCH_ERR_invalid_bkey; + + } + + return 0; +} + +void bch2_lru_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct bch_lru *lru = bkey_s_c_to_lru(k).v; + + prt_printf(out, "idx %llu", le64_to_cpu(lru->idx)); +} + +void bch2_lru_pos_to_text(struct printbuf *out, struct bpos lru) +{ + prt_printf(out, "%llu:%llu -> %llu:%llu", + lru_pos_id(lru), + lru_pos_time(lru), + u64_to_bucket(lru.offset).inode, + u64_to_bucket(lru.offset).offset); +} + +static int __bch2_lru_set(struct btree_trans *trans, u16 lru_id, + u64 dev_bucket, u64 time, unsigned key_type) +{ + struct bkey_i *k; + int ret = 0; + + if (!time) + return 0; + + k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k)); + ret = PTR_ERR_OR_ZERO(k); + if (unlikely(ret)) + return ret; + + bkey_init(&k->k); + k->k.type = key_type; + k->k.p = lru_pos(lru_id, dev_bucket, time); + + EBUG_ON(lru_pos_id(k->k.p) != lru_id); + EBUG_ON(lru_pos_time(k->k.p) != time); + EBUG_ON(k->k.p.offset != dev_bucket); + + return bch2_trans_update_buffered(trans, BTREE_ID_lru, k); +} + +int bch2_lru_del(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) +{ + return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_deleted); +} + +int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time) +{ + return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); +} + +int bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) +{ + if (old_time == new_time) + return 0; + + return bch2_lru_del(trans, lru_id, dev_bucket, old_time) ?: + bch2_lru_set(trans, lru_id, dev_bucket, new_time); +} + +static const char * const bch2_lru_types[] = { +#define x(n) #n, + BCH_LRU_TYPES() +#undef x + NULL +}; + +static int bch2_check_lru_key(struct btree_trans *trans, + struct btree_iter *lru_iter, + struct bkey_s_c lru_k, + struct bpos *last_flushed_pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + enum bch_lru_type type = lru_type(lru_k); + struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); + u64 idx; 
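+ /*
+ * idx is the LRU position this bucket's alloc key says it should occupy;
+ * the checks below compare it against the position of the lru key itself.
+ */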
+ int ret; + + if (fsck_err_on(!bch2_dev_bucket_exists(c, alloc_pos), c, + "lru key points to nonexistent device:bucket %llu:%llu", + alloc_pos.inode, alloc_pos.offset)) + return bch2_btree_delete_at(trans, lru_iter, 0); + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); + ret = bkey_err(k); + if (ret) + goto err; + + a = bch2_alloc_to_v4(k, &a_convert); + + switch (type) { + case BCH_LRU_read: + idx = alloc_lru_idx_read(*a); + break; + case BCH_LRU_fragmentation: + idx = a->fragmentation_lru; + break; + } + + if (lru_k.k->type != KEY_TYPE_set || + lru_pos_time(lru_k.k->p) != idx) { + if (!bpos_eq(*last_flushed_pos, lru_k.k->p)) { + *last_flushed_pos = lru_k.k->p; + ret = bch2_btree_write_buffer_flush_sync(trans) ?: + -BCH_ERR_transaction_restart_write_buffer_flush; + goto out; + } + + if (c->opts.reconstruct_alloc || + fsck_err(c, "incorrect lru entry: lru %s time %llu\n" + " %s\n" + " for %s", + bch2_lru_types[type], + lru_pos_time(lru_k.k->p), + (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), + (bch2_bkey_val_to_text(&buf2, c, k), buf2.buf))) + ret = bch2_btree_delete_at(trans, lru_iter, 0); + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &iter); + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +int bch2_check_lrus(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bpos last_flushed_pos = POS_MIN; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW, + bch2_check_lru_key(&trans, &iter, k, &last_flushed_pos))); + if (ret) + bch_err_fn(c, ret); + return ret; + +} diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h new file mode 100644 index 000000000..7a3be20a8 --- /dev/null +++ b/fs/bcachefs/lru.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_LRU_H +#define _BCACHEFS_LRU_H + +#define LRU_TIME_BITS 48 +#define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) + +static inline struct bpos lru_pos(u16 lru_id, u64 dev_bucket, u64 time) +{ + EBUG_ON(time > LRU_TIME_MAX); + + return POS(((u64) lru_id << LRU_TIME_BITS)|time, dev_bucket); +} + +static inline u64 lru_pos_id(struct bpos pos) +{ + return pos.inode >> LRU_TIME_BITS; +} + +static inline u64 lru_pos_time(struct bpos pos) +{ + return pos.inode & ~(~0ULL << LRU_TIME_BITS); +} + +#define BCH_LRU_TYPES() \ + x(read) \ + x(fragmentation) + +enum bch_lru_type { +#define x(n) BCH_LRU_##n, + BCH_LRU_TYPES() +#undef x +}; + +#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) + +static inline enum bch_lru_type lru_type(struct bkey_s_c l) +{ + u16 lru_id = l.k->p.inode >> 48; + + if (lru_id == BCH_LRU_FRAGMENTATION_START) + return BCH_LRU_fragmentation; + return BCH_LRU_read; +} + +int bch2_lru_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_lru_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +void bch2_lru_pos_to_text(struct printbuf *, struct bpos); + +#define bch2_bkey_ops_lru ((struct bkey_ops) { \ + .key_invalid = bch2_lru_invalid, \ + .val_to_text = bch2_lru_to_text, \ + .min_val_size = 8, \ +}) + +int bch2_lru_del(struct btree_trans *, u16, u64, u64); +int bch2_lru_set(struct btree_trans *, u16, u64, u64); +int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); + +int bch2_check_lrus(struct bch_fs *); + +#endif /* _BCACHEFS_LRU_H */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c new file mode 100644 index 
000000000..81c8cdbac --- /dev/null +++ b/fs/bcachefs/migrate.c @@ -0,0 +1,182 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Code for moving data off a device. + */ + +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "errcode.h" +#include "extents.h" +#include "io.h" +#include "journal.h" +#include "keylist.h" +#include "migrate.h" +#include "move.h" +#include "replicas.h" +#include "super-io.h" + +static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, + unsigned dev_idx, int flags, bool metadata) +{ + unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; + unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; + unsigned degraded = metadata ? BCH_FORCE_IF_METADATA_DEGRADED : BCH_FORCE_IF_DATA_DEGRADED; + unsigned nr_good; + + bch2_bkey_drop_device(k, dev_idx); + + nr_good = bch2_bkey_durability(c, k.s_c); + if ((!nr_good && !(flags & lost)) || + (nr_good < replicas && !(flags & degraded))) + return -EINVAL; + + return 0; +} + +static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, + int flags) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + if (!bch2_bkey_has_device_c(k, dev_idx)) + return 0; + + n = bch2_bkey_make_mut(trans, iter, &k, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + ret = drop_dev_ptrs(c, bkey_i_to_s(n), dev_idx, flags, false); + if (ret) + return ret; + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k)) + n->k.size = 0; + return 0; +} + +static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + enum btree_id id; + int ret = 0; + + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 0); + + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_ptrs(id)) + continue; + + ret = for_each_btree_key_commit(&trans, iter, id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_dev_usrdata_drop_key(&trans, &iter, k, dev_idx, flags)); + if (ret) + break; + } + + bch2_trans_exit(&trans); + + return ret; +} + +static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + struct btree_trans trans; + struct btree_iter iter; + struct closure cl; + struct btree *b; + struct bkey_buf k; + unsigned id; + int ret; + + /* don't handle this yet: */ + if (flags & BCH_FORCE_IF_METADATA_LOST) + return -EINVAL; + + bch2_bkey_buf_init(&k); + bch2_trans_init(&trans, c, 0, 0); + closure_init_stack(&cl); + + for (id = 0; id < BTREE_ID_NR; id++) { + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); +retry: + ret = 0; + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) + goto next; + + bch2_bkey_buf_copy(&k, c, 
&b->key); + + ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), + dev_idx, flags, true); + if (ret) { + bch_err(c, "Cannot drop device without losing data"); + break; + } + + ret = bch2_btree_node_update_key(&trans, &iter, b, k.k, 0, false); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + ret = 0; + continue; + } + + if (ret) { + bch_err(c, "Error updating btree node key: %s", + bch2_err_str(ret)); + break; + } +next: + bch2_btree_iter_next_node(&iter); + } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); + + if (ret) + goto err; + } + + bch2_btree_interior_updates_flush(c); + ret = 0; +err: + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&k, c); + + BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); + + return ret; +} + +int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) +{ + return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: + bch2_dev_metadata_drop(c, dev_idx, flags); +} diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h new file mode 100644 index 000000000..027efaa0d --- /dev/null +++ b/fs/bcachefs/migrate.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MIGRATE_H +#define _BCACHEFS_MIGRATE_H + +int bch2_dev_data_drop(struct bch_fs *, unsigned, int); + +#endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c new file mode 100644 index 000000000..052726739 --- /dev/null +++ b/fs/bcachefs/move.c @@ -0,0 +1,1168 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "backpointers.h" +#include "bkey_buf.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_write_buffer.h" +#include "disk_groups.h" +#include "ec.h" +#include "errcode.h" +#include "error.h" +#include "inode.h" +#include "io.h" +#include "journal_reclaim.h" +#include "keylist.h" +#include "move.h" +#include "replicas.h" +#include "super-io.h" +#include "trace.h" + +#include +#include + +static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_read_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_read(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void trace_move_extent_alloc_mem_fail2(struct bch_fs *c, struct bkey_s_c k) +{ + if (trace_move_extent_alloc_mem_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_bkey_val_to_text(&buf, c, k); + trace_move_extent_alloc_mem_fail(c, buf.buf); + printbuf_exit(&buf); + } +} + +static void progress_list_add(struct bch_fs *c, struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_add(&stats->list, &c->data_progress_list); + mutex_unlock(&c->data_progress_lock); +} + +static void progress_list_del(struct bch_fs *c, struct bch_move_stats *stats) +{ + mutex_lock(&c->data_progress_lock); + list_del(&stats->list); + mutex_unlock(&c->data_progress_lock); +} + +struct moving_io { + struct list_head read_list; + struct list_head io_list; + struct move_bucket_in_flight *b; + struct closure cl; + bool read_completed; + + unsigned read_sectors; + unsigned write_sectors; + + struct bch_read_bio rbio; + + struct data_update write; 
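+ /*
+ * (In bch2_move_extent() both rbio.bio and write.op.wbio.bio are initialized
+ * on top of bi_inline_vecs below, so a single inline bvec array backs the
+ * read and the subsequent write.)
+ */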
+ /* Must be last since it is variable size */ + struct bio_vec bi_inline_vecs[0]; +}; + +static void move_free(struct moving_io *io) +{ + struct moving_context *ctxt = io->write.ctxt; + + if (io->b) + atomic_dec(&io->b->count); + + bch2_data_update_exit(&io->write); + + mutex_lock(&ctxt->lock); + list_del(&io->io_list); + wake_up(&ctxt->wait); + mutex_unlock(&ctxt->lock); + + kfree(io); +} + +static void move_write_done(struct bch_write_op *op) +{ + struct moving_io *io = container_of(op, struct moving_io, write.op); + struct moving_context *ctxt = io->write.ctxt; + + if (io->write.op.error) + ctxt->write_error = true; + + atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_dec(&io->write.ctxt->write_ios); + move_free(io); + closure_put(&ctxt->cl); +} + +static void move_write(struct moving_io *io) +{ + if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { + move_free(io); + return; + } + + closure_get(&io->write.ctxt->cl); + atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); + atomic_inc(&io->write.ctxt->write_ios); + + bch2_data_update_read_done(&io->write, io->rbio.pick.crc); +} + +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) +{ + struct moving_io *io = + list_first_entry_or_null(&ctxt->reads, struct moving_io, read_list); + + return io && io->read_completed ? io : NULL; +} + +static void move_read_endio(struct bio *bio) +{ + struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_context *ctxt = io->write.ctxt; + + atomic_sub(io->read_sectors, &ctxt->read_sectors); + atomic_dec(&ctxt->read_ios); + io->read_completed = true; + + wake_up(&ctxt->wait); + closure_put(&ctxt->cl); +} + +void bch2_moving_ctxt_do_pending_writes(struct moving_context *ctxt, + struct btree_trans *trans) +{ + struct moving_io *io; + + if (trans) + bch2_trans_unlock(trans); + + while ((io = bch2_moving_ctxt_next_pending_write(ctxt))) { + list_del(&io->read_list); + move_write(io); + } +} + +static void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt, + struct btree_trans *trans) +{ + unsigned sectors_pending = atomic_read(&ctxt->write_sectors); + + move_ctxt_wait_event(ctxt, trans, + !atomic_read(&ctxt->write_sectors) || + atomic_read(&ctxt->write_sectors) != sectors_pending); +} + +void bch2_moving_ctxt_exit(struct moving_context *ctxt) +{ + struct bch_fs *c = ctxt->c; + + move_ctxt_wait_event(ctxt, NULL, list_empty(&ctxt->reads)); + closure_sync(&ctxt->cl); + + EBUG_ON(atomic_read(&ctxt->write_sectors)); + EBUG_ON(atomic_read(&ctxt->write_ios)); + EBUG_ON(atomic_read(&ctxt->read_sectors)); + EBUG_ON(atomic_read(&ctxt->read_ios)); + + if (ctxt->stats) { + progress_list_del(c, ctxt->stats); + trace_move_data(c, + atomic64_read(&ctxt->stats->sectors_moved), + atomic64_read(&ctxt->stats->keys_moved)); + } + + mutex_lock(&c->moving_context_lock); + list_del(&ctxt->list); + mutex_unlock(&c->moving_context_lock); +} + +void bch2_moving_ctxt_init(struct moving_context *ctxt, + struct bch_fs *c, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc) +{ + memset(ctxt, 0, sizeof(*ctxt)); + + ctxt->c = c; + ctxt->fn = (void *) _RET_IP_; + ctxt->rate = rate; + ctxt->stats = stats; + ctxt->wp = wp; + ctxt->wait_on_copygc = wait_on_copygc; + + closure_init_stack(&ctxt->cl); + + mutex_init(&ctxt->lock); + INIT_LIST_HEAD(&ctxt->reads); + INIT_LIST_HEAD(&ctxt->ios); + init_waitqueue_head(&ctxt->wait); + + mutex_lock(&c->moving_context_lock); + 
list_add(&ctxt->list, &c->moving_context_list); + mutex_unlock(&c->moving_context_lock); + + if (stats) { + progress_list_add(c, stats); + stats->data_type = BCH_DATA_user; + } +} + +void bch2_move_stats_init(struct bch_move_stats *stats, char *name) +{ + memset(stats, 0, sizeof(*stats)); + scnprintf(stats->name, sizeof(stats->name), "%s", name); +} + +static int bch2_extent_drop_ptrs(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct data_update_opts data_opts) +{ + struct bch_fs *c = trans->c; + struct bkey_i *n; + int ret; + + n = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + return ret; + + while (data_opts.kill_ptrs) { + unsigned i = 0, drop = __fls(data_opts.kill_ptrs); + struct bch_extent_ptr *ptr; + + bch2_bkey_drop_ptrs(bkey_i_to_s(n), ptr, i++ == drop); + data_opts.kill_ptrs ^= 1U << drop; + } + + /* + * If the new extent no longer has any pointers, bch2_extent_normalize() + * will do the appropriate thing with it (turning it into a + * KEY_TYPE_error key, or just a discard if it was a cached extent) + */ + bch2_extent_normalize(c, bkey_i_to_s(n)); + + /* + * Since we're not inserting through an extent iterator + * (BTREE_ITER_ALL_SNAPSHOTS iterators aren't extent iterators), + * we aren't using the extent overwrite path to delete, we're + * just using the normal key deletion path: + */ + if (bkey_deleted(&n->k)) + n->k.size = 0; + + return bch2_trans_relock(trans) ?: + bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?: + bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL); +} + +static int bch2_move_extent(struct btree_trans *trans, + struct btree_iter *iter, + struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct bch_io_opts io_opts, + enum btree_id btree_id, + struct bkey_s_c k, + struct data_update_opts data_opts) +{ + struct bch_fs *c = trans->c; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + struct moving_io *io; + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned sectors = k.k->size, pages; + int ret = -ENOMEM; + + trace_move_extent2(c, k); + + bch2_data_update_opts_normalize(k, &data_opts); + + if (!data_opts.rewrite_ptrs && + !data_opts.extra_replicas) { + if (data_opts.kill_ptrs) + return bch2_extent_drop_ptrs(trans, iter, k, data_opts); + return 0; + } + + /* + * Before memory allocations & taking nocow locks in + * bch2_data_update_init(): + */ + bch2_trans_unlock(trans); + + /* write path might have to decompress data: */ + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); + + pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); + io = kzalloc(sizeof(struct moving_io) + + sizeof(struct bio_vec) * pages, GFP_KERNEL); + if (!io) + goto err; + + INIT_LIST_HEAD(&io->io_list); + io->write.ctxt = ctxt; + io->read_sectors = k.k->size; + io->write_sectors = k.k->size; + + bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); + bio_set_prio(&io->write.op.wbio.bio, + IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + + if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, + GFP_KERNEL)) + goto err_free; + + io->rbio.c = c; + io->rbio.opts = io_opts; + bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); + io->rbio.bio.bi_vcnt = pages; + bio_set_prio(&io->rbio.bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)); + io->rbio.bio.bi_iter.bi_size = sectors << 9; + + io->rbio.bio.bi_opf = REQ_OP_READ; + io->rbio.bio.bi_iter.bi_sector = 
bkey_start_offset(k.k); + io->rbio.bio.bi_end_io = move_read_endio; + + ret = bch2_data_update_init(trans, ctxt, &io->write, ctxt->wp, + io_opts, data_opts, btree_id, k); + if (ret && ret != -BCH_ERR_unwritten_extent_update) + goto err_free_pages; + + if (ret == -BCH_ERR_unwritten_extent_update) { + bch2_update_unwritten_extent(trans, &io->write); + move_free(io); + return 0; + } + + BUG_ON(ret); + + io->write.ctxt = ctxt; + io->write.op.end_io = move_write_done; + + if (ctxt->stats) { + atomic64_inc(&ctxt->stats->keys_moved); + atomic64_add(k.k->size, &ctxt->stats->sectors_moved); + } + + if (bucket_in_flight) { + io->b = bucket_in_flight; + atomic_inc(&io->b->count); + } + + this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); + this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); + trace_move_extent_read2(c, k); + + mutex_lock(&ctxt->lock); + atomic_add(io->read_sectors, &ctxt->read_sectors); + atomic_inc(&ctxt->read_ios); + + list_add_tail(&io->read_list, &ctxt->reads); + list_add_tail(&io->io_list, &ctxt->ios); + mutex_unlock(&ctxt->lock); + + /* + * dropped by move_read_endio() - guards against use after free of + * ctxt when doing wakeup + */ + closure_get(&ctxt->cl); + bch2_read_extent(trans, &io->rbio, + bkey_start_pos(k.k), + btree_id, k, 0, + BCH_READ_NODECODE| + BCH_READ_LAST_FRAGMENT); + return 0; +err_free_pages: + bio_free_pages(&io->write.op.wbio.bio); +err_free: + kfree(io); +err: + this_cpu_inc(c->counters[BCH_COUNTER_move_extent_alloc_mem_fail]); + trace_move_extent_alloc_mem_fail2(c, k); + return ret; +} + +static int lookup_inode(struct btree_trans *trans, struct bpos pos, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, pos, + BTREE_ITER_ALL_SNAPSHOTS); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !bkey_eq(k.k->p, pos)) { + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bkey_is_inode(k.k) ? 0 : -EIO; + if (ret) + goto err; + + ret = bch2_inode_unpack(k, inode); + if (ret) + goto err; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int move_ratelimit(struct btree_trans *trans, + struct moving_context *ctxt) +{ + struct bch_fs *c = trans->c; + u64 delay; + + if (ctxt->wait_on_copygc) { + bch2_trans_unlock(trans); + wait_event_killable(c->copygc_running_wq, + !c->copygc_running || + kthread_should_stop()); + } + + do { + delay = ctxt->rate ? 
bch2_ratelimit_delay(ctxt->rate) : 0; + + if (delay) { + bch2_trans_unlock(trans); + set_current_state(TASK_INTERRUPTIBLE); + } + + if ((current->flags & PF_KTHREAD) && kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 1; + } + + if (delay) + schedule_timeout(delay); + + if (unlikely(freezing(current))) { + move_ctxt_wait_event(ctxt, trans, list_empty(&ctxt->reads)); + try_to_freeze(); + } + } while (delay); + + /* + * XXX: these limits really ought to be per device, SSDs and hard drives + * will want different limits + */ + move_ctxt_wait_event(ctxt, trans, + atomic_read(&ctxt->write_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->read_sectors) < c->opts.move_bytes_in_flight >> 9 && + atomic_read(&ctxt->write_ios) < c->opts.move_ios_in_flight && + atomic_read(&ctxt->read_ios) < c->opts.move_ios_in_flight); + + return 0; +} + +static int move_get_io_opts(struct btree_trans *trans, + struct bch_io_opts *io_opts, + struct bkey_s_c k, u64 *cur_inum) +{ + struct bch_inode_unpacked inode; + int ret; + + if (*cur_inum == k.k->p.inode) + return 0; + + ret = lookup_inode(trans, + SPOS(0, k.k->p.inode, k.k->p.snapshot), + &inode); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + + if (!ret) + bch2_inode_opts_get(io_opts, trans->c, &inode); + else + *io_opts = bch2_opts_to_inode_opts(trans->c->opts); + *cur_inum = k.k->p.inode; + return 0; +} + +static int __bch2_move_data(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id) +{ + struct bch_fs *c = ctxt->c; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct bkey_buf sk; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct data_update_opts data_opts; + u64 cur_inum = U64_MAX; + int ret = 0, ret2; + + bch2_bkey_buf_init(&sk); + bch2_trans_init(&trans, c, 0, 0); + + if (ctxt->stats) { + ctxt->stats->data_type = BCH_DATA_user; + ctxt->stats->btree_id = btree_id; + ctxt->stats->pos = start; + } + + bch2_trans_iter_init(&trans, &iter, btree_id, start, + BTREE_ITER_PREFETCH| + BTREE_ITER_ALL_SNAPSHOTS); + + if (ctxt->rate) + bch2_ratelimit_reset(ctxt->rate); + + while (!move_ratelimit(&trans, ctxt)) { + bch2_trans_begin(&trans); + + k = bch2_btree_iter_peek(&iter); + if (!k.k) + break; + + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + break; + + if (bkey_ge(bkey_start_pos(k.k), end)) + break; + + if (ctxt->stats) + ctxt->stats->pos = iter.pos; + + if (!bkey_extent_is_direct_data(k.k)) + goto next_nondata; + + ret = move_get_io_opts(&trans, &io_opts, k, &cur_inum); + if (ret) + continue; + + memset(&data_opts, 0, sizeof(data_opts)); + if (!pred(c, arg, k, &io_opts, &data_opts)) + goto next; + + /* + * The iterator gets unlocked by __bch2_read_extent - need to + * save a copy of @k elsewhere: + */ + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret2 = bch2_move_extent(&trans, &iter, ctxt, NULL, + io_opts, btree_id, k, data_opts); + if (ret2) { + if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) + continue; + + if (ret2 == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt, &trans); + continue; + } + + /* XXX signal failure */ + goto next; + } + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); +next: + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); +next_nondata: + 
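+ /* advance past this key; keys with no data pointers jump straight here */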
bch2_btree_iter_advance(&iter); + } + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&sk, c); + + return ret; +} + +int bch2_move_data(struct bch_fs *c, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ + struct moving_context ctxt; + enum btree_id id; + int ret; + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); + id++) { + stats->btree_id = id; + + if (id != BTREE_ID_extents && + id != BTREE_ID_reflink) + continue; + + if (!bch2_btree_id_root(c, id)->b) + continue; + + ret = __bch2_move_data(&ctxt, + id == start_btree_id ? start_pos : POS_MIN, + id == end_btree_id ? end_pos : POS_MAX, + pred, arg, id); + if (ret) + break; + } + + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + +int __bch2_evacuate_bucket(struct btree_trans *trans, + struct moving_context *ctxt, + struct move_bucket_in_flight *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts _data_opts) +{ + struct bch_fs *c = ctxt->c; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_iter iter; + struct bkey_buf sk; + struct bch_backpointer bp; + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + struct bkey_s_c k; + struct data_update_opts data_opts; + unsigned dirty_sectors, bucket_size; + u64 fragmentation; + u64 cur_inum = U64_MAX; + struct bpos bp_pos = POS_MIN; + int ret = 0; + + trace_bucket_evacuate(c, &bucket); + + bch2_bkey_buf_init(&sk); + + /* + * We're not run in a context that handles transaction restarts: + */ + bch2_trans_begin(trans); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_alloc, + bucket, BTREE_ITER_CACHED); + ret = lockrestart_do(trans, + bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + bch2_trans_iter_exit(trans, &iter); + + if (ret) { + bch_err_msg(c, ret, "looking up alloc key"); + goto err; + } + + a = bch2_alloc_to_v4(k, &a_convert); + dirty_sectors = a->dirty_sectors; + bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size; + fragmentation = a->fragmentation_lru; + + ret = bch2_btree_write_buffer_flush(trans); + if (ret) { + bch_err_msg(c, ret, "flushing btree write buffer"); + goto err; + } + + while (!(ret = move_ratelimit(trans, ctxt))) { + bch2_trans_begin(trans); + + ret = bch2_get_next_backpointer(trans, bucket, gen, + &bp_pos, &bp, + BTREE_ITER_CACHED); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (bkey_eq(bp_pos, POS_MAX)) + break; + + if (!bp.level) { + const struct bch_extent_ptr *ptr; + struct bkey_s_c k; + unsigned i = 0; + + k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!k.k) + goto next; + + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); + + ret = move_get_io_opts(trans, &io_opts, k, &cur_inum); + if (ret) { + bch2_trans_iter_exit(trans, &iter); + continue; + } + + data_opts = _data_opts; + data_opts.target = io_opts.background_target; + data_opts.rewrite_ptrs = 0; + + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (ptr->dev == bucket.inode) { + data_opts.rewrite_ptrs |= 1U << i; + if (ptr->cached) { + bch2_trans_iter_exit(trans, &iter); + 
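+ /*
+ * A cached pointer into the bucket being evacuated doesn't need its
+ * data moved - skip the extent and let the cached copy be dropped
+ * along with the bucket.
+ */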
goto next; + } + } + i++; + } + + ret = bch2_move_extent(trans, &iter, ctxt, + bucket_in_flight, + io_opts, bp.btree_id, k, data_opts); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt, trans); + continue; + } + if (ret) + goto err; + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, k.k->size); + if (ctxt->stats) + atomic64_add(k.k->size, &ctxt->stats->sectors_seen); + } else { + struct btree *b; + + b = bch2_backpointer_get_node(trans, &iter, bp_pos, bp); + ret = PTR_ERR_OR_ZERO(b); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + continue; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!b) + goto next; + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0); + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + + if (ctxt->rate) + bch2_ratelimit_increment(ctxt->rate, + c->opts.btree_node_size >> 9); + if (ctxt->stats) { + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen); + atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved); + } + } +next: + bp_pos = bpos_nosnap_successor(bp_pos); + } + + trace_evacuate_bucket(c, &bucket, dirty_sectors, bucket_size, fragmentation, ret); +err: + bch2_bkey_buf_exit(&sk, c); + return ret; +} + +int bch2_evacuate_bucket(struct bch_fs *c, + struct bpos bucket, int gen, + struct data_update_opts data_opts, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc) +{ + struct btree_trans trans; + struct moving_context ctxt; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + ret = __bch2_evacuate_bucket(&trans, &ctxt, NULL, bucket, gen, data_opts); + bch2_moving_ctxt_exit(&ctxt); + bch2_trans_exit(&trans); + + return ret; +} + +typedef bool (*move_btree_pred)(struct bch_fs *, void *, + struct btree *, struct bch_io_opts *, + struct data_update_opts *); + +static int bch2_move_btree(struct bch_fs *c, + enum btree_id start_btree_id, struct bpos start_pos, + enum btree_id end_btree_id, struct bpos end_pos, + move_btree_pred pred, void *arg, + struct bch_move_stats *stats) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + struct bch_io_opts io_opts = bch2_opts_to_inode_opts(c->opts); + struct btree_trans trans; + struct btree_iter iter; + struct btree *b; + enum btree_id id; + struct data_update_opts data_opts; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + progress_list_add(c, stats); + + stats->data_type = BCH_DATA_btree; + + for (id = start_btree_id; + id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1); + id++) { + stats->btree_id = id; + + if (!bch2_btree_id_root(c, id)->b) + continue; + + bch2_trans_node_iter_init(&trans, &iter, id, POS_MIN, 0, 0, + BTREE_ITER_PREFETCH); +retry: + ret = 0; + while (bch2_trans_begin(&trans), + (b = bch2_btree_iter_peek_node(&iter)) && + !(ret = PTR_ERR_OR_ZERO(b))) { + if (kthread && kthread_should_stop()) + break; + + if ((cmp_int(id, end_btree_id) ?: + bpos_cmp(b->key.k.p, end_pos)) > 0) + break; + + stats->pos = iter.pos; + + if (!pred(c, arg, b, &io_opts, &data_opts)) + goto next; + + ret = bch2_btree_node_rewrite(&trans, &iter, b, 0) ?: ret; + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + 
continue; + if (ret) + break; +next: + bch2_btree_iter_next_node(&iter); + } + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_iter_exit(&trans, &iter); + + if (kthread && kthread_should_stop()) + break; + } + + bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); + + bch2_btree_interior_updates_flush(c); + + progress_list_del(c, stats); + return ret; +} + +static bool rereplicate_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + unsigned nr_good = bch2_bkey_durability(c, k); + unsigned replicas = bkey_is_btree_ptr(k.k) + ? c->opts.metadata_replicas + : io_opts->data_replicas; + + if (!nr_good || nr_good >= replicas) + return false; + + data_opts->target = 0; + data_opts->extra_replicas = replicas - nr_good; + data_opts->btree_insert_flags = 0; + return true; +} + +static bool migrate_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const struct bch_extent_ptr *ptr; + struct bch_ioctl_data *op = arg; + unsigned i = 0; + + data_opts->rewrite_ptrs = 0; + data_opts->target = 0; + data_opts->extra_replicas = 0; + data_opts->btree_insert_flags = 0; + + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == op->migrate.dev) + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + + return data_opts->rewrite_ptrs != 0; +} + +static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static bool migrate_btree_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); +} + +static bool bformat_needs_redo(struct bkey_format *f) +{ + unsigned i; + + for (i = 0; i < f->nr_fields; i++) { + unsigned unpacked_bits = bch2_bkey_format_current.bits_per_field[i]; + u64 unpacked_mask = ~((~0ULL << 1) << (unpacked_bits - 1)); + u64 field_offset = le64_to_cpu(f->field_offset[i]); + + if (f->bits_per_field[i] > unpacked_bits) + return true; + + if ((f->bits_per_field[i] == unpacked_bits) && field_offset) + return true; + + if (((field_offset + ((1ULL << f->bits_per_field[i]) - 1)) & + unpacked_mask) < + field_offset) + return true; + } + + return false; +} + +static bool rewrite_old_nodes_pred(struct bch_fs *c, void *arg, + struct btree *b, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + if (b->version_ondisk != c->sb.version || + btree_node_need_rewrite(b) || + bformat_needs_redo(&b->format)) { + data_opts->target = 0; + data_opts->extra_replicas = 0; + data_opts->btree_insert_flags = 0; + return true; + } + + return false; +} + +int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) +{ + int ret; + + ret = bch2_move_btree(c, + 0, POS_MIN, + BTREE_ID_NR, SPOS_MAX, + rewrite_old_nodes_pred, c, stats); + if (!ret) { + mutex_lock(&c->sb_lock); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + c->disk_sb.sb->version_min = c->disk_sb.sb->version; + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +int bch2_data_job(struct bch_fs 
*c, + struct bch_move_stats *stats, + struct bch_ioctl_data op) +{ + int ret = 0; + + switch (op.op) { + case BCH_DATA_OP_REREPLICATE: + bch2_move_stats_init(stats, "rereplicate"); + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, -1); + + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + rereplicate_btree_pred, c, stats) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + + ret = bch2_move_data(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + rereplicate_pred, c) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + break; + case BCH_DATA_OP_MIGRATE: + if (op.migrate.dev >= c->sb.nr_devices) + return -EINVAL; + + bch2_move_stats_init(stats, "migrate"); + stats->data_type = BCH_DATA_journal; + ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); + + ret = bch2_move_btree(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + migrate_btree_pred, &op, stats) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + + ret = bch2_move_data(c, + op.start_btree, op.start_pos, + op.end_btree, op.end_pos, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + migrate_pred, &op) ?: ret; + ret = bch2_replicas_gc2(c) ?: ret; + break; + case BCH_DATA_OP_REWRITE_OLD_NODES: + bch2_move_stats_init(stats, "rewrite_old_nodes"); + ret = bch2_scan_old_btree_nodes(c, stats); + break; + default: + ret = -EINVAL; + } + + return ret; +} + +void bch2_data_jobs_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_move_stats *stats; + + mutex_lock(&c->data_progress_lock); + list_for_each_entry(stats, &c->data_progress_list, list) { + prt_printf(out, "%s: data type %s btree_id %s position: ", + stats->name, + bch2_data_types[stats->data_type], + bch2_btree_ids[stats->btree_id]); + bch2_bpos_to_text(out, stats->pos); + prt_printf(out, "%s", "\n"); + } + mutex_unlock(&c->data_progress_lock); +} + +static void bch2_moving_ctxt_to_text(struct printbuf *out, struct moving_context *ctxt) +{ + struct moving_io *io; + + prt_printf(out, "%ps:", ctxt->fn); + prt_newline(out); + printbuf_indent_add(out, 2); + + prt_printf(out, "reads: %u sectors %u", + atomic_read(&ctxt->read_ios), + atomic_read(&ctxt->read_sectors)); + prt_newline(out); + + prt_printf(out, "writes: %u sectors %u", + atomic_read(&ctxt->write_ios), + atomic_read(&ctxt->write_sectors)); + prt_newline(out); + + printbuf_indent_add(out, 2); + + mutex_lock(&ctxt->lock); + list_for_each_entry(io, &ctxt->ios, io_list) { + bch2_write_op_to_text(out, &io->write.op); + } + mutex_unlock(&ctxt->lock); + + printbuf_indent_sub(out, 4); +} + +void bch2_fs_moving_ctxts_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct moving_context *ctxt; + + mutex_lock(&c->moving_context_lock); + list_for_each_entry(ctxt, &c->moving_context_list, list) + bch2_moving_ctxt_to_text(out, ctxt); + mutex_unlock(&c->moving_context_lock); +} + +void bch2_fs_move_init(struct bch_fs *c) +{ + INIT_LIST_HEAD(&c->moving_context_list); + mutex_init(&c->moving_context_lock); + + INIT_LIST_HEAD(&c->data_progress_list); + mutex_init(&c->data_progress_lock); +} diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h new file mode 100644 index 000000000..547ee7b72 --- /dev/null +++ b/fs/bcachefs/move.h @@ -0,0 +1,96 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVE_H +#define _BCACHEFS_MOVE_H + +#include "btree_iter.h" +#include "buckets.h" +#include "data_update.h" +#include "move_types.h" + 
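+/*
+ * Data moves are driven by caller-supplied predicates (move_pred_fn below):
+ * for each extent, the predicate fills in struct data_update_opts and returns
+ * true if the extent should be rewritten. As a minimal sketch (the example_*
+ * names are hypothetical, not part of this interface), a predicate that
+ * rewrites every replica living on one device could look like:
+ *
+ *	static bool example_drop_dev_pred(struct bch_fs *c, void *arg,
+ *					  struct bkey_s_c k,
+ *					  struct bch_io_opts *io_opts,
+ *					  struct data_update_opts *data_opts)
+ *	{
+ *		struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ *		const struct bch_extent_ptr *ptr;
+ *		unsigned *example_dev = arg, i = 0;
+ *
+ *		data_opts->rewrite_ptrs = 0;
+ *		bkey_for_each_ptr(ptrs, ptr) {
+ *			if (ptr->dev == *example_dev)
+ *				data_opts->rewrite_ptrs |= 1U << i;
+ *			i++;
+ *		}
+ *		return data_opts->rewrite_ptrs != 0;
+ *	}
+ *
+ * Such a predicate is then passed to bch2_move_data() along with a rate
+ * limiter, stats, and a write point - see migrate_pred() in move.c for the
+ * real version of this pattern.
+ */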
+struct bch_read_bio; + +struct moving_context { + struct bch_fs *c; + struct list_head list; + void *fn; + + struct bch_ratelimit *rate; + struct bch_move_stats *stats; + struct write_point_specifier wp; + bool wait_on_copygc; + bool write_error; + + /* For waiting on outstanding reads and writes: */ + struct closure cl; + + struct mutex lock; + struct list_head reads; + struct list_head ios; + + /* in flight sectors: */ + atomic_t read_sectors; + atomic_t write_sectors; + atomic_t read_ios; + atomic_t write_ios; + + wait_queue_head_t wait; +}; + +#define move_ctxt_wait_event(_ctxt, _trans, _cond) \ +do { \ + bool cond_finished = false; \ + bch2_moving_ctxt_do_pending_writes(_ctxt, _trans); \ + \ + if (_cond) \ + break; \ + __wait_event((_ctxt)->wait, \ + bch2_moving_ctxt_next_pending_write(_ctxt) || \ + (cond_finished = (_cond))); \ + if (cond_finished) \ + break; \ +} while (1) + +typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, + struct bch_io_opts *, struct data_update_opts *); + +void bch2_moving_ctxt_exit(struct moving_context *); +void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *, + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool); +struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *); +void bch2_moving_ctxt_do_pending_writes(struct moving_context *, + struct btree_trans *); + +int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); + +int bch2_move_data(struct bch_fs *, + enum btree_id, struct bpos, + enum btree_id, struct bpos, + struct bch_ratelimit *, + struct bch_move_stats *, + struct write_point_specifier, + bool, + move_pred_fn, void *); + +int __bch2_evacuate_bucket(struct btree_trans *, + struct moving_context *, + struct move_bucket_in_flight *, + struct bpos, int, + struct data_update_opts); +int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int, + struct data_update_opts, + struct bch_ratelimit *, + struct bch_move_stats *, + struct write_point_specifier, + bool); +int bch2_data_job(struct bch_fs *, + struct bch_move_stats *, + struct bch_ioctl_data); + +void bch2_move_stats_init(struct bch_move_stats *stats, char *name); +void bch2_data_jobs_to_text(struct printbuf *, struct bch_fs *); +void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *); + +void bch2_fs_move_init(struct bch_fs *); + +#endif /* _BCACHEFS_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h new file mode 100644 index 000000000..baf1f8570 --- /dev/null +++ b/fs/bcachefs/move_types.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVE_TYPES_H +#define _BCACHEFS_MOVE_TYPES_H + +struct bch_move_stats { + enum bch_data_type data_type; + enum btree_id btree_id; + struct bpos pos; + struct list_head list; + char name[32]; + + atomic64_t keys_moved; + atomic64_t keys_raced; + atomic64_t sectors_moved; + atomic64_t sectors_seen; + atomic64_t sectors_raced; +}; + +struct move_bucket_key { + struct bpos bucket; + u8 gen; +}; + +struct move_bucket { + struct move_bucket_key k; + unsigned sectors; +}; + +struct move_bucket_in_flight { + struct move_bucket_in_flight *next; + struct rhash_head hash; + struct move_bucket bucket; + atomic_t count; +}; + +#endif /* _BCACHEFS_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c new file mode 100644 index 000000000..5242f20bb --- /dev/null +++ b/fs/bcachefs/movinggc.c @@ -0,0 +1,421 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Moving/copying garbage 
collector + * + * Copyright 2012 Google, Inc. + */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "btree_iter.h" +#include "btree_update.h" +#include "btree_write_buffer.h" +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" +#include "errcode.h" +#include "error.h" +#include "extents.h" +#include "eytzinger.h" +#include "io.h" +#include "keylist.h" +#include "lru.h" +#include "move.h" +#include "movinggc.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include +#include + +struct buckets_in_flight { + struct rhashtable table; + struct move_bucket_in_flight *first; + struct move_bucket_in_flight *last; + size_t nr; + size_t sectors; +}; + +static const struct rhashtable_params bch_move_bucket_params = { + .head_offset = offsetof(struct move_bucket_in_flight, hash), + .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .key_len = sizeof(struct move_bucket_key), +}; + +static struct move_bucket_in_flight * +move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) +{ + struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); + int ret; + + if (!new) + return ERR_PTR(-ENOMEM); + + new->bucket = b; + + ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, + bch_move_bucket_params); + if (ret) { + kfree(new); + return ERR_PTR(ret); + } + + if (!list->first) + list->first = new; + else + list->last->next = new; + + list->last = new; + list->nr++; + list->sectors += b.sectors; + return new; +} + +static int bch2_bucket_is_movable(struct btree_trans *trans, + struct move_bucket *b, u64 time) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a; + int ret; + + if (bch2_bucket_is_open(trans->c, + b->k.bucket.inode, + b->k.bucket.offset)) + return 0; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_CACHED); + ret = bkey_err(k); + if (ret) + return ret; + + a = bch2_alloc_to_v4(k, &_a); + b->k.gen = a->gen; + b->sectors = a->dirty_sectors; + + ret = data_type_movable(a->data_type) && + a->fragmentation_lru && + a->fragmentation_lru <= time; + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static void move_buckets_wait(struct btree_trans *trans, + struct moving_context *ctxt, + struct buckets_in_flight *list, + bool flush) +{ + struct move_bucket_in_flight *i; + int ret; + + while ((i = list->first)) { + if (flush) + move_ctxt_wait_event(ctxt, trans, !atomic_read(&i->count)); + + if (atomic_read(&i->count)) + break; + + list->first = i->next; + if (!list->first) + list->last = NULL; + + list->nr--; + list->sectors -= i->bucket.sectors; + + ret = rhashtable_remove_fast(&list->table, &i->hash, + bch_move_bucket_params); + BUG_ON(ret); + kfree(i); + } + + bch2_trans_unlock(trans); +} + +static bool bucket_in_flight(struct buckets_in_flight *list, + struct move_bucket_key k) +{ + return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); +} + +typedef DARRAY(struct move_bucket) move_buckets; + +static int bch2_copygc_get_buckets(struct btree_trans *trans, + struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight, + move_buckets *buckets) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + size_t nr_to_get = max(16UL, buckets_in_flight->nr / 4); + size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0; + int ret; + + move_buckets_wait(trans, ctxt, buckets_in_flight, 
false); + + ret = bch2_btree_write_buffer_flush(trans); + if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()", + __func__, bch2_err_str(ret))) + return ret; + + ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru, + lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), + lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + 0, k, ({ + struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; + int ret = 0; + + saw++; + + if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p))) + not_movable++; + else if (bucket_in_flight(buckets_in_flight, b.k)) + in_flight++; + else { + ret = darray_push(buckets, b) ?: buckets->nr >= nr_to_get; + if (ret >= 0) + sectors += b.sectors; + } + ret; + })); + + pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", + buckets_in_flight->nr, buckets_in_flight->sectors, + saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); + + return ret < 0 ? ret : 0; +} + +noinline +static int bch2_copygc(struct btree_trans *trans, + struct moving_context *ctxt, + struct buckets_in_flight *buckets_in_flight) +{ + struct bch_fs *c = trans->c; + struct data_update_opts data_opts = { + .btree_insert_flags = BCH_WATERMARK_copygc, + }; + move_buckets buckets = { 0 }; + struct move_bucket_in_flight *f; + struct move_bucket *i; + u64 moved = atomic64_read(&ctxt->stats->sectors_moved); + int ret = 0; + + ret = bch2_copygc_get_buckets(trans, ctxt, buckets_in_flight, &buckets); + if (ret) + goto err; + + darray_for_each(buckets, i) { + if (unlikely(freezing(current))) + break; + + f = move_bucket_in_flight_add(buckets_in_flight, *i); + ret = PTR_ERR_OR_ZERO(f); + if (ret == -EEXIST) /* rare race: copygc_get_buckets returned same bucket more than once */ + continue; + if (ret == -ENOMEM) { /* flush IO, continue later */ + ret = 0; + break; + } + + ret = __bch2_evacuate_bucket(trans, ctxt, f, f->bucket.k.bucket, + f->bucket.k.gen, data_opts); + if (ret) + goto err; + } +err: + darray_exit(&buckets); + + /* no entries in LRU btree found, or got to end: */ + if (bch2_err_matches(ret, ENOENT)) + ret = 0; + + if (ret < 0 && !bch2_err_matches(ret, EROFS)) + bch_err(c, "error from bch2_move_data() in copygc: %s", bch2_err_str(ret)); + + moved = atomic64_read(&ctxt->stats->sectors_moved) - moved; + trace_and_count(c, copygc, c, moved, 0, 0, 0); + return ret; +} + +/* + * Copygc runs when the amount of fragmented data is above some arbitrary + * threshold: + * + * The threshold at the limit - when the device is full - is the amount of space + * we reserved in bch2_recalc_capacity; we can't have more than that amount of + * disk space stranded due to fragmentation and store everything we have + * promised to store. + * + * But we don't want to be running copygc unnecessarily when the device still + * has plenty of free space - rather, we want copygc to smoothly run every so + * often and continually reduce the amount of fragmented space as the device + * fills up. So, we increase the threshold by half the current free space. 
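+ *
+ * As an illustrative example (made-up numbers): if a device has 1000 free
+ * buckets of 1024 sectors each, half of that free space - roughly 512k
+ * sectors - may be stranded as fragmented data before copygc kicks in. If
+ * only 100k sectors are currently fragmented, bch2_copygc_wait_amount()
+ * below returns ~412k sectors, and the copygc thread sleeps until roughly
+ * that many more sectors have been written (measured by the write I/O
+ * clock). The filesystem-wide wait is the smallest such value across all
+ * rw member devices.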
+ */ +unsigned long bch2_copygc_wait_amount(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned dev_idx; + s64 wait = S64_MAX, fragmented_allowed, fragmented; + unsigned i; + + for_each_rw_member(ca, c, dev_idx) { + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + + fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * + ca->mi.bucket_size) >> 1); + fragmented = 0; + + for (i = 0; i < BCH_DATA_NR; i++) + if (data_type_movable(i)) + fragmented += usage.d[i].fragmented; + + wait = min(wait, max(0LL, fragmented_allowed - fragmented)); + } + + return wait; +} + +void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) +{ + prt_printf(out, "Currently waiting for: "); + prt_human_readable_u64(out, max(0LL, c->copygc_wait - + atomic64_read(&c->io_clock[WRITE].now)) << 9); + prt_newline(out); + + prt_printf(out, "Currently waiting since: "); + prt_human_readable_u64(out, max(0LL, + atomic64_read(&c->io_clock[WRITE].now) - + c->copygc_wait_at) << 9); + prt_newline(out); + + prt_printf(out, "Currently calculated wait: "); + prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); + prt_newline(out); +} + +static int bch2_copygc_thread(void *arg) +{ + struct bch_fs *c = arg; + struct btree_trans trans; + struct moving_context ctxt; + struct bch_move_stats move_stats; + struct io_clock *clock = &c->io_clock[WRITE]; + struct buckets_in_flight move_buckets; + u64 last, wait; + int ret = 0; + + memset(&move_buckets, 0, sizeof(move_buckets)); + + ret = rhashtable_init(&move_buckets.table, &bch_move_bucket_params); + if (ret) { + bch_err(c, "error allocating copygc buckets in flight: %s", + bch2_err_str(ret)); + return ret; + } + + set_freezable(); + bch2_trans_init(&trans, c, 0, 0); + + bch2_move_stats_init(&move_stats, "copygc"); + bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, + writepoint_ptr(&c->copygc_write_point), + false); + + while (!ret && !kthread_should_stop()) { + bch2_trans_unlock(&trans); + cond_resched(); + + if (!c->copy_gc_enabled) { + move_buckets_wait(&trans, &ctxt, &move_buckets, true); + kthread_wait_freezable(c->copy_gc_enabled); + } + + if (unlikely(freezing(current))) { + move_buckets_wait(&trans, &ctxt, &move_buckets, true); + __refrigerator(false); + continue; + } + + last = atomic64_read(&clock->now); + wait = bch2_copygc_wait_amount(c); + + if (wait > clock->max_slop) { + c->copygc_wait_at = last; + c->copygc_wait = last + wait; + move_buckets_wait(&trans, &ctxt, &move_buckets, true); + trace_and_count(c, copygc_wait, c, wait, last + wait); + bch2_kthread_io_clock_wait(clock, last + wait, + MAX_SCHEDULE_TIMEOUT); + continue; + } + + c->copygc_wait = 0; + + c->copygc_running = true; + ret = bch2_copygc(&trans, &ctxt, &move_buckets); + c->copygc_running = false; + + wake_up(&c->copygc_running_wq); + } + + move_buckets_wait(&trans, &ctxt, &move_buckets, true); + rhashtable_destroy(&move_buckets.table); + bch2_trans_exit(&trans); + bch2_moving_ctxt_exit(&ctxt); + + return 0; +} + +void bch2_copygc_stop(struct bch_fs *c) +{ + if (c->copygc_thread) { + kthread_stop(c->copygc_thread); + put_task_struct(c->copygc_thread); + } + c->copygc_thread = NULL; +} + +int bch2_copygc_start(struct bch_fs *c) +{ + struct task_struct *t; + int ret; + + if (c->copygc_thread) + return 0; + + if (c->opts.nochanges) + return 0; + + if (bch2_fs_init_fault("copygc_start")) + return -ENOMEM; + + t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name); + ret = PTR_ERR_OR_ZERO(t); + if (ret) { + bch_err(c, "error creating copygc thread: %s", 
bch2_err_str(ret)); + return ret; + } + + get_task_struct(t); + + c->copygc_thread = t; + wake_up_process(c->copygc_thread); + + return 0; +} + +void bch2_fs_copygc_init(struct bch_fs *c) +{ + init_waitqueue_head(&c->copygc_running_wq); + c->copygc_running = false; +} diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h new file mode 100644 index 000000000..ea181fef5 --- /dev/null +++ b/fs/bcachefs/movinggc.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_MOVINGGC_H +#define _BCACHEFS_MOVINGGC_H + +unsigned long bch2_copygc_wait_amount(struct bch_fs *); +void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); + +void bch2_copygc_stop(struct bch_fs *); +int bch2_copygc_start(struct bch_fs *); +void bch2_fs_copygc_init(struct bch_fs *); + +#endif /* _BCACHEFS_MOVINGGC_H */ diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c new file mode 100644 index 000000000..396357cd8 --- /dev/null +++ b/fs/bcachefs/nocow_locking.c @@ -0,0 +1,123 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "nocow_locking.h" +#include "util.h" + +#include + +bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *t, struct bpos bucket) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + unsigned i; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket && atomic_read(&l->l[i])) + return true; + return false; +} + +#define sign(v) (v < 0 ? -1 : v > 0 ? 1 : 0) + +void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *t, struct bpos bucket, int flags) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + int lock_val = flags ? 1 : -1; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket) { + BUG_ON(sign(atomic_read(&l->l[i])) != lock_val); + + if (!atomic_sub_return(lock_val, &l->l[i])) + closure_wake_up(&l->wait); + return; + } + + BUG(); +} + +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) +{ + int v, lock_val = flags ? 1 : -1; + unsigned i; + + spin_lock(&l->lock); + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (l->b[i] == dev_bucket) + goto got_entry; + + for (i = 0; i < ARRAY_SIZE(l->b); i++) + if (!atomic_read(&l->l[i])) { + l->b[i] = dev_bucket; + goto take_lock; + } +fail: + spin_unlock(&l->lock); + return false; +got_entry: + v = atomic_read(&l->l[i]); + if (lock_val > 0 ? 
v < 0 : v > 0) + goto fail; +take_lock: + atomic_add(lock_val, &l->l[i]); + spin_unlock(&l->lock); + return true; +} + +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct nocow_lock_bucket *l, + u64 dev_bucket, int flags) +{ + if (!__bch2_bucket_nocow_trylock(l, dev_bucket, flags)) { + struct bch_fs *c = container_of(t, struct bch_fs, nocow_locks); + u64 start_time = local_clock(); + + __closure_wait_event(&l->wait, __bch2_bucket_nocow_trylock(l, dev_bucket, flags)); + bch2_time_stats_update(&c->times[BCH_TIME_nocow_lock_contended], start_time); + } +} + +void bch2_nocow_locks_to_text(struct printbuf *out, struct bucket_nocow_lock_table *t) +{ + unsigned i, nr_zero = 0; + struct nocow_lock_bucket *l; + + for (l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) { + unsigned v = 0; + + for (i = 0; i < ARRAY_SIZE(l->l); i++) + v |= atomic_read(&l->l[i]); + + if (!v) { + nr_zero++; + continue; + } + + if (nr_zero) + prt_printf(out, "(%u empty entries)\n", nr_zero); + nr_zero = 0; + + for (i = 0; i < ARRAY_SIZE(l->l); i++) + if (atomic_read(&l->l[i])) + prt_printf(out, "%llu: %i ", l->b[i], atomic_read(&l->l[i])); + prt_newline(out); + } + + if (nr_zero) + prt_printf(out, "(%u empty entries)\n", nr_zero); +} + +int bch2_fs_nocow_locking_init(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->nocow_locks.l); i++) + spin_lock_init(&c->nocow_locks.l[i].lock); + + return 0; +} diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h new file mode 100644 index 000000000..ff8e4af52 --- /dev/null +++ b/fs/bcachefs/nocow_locking.h @@ -0,0 +1,49 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_NOCOW_LOCKING_H +#define _BCACHEFS_NOCOW_LOCKING_H + +#include "bcachefs.h" +#include "alloc_background.h" +#include "nocow_locking_types.h" + +#include + +static inline struct nocow_lock_bucket *bucket_nocow_lock(struct bucket_nocow_lock_table *t, + u64 dev_bucket) +{ + unsigned h = hash_64(dev_bucket, BUCKET_NOCOW_LOCKS_BITS); + + return t->l + (h & (BUCKET_NOCOW_LOCKS - 1)); +} + +#define BUCKET_NOCOW_LOCK_UPDATE (1 << 0) + +bool bch2_bucket_nocow_is_locked(struct bucket_nocow_lock_table *, struct bpos); +void bch2_bucket_nocow_unlock(struct bucket_nocow_lock_table *, struct bpos, int); +bool __bch2_bucket_nocow_trylock(struct nocow_lock_bucket *, u64, int); +void __bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *, + struct nocow_lock_bucket *, u64, int); + +static inline void bch2_bucket_nocow_lock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + + __bch2_bucket_nocow_lock(t, l, dev_bucket, flags); +} + +static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, + struct bpos bucket, int flags) +{ + u64 dev_bucket = bucket_to_u64(bucket); + struct nocow_lock_bucket *l = bucket_nocow_lock(t, dev_bucket); + + return __bch2_bucket_nocow_trylock(l, dev_bucket, flags); +} + +void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); + +int bch2_fs_nocow_locking_init(struct bch_fs *); + +#endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/nocow_locking_types.h b/fs/bcachefs/nocow_locking_types.h new file mode 100644 index 000000000..bd12bf677 --- /dev/null +++ b/fs/bcachefs/nocow_locking_types.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_NOCOW_LOCKING_TYPES_H +#define _BCACHEFS_NOCOW_LOCKING_TYPES_H + +#define 
BUCKET_NOCOW_LOCKS_BITS 10 +#define BUCKET_NOCOW_LOCKS (1U << BUCKET_NOCOW_LOCKS_BITS) + +struct nocow_lock_bucket { + struct closure_waitlist wait; + spinlock_t lock; + u64 b[4]; + atomic_t l[4]; +} __aligned(SMP_CACHE_BYTES); + +struct bucket_nocow_lock_table { + struct nocow_lock_bucket l[BUCKET_NOCOW_LOCKS]; +}; + +#endif /* _BCACHEFS_NOCOW_LOCKING_TYPES_H */ + diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c new file mode 100644 index 000000000..9dcc61ee5 --- /dev/null +++ b/fs/bcachefs/opts.c @@ -0,0 +1,592 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "bcachefs.h" +#include "compress.h" +#include "disk_groups.h" +#include "error.h" +#include "opts.h" +#include "super-io.h" +#include "util.h" + +#define x(t, n) [n] = #t, + +const char * const bch2_error_actions[] = { + BCH_ERROR_ACTIONS() + NULL +}; + +const char * const bch2_fsck_fix_opts[] = { + BCH_FIX_ERRORS_OPTS() + NULL +}; + +const char * const bch2_version_upgrade_opts[] = { + BCH_VERSION_UPGRADE_OPTS() + NULL +}; + +const char * const bch2_sb_features[] = { + BCH_SB_FEATURES() + NULL +}; + +const char * const bch2_sb_compat[] = { + BCH_SB_COMPAT() + NULL +}; + +const char * const bch2_btree_ids[] = { + BCH_BTREE_IDS() + "interior btree node", + NULL +}; + +const char * const bch2_csum_types[] = { + BCH_CSUM_TYPES() + NULL +}; + +const char * const bch2_csum_opts[] = { + BCH_CSUM_OPTS() + NULL +}; + +const char * const bch2_compression_types[] = { + BCH_COMPRESSION_TYPES() + NULL +}; + +const char * const bch2_compression_opts[] = { + BCH_COMPRESSION_OPTS() + NULL +}; + +const char * const bch2_str_hash_types[] = { + BCH_STR_HASH_TYPES() + NULL +}; + +const char * const bch2_str_hash_opts[] = { + BCH_STR_HASH_OPTS() + NULL +}; + +const char * const bch2_data_types[] = { + BCH_DATA_TYPES() + NULL +}; + +const char * const bch2_member_states[] = { + BCH_MEMBER_STATES() + NULL +}; + +const char * const bch2_jset_entry_types[] = { + BCH_JSET_ENTRY_TYPES() + NULL +}; + +const char * const bch2_fs_usage_types[] = { + BCH_FS_USAGE_TYPES() + NULL +}; + +#undef x + +int bch2_opt_fix_errors_parse(struct bch_fs *c, const char *val, u64 *res, + struct printbuf *err) +{ + if (!val) { + *res = FSCK_FIX_yes; + } else { + int ret = match_string(bch2_fsck_fix_opts, -1, val); + + if (ret < 0 && err) + prt_str(err, "fix_errors: invalid selection"); + if (ret < 0) + return ret; + *res = ret; + } + + return 0; +} + +void bch2_opt_fix_errors_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_sb *sb, + u64 v) +{ + prt_str(out, bch2_fsck_fix_opts[v]); +} + +static const struct bch_opt_fn bch2_opt_fix_errors = { + .parse = bch2_opt_fix_errors_parse, + .to_text = bch2_opt_fix_errors_to_text, +}; + +const char * const bch2_d_types[BCH_DT_MAX] = { + [DT_UNKNOWN] = "unknown", + [DT_FIFO] = "fifo", + [DT_CHR] = "chr", + [DT_DIR] = "dir", + [DT_BLK] = "blk", + [DT_REG] = "reg", + [DT_LNK] = "lnk", + [DT_SOCK] = "sock", + [DT_WHT] = "whiteout", + [DT_SUBVOL] = "subvol", +}; + +u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) +{ + BUG(); +} + +void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) +{ + BUG(); +} + +void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) +{ +#define x(_name, ...) \ + if (opt_defined(src, _name)) \ + opt_set(*dst, _name, src._name); + + BCH_OPTS() +#undef x +} + +bool bch2_opt_defined_by_id(const struct bch_opts *opts, enum bch_opt_id id) +{ + switch (id) { +#define x(_name, ...) 
\ + case Opt_##_name: \ + return opt_defined(*opts, _name); + BCH_OPTS() +#undef x + default: + BUG(); + } +} + +u64 bch2_opt_get_by_id(const struct bch_opts *opts, enum bch_opt_id id) +{ + switch (id) { +#define x(_name, ...) \ + case Opt_##_name: \ + return opts->_name; + BCH_OPTS() +#undef x + default: + BUG(); + } +} + +void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) +{ + switch (id) { +#define x(_name, ...) \ + case Opt_##_name: \ + opt_set(*opts, _name, v); \ + break; + BCH_OPTS() +#undef x + default: + BUG(); + } +} + +const struct bch_option bch2_opt_table[] = { +#define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 +#define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ + .min = _min, .max = _max +#define OPT_STR(_choices) .type = BCH_OPT_STR, \ + .min = 0, .max = ARRAY_SIZE(_choices), \ + .choices = _choices +#define OPT_FN(_fn) .type = BCH_OPT_FN, .fn = _fn + +#define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ + [Opt_##_name] = { \ + .attr = { \ + .name = #_name, \ + .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ + }, \ + .flags = _flags, \ + .hint = _hint, \ + .help = _help, \ + .get_sb = _sb_opt, \ + .set_sb = SET_##_sb_opt, \ + _type \ + }, + + BCH_OPTS() +#undef x +}; + +int bch2_opt_lookup(const char *name) +{ + const struct bch_option *i; + + for (i = bch2_opt_table; + i < bch2_opt_table + ARRAY_SIZE(bch2_opt_table); + i++) + if (!strcmp(name, i->attr.name)) + return i - bch2_opt_table; + + return -1; +} + +struct synonym { + const char *s1, *s2; +}; + +static const struct synonym bch_opt_synonyms[] = { + { "quota", "usrquota" }, +}; + +static int bch2_mount_opt_lookup(const char *name) +{ + const struct synonym *i; + + for (i = bch_opt_synonyms; + i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + i++) + if (!strcmp(name, i->s1)) + name = i->s2; + + return bch2_opt_lookup(name); +} + +int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) +{ + if (v < opt->min) { + if (err) + prt_printf(err, "%s: too small (min %llu)", + opt->attr.name, opt->min); + return -ERANGE; + } + + if (opt->max && v >= opt->max) { + if (err) + prt_printf(err, "%s: too big (max %llu)", + opt->attr.name, opt->max); + return -ERANGE; + } + + if ((opt->flags & OPT_SB_FIELD_SECTORS) && (v & 511)) { + if (err) + prt_printf(err, "%s: not a multiple of 512", + opt->attr.name); + return -EINVAL; + } + + if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) { + if (err) + prt_printf(err, "%s: must be a power of two", + opt->attr.name); + return -EINVAL; + } + + return 0; +} + +int bch2_opt_parse(struct bch_fs *c, + const struct bch_option *opt, + const char *val, u64 *res, + struct printbuf *err) +{ + ssize_t ret; + + switch (opt->type) { + case BCH_OPT_BOOL: + if (val) { + ret = kstrtou64(val, 10, res); + } else { + ret = 0; + *res = 1; + } + + if (ret < 0 || (*res != 0 && *res != 1)) { + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; + } + break; + case BCH_OPT_UINT: + if (!val) { + prt_printf(err, "%s: required value", + opt->attr.name); + return -EINVAL; + } + + ret = opt->flags & OPT_HUMAN_READABLE + ? 
bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); + if (ret < 0) { + if (err) + prt_printf(err, "%s: must be a number", + opt->attr.name); + return ret; + } + break; + case BCH_OPT_STR: + if (!val) { + prt_printf(err, "%s: required value", + opt->attr.name); + return -EINVAL; + } + + ret = match_string(opt->choices, -1, val); + if (ret < 0) { + if (err) + prt_printf(err, "%s: invalid selection", + opt->attr.name); + return ret; + } + + *res = ret; + break; + case BCH_OPT_FN: + ret = opt->fn.parse(c, val, res, err); + if (ret < 0) { + if (err) + prt_printf(err, "%s: parse error", + opt->attr.name); + return ret; + } + } + + return bch2_opt_validate(opt, *res, err); +} + +void bch2_opt_to_text(struct printbuf *out, + struct bch_fs *c, struct bch_sb *sb, + const struct bch_option *opt, u64 v, + unsigned flags) +{ + if (flags & OPT_SHOW_MOUNT_STYLE) { + if (opt->type == BCH_OPT_BOOL) { + prt_printf(out, "%s%s", + v ? "" : "no", + opt->attr.name); + return; + } + + prt_printf(out, "%s=", opt->attr.name); + } + + switch (opt->type) { + case BCH_OPT_BOOL: + case BCH_OPT_UINT: + if (opt->flags & OPT_HUMAN_READABLE) + prt_human_readable_u64(out, v); + else + prt_printf(out, "%lli", v); + break; + case BCH_OPT_STR: + if (flags & OPT_SHOW_FULL_LIST) + prt_string_option(out, opt->choices, v); + else + prt_str(out, opt->choices[v]); + break; + case BCH_OPT_FN: + opt->fn.to_text(out, c, sb, v); + break; + default: + BUG(); + } +} + +int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) +{ + int ret = 0; + + switch (id) { + case Opt_compression: + case Opt_background_compression: + ret = bch2_check_set_has_compressed_data(c, v); + break; + case Opt_erasure_code: + if (v) + bch2_check_set_feature(c, BCH_FEATURE_ec); + break; + } + + return ret; +} + +int bch2_opts_check_may_set(struct bch_fs *c) +{ + unsigned i; + int ret; + + for (i = 0; i < bch2_opts_nr; i++) { + ret = bch2_opt_check_may_set(c, i, + bch2_opt_get_by_id(&c->opts, i)); + if (ret) + return ret; + } + + return 0; +} + +int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, + char *options) +{ + char *copied_opts, *copied_opts_start; + char *opt, *name, *val; + int ret, id; + struct printbuf err = PRINTBUF; + u64 v; + + if (!options) + return 0; + + copied_opts = kstrdup(options, GFP_KERNEL); + if (!copied_opts) + return -1; + copied_opts_start = copied_opts; + + while ((opt = strsep(&copied_opts, ",")) != NULL) { + name = strsep(&opt, "="); + val = opt; + + id = bch2_mount_opt_lookup(name); + + /* Check for the form "noopt", negation of a boolean opt: */ + if (id < 0 && + !val && + !strncmp("no", name, 2)) { + id = bch2_mount_opt_lookup(name + 2); + val = "0"; + } + + if (id < 0) + goto bad_opt; + + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) + goto bad_opt; + + if (id == Opt_acl && + !IS_ENABLED(CONFIG_BCACHEFS_POSIX_ACL)) + goto bad_opt; + + if ((id == Opt_usrquota || + id == Opt_grpquota) && + !IS_ENABLED(CONFIG_BCACHEFS_QUOTA)) + goto bad_opt; + + ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); + if (ret < 0) + goto bad_val; + + bch2_opt_set_by_id(opts, id, v); + } + + ret = 0; + goto out; + +bad_opt: + pr_err("Bad mount option %s", name); + ret = -1; + goto out; +bad_val: + pr_err("Invalid mount option %s", err.buf); + ret = -1; + goto out; +out: + kfree(copied_opts_start); + printbuf_exit(&err); + return ret; +} + +u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) +{ + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + v = opt->get_sb(sb); + + if (opt->flags & 
OPT_SB_FIELD_ILOG2) + v = 1ULL << v; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v <<= 9; + + return v; +} + +/* + * Initial options from superblock - here we don't want any options undefined, + * any options the superblock doesn't specify are set to 0: + */ +int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) +{ + unsigned id; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; + + if (opt->get_sb == BCH2_NO_SB_OPT) + continue; + + bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); + } + + return 0; +} + +void __bch2_opt_set_sb(struct bch_sb *sb, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_BCH2_NO_SB_OPT) + return; + + if (opt->flags & OPT_SB_FIELD_SECTORS) + v >>= 9; + + if (opt->flags & OPT_SB_FIELD_ILOG2) + v = ilog2(v); + + opt->set_sb(sb, v); +} + +void bch2_opt_set_sb(struct bch_fs *c, const struct bch_option *opt, u64 v) +{ + if (opt->set_sb == SET_BCH2_NO_SB_OPT) + return; + + mutex_lock(&c->sb_lock); + __bch2_opt_set_sb(c->disk_sb.sb, opt, v); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + +/* io opts: */ + +struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts src) +{ + return (struct bch_io_opts) { +#define x(_name, _bits) ._name = src._name, + BCH_INODE_OPTS() +#undef x + }; +} + +bool bch2_opt_is_inode_opt(enum bch_opt_id id) +{ + static const enum bch_opt_id inode_opt_list[] = { +#define x(_name, _bits) Opt_##_name, + BCH_INODE_OPTS() +#undef x + }; + unsigned i; + + for (i = 0; i < ARRAY_SIZE(inode_opt_list); i++) + if (inode_opt_list[i] == id) + return true; + + return false; +} diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h new file mode 100644 index 000000000..8a9db110d --- /dev/null +++ b/fs/bcachefs/opts.h @@ -0,0 +1,563 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_OPTS_H +#define _BCACHEFS_OPTS_H + +#include +#include +#include +#include +#include "bcachefs_format.h" + +struct bch_fs; + +extern const char * const bch2_error_actions[]; +extern const char * const bch2_fsck_fix_opts[]; +extern const char * const bch2_version_upgrade_opts[]; +extern const char * const bch2_sb_features[]; +extern const char * const bch2_sb_compat[]; +extern const char * const bch2_btree_ids[]; +extern const char * const bch2_csum_types[]; +extern const char * const bch2_csum_opts[]; +extern const char * const bch2_compression_types[]; +extern const char * const bch2_compression_opts[]; +extern const char * const bch2_str_hash_types[]; +extern const char * const bch2_str_hash_opts[]; +extern const char * const bch2_data_types[]; +extern const char * const bch2_member_states[]; +extern const char * const bch2_jset_entry_types[]; +extern const char * const bch2_fs_usage_types[]; +extern const char * const bch2_d_types[]; + +static inline const char *bch2_d_type_str(unsigned d_type) +{ + return (d_type < BCH_DT_MAX ? bch2_d_types[d_type] : NULL) ?: "(bad d_type)"; +} + +/* + * Mount options; we also store defaults in the superblock. + * + * Also exposed via sysfs: if an option is writeable, and it's also stored in + * the superblock, changing it via sysfs (currently? might change this) also + * updates the superblock. + * + * We store options as signed integers, where -1 means undefined. This means we + * can pass the mount options to bch2_fs_alloc() as a whole struct, and then only + * apply the options from that struct that are defined. 
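+ *
+ * For example (an illustrative sketch using the helpers declared further down
+ * in this header):
+ *
+ *   struct bch_opts opts = bch2_opts_empty();
+ *
+ *   opt_set(opts, read_only, true);
+ *
+ * leaves every other option undefined, so bch2_opts_apply() copies only
+ * read_only into the destination and leaves the rest untouched.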
+ */ + +/* dummy option, for options that aren't stored in the superblock */ +u64 BCH2_NO_SB_OPT(const struct bch_sb *); +void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); + +/* When can be set: */ +enum opt_flags { + OPT_FS = (1 << 0), /* Filesystem option */ + OPT_DEVICE = (1 << 1), /* Device option */ + OPT_INODE = (1 << 2), /* Inode option */ + OPT_FORMAT = (1 << 3), /* May be specified at format time */ + OPT_MOUNT = (1 << 4), /* May be specified at mount time */ + OPT_RUNTIME = (1 << 5), /* May be specified at runtime */ + OPT_HUMAN_READABLE = (1 << 6), + OPT_MUST_BE_POW_2 = (1 << 7), /* Must be power of 2 */ + OPT_SB_FIELD_SECTORS = (1 << 8),/* Superblock field is >> 9 of actual value */ + OPT_SB_FIELD_ILOG2 = (1 << 9), /* Superblock field is ilog2 of actual value */ +}; + +enum opt_type { + BCH_OPT_BOOL, + BCH_OPT_UINT, + BCH_OPT_STR, + BCH_OPT_FN, +}; + +struct bch_opt_fn { + int (*parse)(struct bch_fs *, const char *, u64 *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_fs *, struct bch_sb *, u64); +}; + +/** + * x(name, shortopt, type, in mem type, mode, sb_opt) + * + * @name - name of mount option, sysfs attribute, and struct bch_opts + * member + * + * @mode - when opt may be set + * + * @sb_option - name of corresponding superblock option + * + * @type - one of OPT_BOOL, OPT_UINT, OPT_STR + */ + +/* + * XXX: add fields for + * - default value + * - helptext + */ + +#ifdef __KERNEL__ +#define RATELIMIT_ERRORS_DEFAULT true +#else +#define RATELIMIT_ERRORS_DEFAULT false +#endif + +#ifdef CONFIG_BCACHEFS_DEBUG +#define BCACHEFS_VERBOSE_DEFAULT true +#else +#define BCACHEFS_VERBOSE_DEFAULT false +#endif + +#define BCH_FIX_ERRORS_OPTS() \ + x(exit, 0) \ + x(yes, 1) \ + x(no, 2) \ + x(ask, 3) + +enum fsck_err_opts { +#define x(t, n) FSCK_FIX_##t, + BCH_FIX_ERRORS_OPTS() +#undef x +}; + +#define BCH_OPTS() \ + x(block_size, u16, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 16), \ + BCH_SB_BLOCK_SIZE, 8, \ + "size", NULL) \ + x(btree_node_size, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(512, 1U << 20), \ + BCH_SB_BTREE_NODE_SIZE, 512, \ + "size", "Btree node size, default 256k") \ + x(errors, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_error_actions), \ + BCH_SB_ERROR_ACTION, BCH_ON_ERROR_ro, \ + NULL, "Action to take on filesystem error") \ + x(metadata_replicas, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_WANT, 1, \ + "#", "Number of metadata replicas") \ + x(data_replicas, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_WANT, 1, \ + "#", "Number of data replicas") \ + x(metadata_replicas_required, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_META_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(data_replicas_required, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(1, BCH_REPLICAS_MAX), \ + BCH_SB_DATA_REPLICAS_REQ, 1, \ + "#", NULL) \ + x(encoded_extent_max, u32, \ + OPT_FS|OPT_FORMAT| \ + OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS|OPT_SB_FIELD_ILOG2,\ + OPT_UINT(4096, 2U << 20), \ + BCH_SB_ENCODED_EXTENT_MAX_BITS, 64 << 10, \ + "size", "Maximum size of checksummed/compressed extents")\ + x(metadata_checksum, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_META_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + 
NULL, NULL) \ + x(data_checksum, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_csum_opts), \ + BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ + NULL, NULL) \ + x(compression, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_compression), \ + BCH_SB_COMPRESSION_TYPE, BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(background_compression, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_compression), \ + BCH_SB_BACKGROUND_COMPRESSION_TYPE,BCH_COMPRESSION_OPT_none, \ + NULL, NULL) \ + x(str_hash, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_STR(bch2_str_hash_opts), \ + BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ + NULL, "Hash function for directory entries and xattrs")\ + x(metadata_target, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_METADATA_TARGET, 0, \ + "(target)", "Device or label for metadata writes") \ + x(foreground_target, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_FOREGROUND_TARGET, 0, \ + "(target)", "Device or label for foreground writes") \ + x(background_target, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_BACKGROUND_TARGET, 0, \ + "(target)", "Device or label to move data to in the background")\ + x(promote_target, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FN(bch2_opt_target), \ + BCH_SB_PROMOTE_TARGET, 0, \ + "(target)", "Device or label to promote data to on read") \ + x(erasure_code, u16, \ + OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_ERASURE_CODE, false, \ + NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(inodes_32bit, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_INODE_32BIT, true, \ + NULL, "Constrain inode numbers to 32 bits") \ + x(shard_inode_numbers, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_SHARD_INUMS, true, \ + NULL, "Shard new inode numbers by CPU id") \ + x(inodes_use_key_cache, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_INODES_USE_KEY_CACHE, true, \ + NULL, "Use the btree key cache for the inodes btree") \ + x(btree_node_mem_ptr_optimization, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Stash pointer to in memory btree node in btree ptr")\ + x(btree_write_buffer_size, u32, \ + OPT_FS|OPT_MOUNT, \ + OPT_UINT(16, (1U << 20) - 1), \ + BCH2_NO_SB_OPT, 1U << 13, \ + NULL, "Number of btree write buffer entries") \ + x(gc_reserve_percent, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(5, 21), \ + BCH_SB_GC_RESERVE, 8, \ + "%", "Percentage of disk space to reserve for copygc")\ + x(gc_reserve_bytes, u64, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME| \ + OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ + OPT_UINT(0, U64_MAX), \ + BCH_SB_GC_RESERVE_BYTES, 0, \ + "%", "Amount of disk space to reserve for copygc\n" \ + "Takes precedence over gc_reserve_percent if set")\ + x(root_reserve_percent, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_UINT(0, 100), \ + BCH_SB_ROOT_RESERVE, 0, \ + "%", "Percentage of disk space to reserve for superuser")\ + x(wide_macs, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_128_BIT_MACS, false, \ + NULL, "Store full 128 bits of cryptographic MACs, instead of 80")\ + x(inline_data, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + 
BCH2_NO_SB_OPT, true, \ + NULL, "Enable inline data extents") \ + x(acl, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_POSIX_ACL, true, \ + NULL, "Enable POSIX acls") \ + x(usrquota, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_USRQUOTA, false, \ + NULL, "Enable user quotas") \ + x(grpquota, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_GRPQUOTA, false, \ + NULL, "Enable group quotas") \ + x(prjquota, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH_SB_PRJQUOTA, false, \ + NULL, "Enable project quotas") \ + x(degraded, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in degraded mode") \ + x(very_degraded, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Allow mounting in when data will be missing") \ + x(discard, u8, \ + OPT_FS|OPT_MOUNT|OPT_DEVICE, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable discard/TRIM support") \ + x(verbose, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, BCACHEFS_VERBOSE_DEFAULT, \ + NULL, "Extra debugging information during mount/recovery")\ + x(journal_flush_delay, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, U32_MAX), \ + BCH_SB_JOURNAL_FLUSH_DELAY, 1000, \ + NULL, "Delay in milliseconds before automatic journal commits")\ + x(journal_flush_disabled, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_FLUSH_DISABLED,false, \ + NULL, "Disable journal flush on sync/fsync\n" \ + "If enabled, writes can be lost, but only since the\n"\ + "last journal write (default 1 second)") \ + x(journal_reclaim_delay, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, U32_MAX), \ + BCH_SB_JOURNAL_RECLAIM_DELAY, 100, \ + NULL, "Delay in milliseconds before automatic journal reclaim")\ + x(move_bytes_in_flight, u32, \ + OPT_HUMAN_READABLE|OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1024, U32_MAX), \ + BCH2_NO_SB_OPT, 1U << 20, \ + NULL, "Maximum Amount of IO to keep in flight by the move path")\ + x(move_ios_in_flight, u32, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 1024), \ + BCH2_NO_SB_OPT, 32, \ + NULL, "Maximum number of IOs to keep in flight by the move path")\ + x(fsck, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Run fsck on mount") \ + x(fix_errors, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_FN(bch2_opt_fix_errors), \ + BCH2_NO_SB_OPT, FSCK_FIX_exit, \ + NULL, "Fix errors during fsck without asking") \ + x(ratelimit_errors, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, RATELIMIT_ERRORS_DEFAULT, \ + NULL, "Ratelimit error messages during fsck") \ + x(nochanges, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Super read only mode - no writes at all will be issued,\n"\ + "even if we have to replay the journal") \ + x(norecovery, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't replay the journal") \ + x(keep_journal, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't free journal entries/keys after startup")\ + x(read_entire_journal, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Read all journal entries, not just dirty ones")\ + x(read_journal_only, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Only read the journal, skip the rest of recovery")\ + x(journal_transaction_names, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_JOURNAL_TRANSACTION_NAMES, true, \ + 
NULL, "Log transaction function names in journal") \ + x(noexcl, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don't open device in exclusive mode") \ + x(direct_io, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Use O_DIRECT (userspace only)") \ + x(sb, u64, \ + OPT_MOUNT, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, BCH_SB_SECTOR, \ + "offset", "Sector offset of superblock") \ + x(read_only, u8, \ + OPT_FS, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(nostart, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Don\'t start filesystem, only open devices") \ + x(reconstruct_alloc, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Reconstruct alloc btree") \ + x(version_upgrade, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_STR(bch2_version_upgrade_opts), \ + BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ + NULL, "Set superblock to latest version,\n" \ + "allowing any new features to be used") \ + x(buckets_nouse, u8, \ + 0, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Allocate the buckets_nouse bitmap") \ + x(project, u8, \ + OPT_INODE, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, NULL) \ + x(nocow, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME|OPT_INODE, \ + OPT_BOOL(), \ + BCH_SB_NOCOW, false, \ + NULL, "Nocow mode: Writes will be done in place when possible.\n"\ + "Snapshots and reflink will still caused writes to be COW\n"\ + "Implicitly disables data checksumming, compression and encryption")\ + x(nocow_enabled, u8, \ + OPT_FS|OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable nocow mode: enables runtime locking in\n"\ + "data move path needed if nocow will ever be in use\n")\ + x(no_data_io, u8, \ + OPT_MOUNT, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, false, \ + NULL, "Skip submit_bio() for data reads and writes, " \ + "for performance testing purposes") \ + x(fs_size, u64, \ + OPT_DEVICE, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(bucket, u32, \ + OPT_DEVICE, \ + OPT_UINT(0, S64_MAX), \ + BCH2_NO_SB_OPT, 0, \ + "size", "Size of filesystem on device") \ + x(durability, u8, \ + OPT_DEVICE, \ + OPT_UINT(0, BCH_REPLICAS_MAX), \ + BCH2_NO_SB_OPT, 1, \ + "n", "Data written to this device will be considered\n"\ + "to have already been replicated n times") + +struct bch_opts { +#define x(_name, _bits, ...) unsigned _name##_defined:1; + BCH_OPTS() +#undef x + +#define x(_name, _bits, ...) _bits _name; + BCH_OPTS() +#undef x +}; + +static const struct bch_opts bch2_opts_default = { +#define x(_name, _bits, _mode, _type, _sb_opt, _default, ...) \ + ._name##_defined = true, \ + ._name = _default, \ + + BCH_OPTS() +#undef x +}; + +#define opt_defined(_opts, _name) ((_opts)._name##_defined) + +#define opt_get(_opts, _name) \ + (opt_defined(_opts, _name) ? (_opts)._name : bch2_opts_default._name) + +#define opt_set(_opts, _name, _v) \ +do { \ + (_opts)._name##_defined = true; \ + (_opts)._name = _v; \ +} while (0) + +static inline struct bch_opts bch2_opts_empty(void) +{ + return (struct bch_opts) { 0 }; +} + +void bch2_opts_apply(struct bch_opts *, struct bch_opts); + +enum bch_opt_id { +#define x(_name, ...) 
Opt_##_name, + BCH_OPTS() +#undef x + bch2_opts_nr +}; + +struct bch_fs; +struct printbuf; + +struct bch_option { + struct attribute attr; + u64 (*get_sb)(const struct bch_sb *); + void (*set_sb)(struct bch_sb *, u64); + enum opt_type type; + enum opt_flags flags; + u64 min, max; + + const char * const *choices; + + struct bch_opt_fn fn; + + const char *hint; + const char *help; + +}; + +extern const struct bch_option bch2_opt_table[]; + +bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); +u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); +void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); + +u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); +int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); +void __bch2_opt_set_sb(struct bch_sb *, const struct bch_option *, u64); +void bch2_opt_set_sb(struct bch_fs *, const struct bch_option *, u64); + +int bch2_opt_lookup(const char *); +int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); +int bch2_opt_parse(struct bch_fs *, const struct bch_option *, + const char *, u64 *, struct printbuf *); + +#define OPT_SHOW_FULL_LIST (1 << 0) +#define OPT_SHOW_MOUNT_STYLE (1 << 1) + +void bch2_opt_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, + const struct bch_option *, u64, unsigned); + +int bch2_opt_check_may_set(struct bch_fs *, int, u64); +int bch2_opts_check_may_set(struct bch_fs *); +int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, char *); + +/* inode opts: */ + +struct bch_io_opts { +#define x(_name, _bits) u##_bits _name; + BCH_INODE_OPTS() +#undef x +}; + +struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts); +bool bch2_opt_is_inode_opt(enum bch_opt_id); + +#endif /* _BCACHEFS_OPTS_H */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c new file mode 100644 index 000000000..c41daa180 --- /dev/null +++ b/fs/bcachefs/printbuf.c @@ -0,0 +1,415 @@ +// SPDX-License-Identifier: LGPL-2.1+ +/* Copyright (C) 2022 Kent Overstreet */ + +#include +#include +#include +#include +#include + +#include "printbuf.h" + +static inline unsigned printbuf_linelen(struct printbuf *buf) +{ + return buf->pos - buf->last_newline; +} + +int bch2_printbuf_make_room(struct printbuf *out, unsigned extra) +{ + unsigned new_size; + char *buf; + + if (!out->heap_allocated) + return 0; + + /* Reserved space for terminating nul: */ + extra += 1; + + if (out->pos + extra < out->size) + return 0; + + new_size = roundup_pow_of_two(out->size + extra); + + /* + * Note: output buffer must be freeable with kfree(), it's not required + * that the user use printbuf_exit(). + */ + buf = krealloc(out->buf, new_size, !out->atomic ? GFP_KERNEL : GFP_NOWAIT); + + if (!buf) { + out->allocation_failure = true; + return -ENOMEM; + } + + out->buf = buf; + out->size = new_size; + return 0; +} + +void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args) +{ + int len; + + do { + va_list args2; + + va_copy(args2, args); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +void bch2_prt_printf(struct printbuf *out, const char *fmt, ...) 
+{ + va_list args; + int len; + + do { + va_start(args, fmt); + len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args); + va_end(args); + } while (len + 1 >= printbuf_remaining(out) && + !bch2_printbuf_make_room(out, len + 1)); + + len = min_t(size_t, len, + printbuf_remaining(out) ? printbuf_remaining(out) - 1 : 0); + out->pos += len; +} + +/** + * printbuf_str - returns printbuf's buf as a C string, guaranteed to be null + * terminated + */ +const char *bch2_printbuf_str(const struct printbuf *buf) +{ + /* + * If we've written to a printbuf then it's guaranteed to be a null + * terminated string - but if we haven't, then we might not have + * allocated a buffer at all: + */ + return buf->pos + ? buf->buf + : ""; +} + +/** + * printbuf_exit - exit a printbuf, freeing memory it owns and poisoning it + * against accidental use. + */ +void bch2_printbuf_exit(struct printbuf *buf) +{ + if (buf->heap_allocated) { + kfree(buf->buf); + buf->buf = ERR_PTR(-EINTR); /* poison value */ + } +} + +void bch2_printbuf_tabstops_reset(struct printbuf *buf) +{ + buf->nr_tabstops = 0; +} + +void bch2_printbuf_tabstop_pop(struct printbuf *buf) +{ + if (buf->nr_tabstops) + --buf->nr_tabstops; +} + +/* + * printbuf_tabstop_set - add a tabstop, n spaces from the previous tabstop + * + * @buf: printbuf to control + * @spaces: number of spaces from previous tabpstop + * + * In the future this function may allocate memory if setting more than + * PRINTBUF_INLINE_TABSTOPS or setting tabstops more than 255 spaces from start + * of line. + */ +int bch2_printbuf_tabstop_push(struct printbuf *buf, unsigned spaces) +{ + unsigned prev_tabstop = buf->nr_tabstops + ? buf->_tabstops[buf->nr_tabstops - 1] + : 0; + + if (WARN_ON(buf->nr_tabstops >= ARRAY_SIZE(buf->_tabstops))) + return -EINVAL; + + buf->_tabstops[buf->nr_tabstops++] = prev_tabstop + spaces; + buf->has_indent_or_tabstops = true; + return 0; +} + +/** + * printbuf_indent_add - add to the current indent level + * + * @buf: printbuf to control + * @spaces: number of spaces to add to the current indent level + * + * Subsequent lines, and the current line if the output position is at the start + * of the current line, will be indented by @spaces more spaces. + */ +void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) + spaces = 0; + + buf->indent += spaces; + prt_chars(buf, ' ', spaces); + + buf->has_indent_or_tabstops = true; +} + +/** + * printbuf_indent_sub - subtract from the current indent level + * + * @buf: printbuf to control + * @spaces: number of spaces to subtract from the current indent level + * + * Subsequent lines, and the current line if the output position is at the start + * of the current line, will be indented by @spaces less spaces. 
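+ *
+ * Typical usage (an illustrative sketch) pairs it with bch2_printbuf_indent_add():
+ *
+ *   bch2_printbuf_indent_add(out, 2);
+ *   ...print the nested fields...
+ *   bch2_printbuf_indent_sub(out, 2);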
+ */ +void bch2_printbuf_indent_sub(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(spaces > buf->indent)) + spaces = buf->indent; + + if (buf->last_newline + buf->indent == buf->pos) { + buf->pos -= spaces; + printbuf_nul_terminate(buf); + } + buf->indent -= spaces; + + if (!buf->indent && !buf->nr_tabstops) + buf->has_indent_or_tabstops = false; +} + +void bch2_prt_newline(struct printbuf *buf) +{ + unsigned i; + + bch2_printbuf_make_room(buf, 1 + buf->indent); + + __prt_char(buf, '\n'); + + buf->last_newline = buf->pos; + + for (i = 0; i < buf->indent; i++) + __prt_char(buf, ' '); + + printbuf_nul_terminate(buf); + + buf->last_field = buf->pos; + buf->cur_tabstop = 0; +} + +/* + * Returns spaces from start of line, if set, or 0 if unset: + */ +static inline unsigned cur_tabstop(struct printbuf *buf) +{ + return buf->cur_tabstop < buf->nr_tabstops + ? buf->_tabstops[buf->cur_tabstop] + : 0; +} + +static void __prt_tab(struct printbuf *out) +{ + int spaces = max_t(int, 0, cur_tabstop(out) - printbuf_linelen(out)); + + prt_chars(out, ' ', spaces); + + out->last_field = out->pos; + out->cur_tabstop++; +} + +/** + * prt_tab - Advance printbuf to the next tabstop + * + * @buf: printbuf to control + * + * Advance output to the next tabstop by printing spaces. + */ +void bch2_prt_tab(struct printbuf *out) +{ + if (WARN_ON(!cur_tabstop(out))) + return; + + __prt_tab(out); +} + +static void __prt_tab_rjust(struct printbuf *buf) +{ + unsigned move = buf->pos - buf->last_field; + int pad = (int) cur_tabstop(buf) - (int) printbuf_linelen(buf); + + if (pad > 0) { + bch2_printbuf_make_room(buf, pad); + + if (buf->last_field + pad < buf->size) + memmove(buf->buf + buf->last_field + pad, + buf->buf + buf->last_field, + min(move, buf->size - 1 - buf->last_field - pad)); + + if (buf->last_field < buf->size) + memset(buf->buf + buf->last_field, ' ', + min((unsigned) pad, buf->size - buf->last_field)); + + buf->pos += pad; + printbuf_nul_terminate(buf); + } + + buf->last_field = buf->pos; + buf->cur_tabstop++; +} + +/** + * prt_tab_rjust - Advance printbuf to the next tabstop, right justifying + * previous output + * + * @buf: printbuf to control + * + * Advance output to the next tabstop by inserting spaces immediately after the + * previous tabstop, right justifying previously outputted text. 
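+ *
+ * For example (illustrative): with a tabstop 10 spaces from the start of the
+ * line, printing "42" and then calling this inserts eight spaces before the
+ * "42", so the number ends at the tabstop rather than starting there.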
+ */ +void bch2_prt_tab_rjust(struct printbuf *buf) +{ + if (WARN_ON(!cur_tabstop(buf))) + return; + + __prt_tab_rjust(buf); +} + +/** + * prt_bytes_indented - Print an array of chars, handling embedded control characters + * + * @out: printbuf to output to + * @str: string to print + * @count: number of bytes to print + * + * The following control characters are handled as follows: + * \n: prt_newline newline that obeys current indent level + * \t: prt_tab advance to next tabstop + * \r: prt_tab_rjust advance to next tabstop, with right justification + */ +void bch2_prt_bytes_indented(struct printbuf *out, const char *str, unsigned count) +{ + const char *unprinted_start = str; + const char *end = str + count; + + if (!out->has_indent_or_tabstops || out->suppress_indent_tabstop_handling) { + prt_bytes(out, str, count); + return; + } + + while (str != end) { + switch (*str) { + case '\n': + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + bch2_prt_newline(out); + break; + case '\t': + if (likely(cur_tabstop(out))) { + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + __prt_tab(out); + } + break; + case '\r': + if (likely(cur_tabstop(out))) { + prt_bytes(out, unprinted_start, str - unprinted_start); + unprinted_start = str + 1; + __prt_tab_rjust(out); + } + break; + } + + str++; + } + + prt_bytes(out, unprinted_start, str - unprinted_start); +} + +/** + * prt_human_readable_u64 - Print out a u64 in human readable units + * + * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + */ +void bch2_prt_human_readable_u64(struct printbuf *buf, u64 v) +{ + bch2_printbuf_make_room(buf, 10); + buf->pos += string_get_size(v, 1, !buf->si_units, + buf->buf + buf->pos, + printbuf_remaining_size(buf)); +} + +/** + * prt_human_readable_s64 - Print out a s64 in human readable units + * + * Units of 2^10 (default) or 10^3 are controlled via @buf->si_units + */ +void bch2_prt_human_readable_s64(struct printbuf *buf, s64 v) +{ + if (v < 0) + prt_char(buf, '-'); + bch2_prt_human_readable_u64(buf, abs(v)); +} + +/** + * prt_units_u64 - Print out a u64 according to printbuf unit options + * + * Units are either raw (default), or human readable units (controlled via + * @buf->human_readable_units) + */ +void bch2_prt_units_u64(struct printbuf *out, u64 v) +{ + if (out->human_readable_units) + bch2_prt_human_readable_u64(out, v); + else + bch2_prt_printf(out, "%llu", v); +} + +/** + * prt_units_s64 - Print out a s64 according to printbuf unit options + * + * Units are either raw (default), or human readable units (controlled via + * @buf->human_readable_units) + */ +void bch2_prt_units_s64(struct printbuf *out, s64 v) +{ + if (v < 0) + prt_char(out, '-'); + bch2_prt_units_u64(out, abs(v)); +} + +void bch2_prt_string_option(struct printbuf *out, + const char * const list[], + size_t selected) +{ + size_t i; + + for (i = 0; list[i]; i++) + bch2_prt_printf(out, i == selected ?
"[%s] " : "%s ", list[i]); +} + +void bch2_prt_bitflags(struct printbuf *out, + const char * const list[], u64 flags) +{ + unsigned bit, nr = 0; + bool first = true; + + while (list[nr]) + nr++; + + while (flags && (bit = __ffs(flags)) < nr) { + if (!first) + bch2_prt_printf(out, ","); + first = false; + bch2_prt_printf(out, "%s", list[bit]); + flags ^= 1 << bit; + } +} diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h new file mode 100644 index 000000000..2191423d9 --- /dev/null +++ b/fs/bcachefs/printbuf.h @@ -0,0 +1,284 @@ +/* SPDX-License-Identifier: LGPL-2.1+ */ +/* Copyright (C) 2022 Kent Overstreet */ + +#ifndef _BCACHEFS_PRINTBUF_H +#define _BCACHEFS_PRINTBUF_H + +/* + * Printbufs: Simple strings for printing to, with optional heap allocation + * + * This code has provisions for use in userspace, to aid in making other code + * portable between kernelspace and userspace. + * + * Basic example: + * struct printbuf buf = PRINTBUF; + * + * prt_printf(&buf, "foo="); + * foo_to_text(&buf, foo); + * printk("%s", buf.buf); + * printbuf_exit(&buf); + * + * Or + * struct printbuf buf = PRINTBUF_EXTERN(char_buf, char_buf_size) + * + * We can now write pretty printers instead of writing code that dumps + * everything to the kernel log buffer, and then those pretty-printers can be + * used by other code that outputs to kernel log, sysfs, debugfs, etc. + * + * Memory allocation: Outputing to a printbuf may allocate memory. This + * allocation is done with GFP_KERNEL, by default: use the newer + * memalloc_*_(save|restore) functions as needed. + * + * Since no equivalent yet exists for GFP_ATOMIC/GFP_NOWAIT, memory allocations + * will be done with GFP_NOWAIT if printbuf->atomic is nonzero. + * + * It's allowed to grab the output buffer and free it later with kfree() instead + * of using printbuf_exit(), if the user just needs a heap allocated string at + * the end. + * + * Memory allocation failures: We don't return errors directly, because on + * memory allocation failure we usually don't want to bail out and unwind - we + * want to print what we've got, on a best-effort basis. But code that does want + * to return -ENOMEM may check printbuf.allocation_failure. + * + * Indenting, tabstops: + * + * To aid is writing multi-line pretty printers spread across multiple + * functions, printbufs track the current indent level. + * + * printbuf_indent_push() and printbuf_indent_pop() increase and decrease the current indent + * level, respectively. + * + * To use tabstops, set printbuf->tabstops[]; they are in units of spaces, from + * start of line. Once set, prt_tab() will output spaces up to the next tabstop. + * prt_tab_rjust() will also advance the current line of text up to the next + * tabstop, but it does so by shifting text since the previous tabstop up to the + * next tabstop - right justifying it. + * + * Make sure you use prt_newline() instead of \n in the format string for indent + * level and tabstops to work corretly. + * + * Output units: printbuf->units exists to tell pretty-printers how to output + * numbers: a raw value (e.g. directly from a superblock field), as bytes, or as + * human readable bytes. prt_units() obeys it. 
+ */ + +#include +#include + +enum printbuf_si { + PRINTBUF_UNITS_2, /* use binary powers of 2^10 */ + PRINTBUF_UNITS_10, /* use powers of 10^3 (standard SI) */ +}; + +#define PRINTBUF_INLINE_TABSTOPS 6 + +struct printbuf { + char *buf; + unsigned size; + unsigned pos; + unsigned last_newline; + unsigned last_field; + unsigned indent; + /* + * If nonzero, allocations will be done with GFP_ATOMIC: + */ + u8 atomic; + bool allocation_failure:1; + bool heap_allocated:1; + enum printbuf_si si_units:1; + bool human_readable_units:1; + bool has_indent_or_tabstops:1; + bool suppress_indent_tabstop_handling:1; + u8 nr_tabstops; + + /* + * Do not modify directly: use printbuf_tabstop_add(), + * printbuf_tabstop_get() + */ + u8 cur_tabstop; + u8 _tabstops[PRINTBUF_INLINE_TABSTOPS]; +}; + +int bch2_printbuf_make_room(struct printbuf *, unsigned); +__printf(2, 3) void bch2_prt_printf(struct printbuf *out, const char *fmt, ...); +__printf(2, 0) void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list); +const char *bch2_printbuf_str(const struct printbuf *); +void bch2_printbuf_exit(struct printbuf *); + +void bch2_printbuf_tabstops_reset(struct printbuf *); +void bch2_printbuf_tabstop_pop(struct printbuf *); +int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); + +void bch2_printbuf_indent_add(struct printbuf *, unsigned); +void bch2_printbuf_indent_sub(struct printbuf *, unsigned); + +void bch2_prt_newline(struct printbuf *); +void bch2_prt_tab(struct printbuf *); +void bch2_prt_tab_rjust(struct printbuf *); + +void bch2_prt_bytes_indented(struct printbuf *, const char *, unsigned); +void bch2_prt_human_readable_u64(struct printbuf *, u64); +void bch2_prt_human_readable_s64(struct printbuf *, s64); +void bch2_prt_units_u64(struct printbuf *, u64); +void bch2_prt_units_s64(struct printbuf *, s64); +void bch2_prt_string_option(struct printbuf *, const char * const[], size_t); +void bch2_prt_bitflags(struct printbuf *, const char * const[], u64); + +/* Initializer for a heap allocated printbuf: */ +#define PRINTBUF ((struct printbuf) { .heap_allocated = true }) + +/* Initializer a printbuf that points to an external buffer: */ +#define PRINTBUF_EXTERN(_buf, _size) \ +((struct printbuf) { \ + .buf = _buf, \ + .size = _size, \ +}) + +/* + * Returns size remaining of output buffer: + */ +static inline unsigned printbuf_remaining_size(struct printbuf *out) +{ + return out->pos < out->size ? out->size - out->pos : 0; +} + +/* + * Returns number of characters we can print to the output buffer - i.e. + * excluding the terminating nul: + */ +static inline unsigned printbuf_remaining(struct printbuf *out) +{ + return out->pos < out->size ? out->size - out->pos - 1 : 0; +} + +static inline unsigned printbuf_written(struct printbuf *out) +{ + return out->size ? 
min(out->pos, out->size - 1) : 0; +} + +/* + * Returns true if output was truncated: + */ +static inline bool printbuf_overflowed(struct printbuf *out) +{ + return out->pos >= out->size; +} + +static inline void printbuf_nul_terminate(struct printbuf *out) +{ + bch2_printbuf_make_room(out, 1); + + if (out->pos < out->size) + out->buf[out->pos] = 0; + else if (out->size) + out->buf[out->size - 1] = 0; +} + +/* Doesn't call bch2_printbuf_make_room(), doesn't nul terminate: */ +static inline void __prt_char_reserved(struct printbuf *out, char c) +{ + if (printbuf_remaining(out)) + out->buf[out->pos] = c; + out->pos++; +} + +/* Doesn't nul terminate: */ +static inline void __prt_char(struct printbuf *out, char c) +{ + bch2_printbuf_make_room(out, 1); + __prt_char_reserved(out, c); +} + +static inline void prt_char(struct printbuf *out, char c) +{ + __prt_char(out, c); + printbuf_nul_terminate(out); +} + +static inline void __prt_chars_reserved(struct printbuf *out, char c, unsigned n) +{ + unsigned i, can_print = min(n, printbuf_remaining(out)); + + for (i = 0; i < can_print; i++) + out->buf[out->pos++] = c; + out->pos += n - can_print; +} + +static inline void prt_chars(struct printbuf *out, char c, unsigned n) +{ + bch2_printbuf_make_room(out, n); + __prt_chars_reserved(out, c, n); + printbuf_nul_terminate(out); +} + +static inline void prt_bytes(struct printbuf *out, const void *b, unsigned n) +{ + unsigned i, can_print; + + bch2_printbuf_make_room(out, n); + + can_print = min(n, printbuf_remaining(out)); + + for (i = 0; i < can_print; i++) + out->buf[out->pos++] = ((char *) b)[i]; + out->pos += n - can_print; + + printbuf_nul_terminate(out); +} + +static inline void prt_str(struct printbuf *out, const char *str) +{ + prt_bytes(out, str, strlen(str)); +} + +static inline void prt_str_indented(struct printbuf *out, const char *str) +{ + bch2_prt_bytes_indented(out, str, strlen(str)); +} + +static inline void prt_hex_byte(struct printbuf *out, u8 byte) +{ + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, hex_asc_hi(byte)); + __prt_char_reserved(out, hex_asc_lo(byte)); + printbuf_nul_terminate(out); +} + +static inline void prt_hex_byte_upper(struct printbuf *out, u8 byte) +{ + bch2_printbuf_make_room(out, 2); + __prt_char_reserved(out, hex_asc_upper_hi(byte)); + __prt_char_reserved(out, hex_asc_upper_lo(byte)); + printbuf_nul_terminate(out); +} + +/** + * printbuf_reset - re-use a printbuf without freeing and re-initializing it: + */ +static inline void printbuf_reset(struct printbuf *buf) +{ + buf->pos = 0; + buf->allocation_failure = 0; + buf->indent = 0; + buf->nr_tabstops = 0; + buf->cur_tabstop = 0; +} + +/** + * printbuf_atomic_inc - mark as entering an atomic section + */ +static inline void printbuf_atomic_inc(struct printbuf *buf) +{ + buf->atomic++; +} + +/** + * printbuf_atomic_inc - mark as leaving an atomic section + */ +static inline void printbuf_atomic_dec(struct printbuf *buf) +{ + buf->atomic--; +} + +#endif /* _BCACHEFS_PRINTBUF_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c new file mode 100644 index 000000000..4f0654ff8 --- /dev/null +++ b/fs/bcachefs/quota.c @@ -0,0 +1,981 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "btree_update.h" +#include "errcode.h" +#include "error.h" +#include "inode.h" +#include "quota.h" +#include "subvolume.h" +#include "super-io.h" + +static const char * const bch2_quota_types[] = { + "user", + "group", + "project", +}; + +static const char * const bch2_quota_counters[] = { + "space", + 
"inodes", +}; + +static int bch2_sb_quota_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + + if (vstruct_bytes(&q->field) < sizeof(*q)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&q->field), sizeof(*q)); + return -BCH_ERR_invalid_sb_quota; + } + + return 0; +} + +static void bch2_sb_quota_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_quota *q = field_to_type(f, quota); + unsigned qtyp, counter; + + for (qtyp = 0; qtyp < ARRAY_SIZE(q->q); qtyp++) { + prt_printf(out, "%s: flags %llx", + bch2_quota_types[qtyp], + le64_to_cpu(q->q[qtyp].flags)); + + for (counter = 0; counter < Q_COUNTERS; counter++) + prt_printf(out, " %s timelimit %u warnlimit %u", + bch2_quota_counters[counter], + le32_to_cpu(q->q[qtyp].c[counter].timelimit), + le32_to_cpu(q->q[qtyp].c[counter].warnlimit)); + + prt_newline(out); + } +} + +const struct bch_sb_field_ops bch_sb_field_ops_quota = { + .validate = bch2_sb_quota_validate, + .to_text = bch2_sb_quota_to_text, +}; + +int bch2_quota_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (k.k->p.inode >= QTYP_NR) { + prt_printf(err, "invalid quota type (%llu >= %u)", + k.k->p.inode, QTYP_NR); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_quota_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_quota dq = bkey_s_c_to_quota(k); + unsigned i; + + for (i = 0; i < Q_COUNTERS; i++) + prt_printf(out, "%s hardlimit %llu softlimit %llu", + bch2_quota_counters[i], + le64_to_cpu(dq.v->c[i].hardlimit), + le64_to_cpu(dq.v->c[i].softlimit)); +} + +#ifdef CONFIG_BCACHEFS_QUOTA + +#include +#include +#include + +static void qc_info_to_text(struct printbuf *out, struct qc_info *i) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); + + prt_str(out, "i_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", i->i_fieldmask); + prt_newline(out); + + prt_str(out, "i_flags"); + prt_tab(out); + prt_printf(out, "%u", i->i_flags); + prt_newline(out); + + prt_str(out, "i_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_ino_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_timelimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_timelimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_timelimit); + prt_newline(out); + + prt_str(out, "i_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_spc_warnlimit); + prt_newline(out); + + prt_str(out, "i_ino_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_ino_warnlimit); + prt_newline(out); + + prt_str(out, "i_rt_spc_warnlimit"); + prt_tab(out); + prt_printf(out, "%u", i->i_rt_spc_warnlimit); + prt_newline(out); +} + +static void qc_dqblk_to_text(struct printbuf *out, struct qc_dqblk *q) +{ + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 20); + + prt_str(out, "d_fieldmask"); + prt_tab(out); + prt_printf(out, "%x", q->d_fieldmask); + prt_newline(out); + + prt_str(out, "d_spc_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_hardlimit); + prt_newline(out); + + prt_str(out, "d_spc_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_softlimit); + prt_newline(out); + + prt_str(out, "d_ino_hardlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_hardlimit); + prt_newline(out); + + prt_str(out, 
"d_ino_softlimit"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_softlimit); + prt_newline(out); + + prt_str(out, "d_space"); + prt_tab(out); + prt_printf(out, "%llu", q->d_space); + prt_newline(out); + + prt_str(out, "d_ino_count"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_count); + prt_newline(out); + + prt_str(out, "d_ino_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_ino_timer); + prt_newline(out); + + prt_str(out, "d_spc_timer"); + prt_tab(out); + prt_printf(out, "%llu", q->d_spc_timer); + prt_newline(out); + + prt_str(out, "d_ino_warns"); + prt_tab(out); + prt_printf(out, "%i", q->d_ino_warns); + prt_newline(out); + + prt_str(out, "d_spc_warns"); + prt_tab(out); + prt_printf(out, "%i", q->d_spc_warns); + prt_newline(out); +} + +static inline unsigned __next_qtype(unsigned i, unsigned qtypes) +{ + qtypes >>= i; + return qtypes ? i + __ffs(qtypes) : QTYP_NR; +} + +#define for_each_set_qtype(_c, _i, _q, _qtypes) \ + for (_i = 0; \ + (_i = __next_qtype(_i, _qtypes), \ + _q = &(_c)->quotas[_i], \ + _i < QTYP_NR); \ + _i++) + +static bool ignore_hardlimit(struct bch_memquota_type *q) +{ + if (capable(CAP_SYS_RESOURCE)) + return true; +#if 0 + struct mem_dqinfo *info = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; + + return capable(CAP_SYS_RESOURCE) && + (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || + !(info->dqi_flags & DQF_ROOT_SQUASH)); +#endif + return false; +} + +enum quota_msg { + SOFTWARN, /* Softlimit reached */ + SOFTLONGWARN, /* Grace time expired */ + HARDWARN, /* Hardlimit reached */ + + HARDBELOW, /* Usage got below inode hardlimit */ + SOFTBELOW, /* Usage got below inode softlimit */ +}; + +static int quota_nl[][Q_COUNTERS] = { + [HARDWARN][Q_SPC] = QUOTA_NL_BHARDWARN, + [SOFTLONGWARN][Q_SPC] = QUOTA_NL_BSOFTLONGWARN, + [SOFTWARN][Q_SPC] = QUOTA_NL_BSOFTWARN, + [HARDBELOW][Q_SPC] = QUOTA_NL_BHARDBELOW, + [SOFTBELOW][Q_SPC] = QUOTA_NL_BSOFTBELOW, + + [HARDWARN][Q_INO] = QUOTA_NL_IHARDWARN, + [SOFTLONGWARN][Q_INO] = QUOTA_NL_ISOFTLONGWARN, + [SOFTWARN][Q_INO] = QUOTA_NL_ISOFTWARN, + [HARDBELOW][Q_INO] = QUOTA_NL_IHARDBELOW, + [SOFTBELOW][Q_INO] = QUOTA_NL_ISOFTBELOW, +}; + +struct quota_msgs { + u8 nr; + struct { + u8 qtype; + u8 msg; + } m[QTYP_NR * Q_COUNTERS]; +}; + +static void prepare_msg(unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + BUG_ON(msgs->nr >= ARRAY_SIZE(msgs->m)); + + msgs->m[msgs->nr].qtype = qtype; + msgs->m[msgs->nr].msg = quota_nl[msg_type][counter]; + msgs->nr++; +} + +static void prepare_warning(struct memquota_counter *qc, + unsigned qtype, + enum quota_counters counter, + struct quota_msgs *msgs, + enum quota_msg msg_type) +{ + if (qc->warning_issued & (1 << msg_type)) + return; + + prepare_msg(qtype, counter, msgs, msg_type); +} + +static void flush_warnings(struct bch_qid qid, + struct super_block *sb, + struct quota_msgs *msgs) +{ + unsigned i; + + for (i = 0; i < msgs->nr; i++) + quota_send_warning(make_kqid(&init_user_ns, msgs->m[i].qtype, qid.q[i]), + sb->s_dev, msgs->m[i].msg); +} + +static int bch2_quota_check_limit(struct bch_fs *c, + unsigned qtype, + struct bch_memquota *mq, + struct quota_msgs *msgs, + enum quota_counters counter, + s64 v, + enum quota_acct_mode mode) +{ + struct bch_memquota_type *q = &c->quotas[qtype]; + struct memquota_counter *qc = &mq->c[counter]; + u64 n = qc->v + v; + + BUG_ON((s64) n < 0); + + if (mode == KEY_TYPE_QUOTA_NOCHECK) + return 0; + + if (v <= 0) { + if (n < qc->hardlimit && + (qc->warning_issued & (1 << HARDWARN))) { 
+ qc->warning_issued &= ~(1 << HARDWARN); + prepare_msg(qtype, counter, msgs, HARDBELOW); + } + + if (n < qc->softlimit && + (qc->warning_issued & (1 << SOFTWARN))) { + qc->warning_issued &= ~(1 << SOFTWARN); + prepare_msg(qtype, counter, msgs, SOFTBELOW); + } + + qc->warning_issued = 0; + return 0; + } + + if (qc->hardlimit && + qc->hardlimit < n && + !ignore_hardlimit(q)) { + prepare_warning(qc, qtype, counter, msgs, HARDWARN); + return -EDQUOT; + } + + if (qc->softlimit && + qc->softlimit < n) { + if (qc->timer == 0) { + qc->timer = ktime_get_real_seconds() + q->limits[counter].timelimit; + prepare_warning(qc, qtype, counter, msgs, SOFTWARN); + } else if (ktime_get_real_seconds() >= qc->timer && + !ignore_hardlimit(q)) { + prepare_warning(qc, qtype, counter, msgs, SOFTLONGWARN); + return -EDQUOT; + } + } + + return 0; +} + +int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, + enum quota_counters counter, s64 v, + enum quota_acct_mode mode) +{ + unsigned qtypes = enabled_qtypes(c); + struct bch_memquota_type *q; + struct bch_memquota *mq[QTYP_NR]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) { + mq[i] = genradix_ptr_alloc(&q->table, qid.q[i], GFP_KERNEL); + if (!mq[i]) + return -ENOMEM; + } + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock_nested(&q->lock, i); + + for_each_set_qtype(c, i, q, qtypes) { + ret = bch2_quota_check_limit(c, i, mq[i], &msgs, counter, v, mode); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) + mq[i]->c[counter].v += v; +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(qid, c->vfs_sb, &msgs); + + return ret; +} + +static void __bch2_quota_transfer(struct bch_memquota *src_q, + struct bch_memquota *dst_q, + enum quota_counters counter, s64 v) +{ + BUG_ON(v > src_q->c[counter].v); + BUG_ON(v + dst_q->c[counter].v < v); + + src_q->c[counter].v -= v; + dst_q->c[counter].v += v; +} + +int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, + struct bch_qid dst, + struct bch_qid src, u64 space, + enum quota_acct_mode mode) +{ + struct bch_memquota_type *q; + struct bch_memquota *src_q[3], *dst_q[3]; + struct quota_msgs msgs; + unsigned i; + int ret = 0; + + qtypes &= enabled_qtypes(c); + + memset(&msgs, 0, sizeof(msgs)); + + for_each_set_qtype(c, i, q, qtypes) { + src_q[i] = genradix_ptr_alloc(&q->table, src.q[i], GFP_KERNEL); + dst_q[i] = genradix_ptr_alloc(&q->table, dst.q[i], GFP_KERNEL); + if (!src_q[i] || !dst_q[i]) + return -ENOMEM; + } + + for_each_set_qtype(c, i, q, qtypes) + mutex_lock_nested(&q->lock, i); + + for_each_set_qtype(c, i, q, qtypes) { + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_SPC, + dst_q[i]->c[Q_SPC].v + space, + mode); + if (ret) + goto err; + + ret = bch2_quota_check_limit(c, i, dst_q[i], &msgs, Q_INO, + dst_q[i]->c[Q_INO].v + 1, + mode); + if (ret) + goto err; + } + + for_each_set_qtype(c, i, q, qtypes) { + __bch2_quota_transfer(src_q[i], dst_q[i], Q_SPC, space); + __bch2_quota_transfer(src_q[i], dst_q[i], Q_INO, 1); + } + +err: + for_each_set_qtype(c, i, q, qtypes) + mutex_unlock(&q->lock); + + flush_warnings(dst, c->vfs_sb, &msgs); + + return ret; +} + +static int __bch2_quota_set(struct bch_fs *c, struct bkey_s_c k, + struct qc_dqblk *qdq) +{ + struct bkey_s_c_quota dq; + struct bch_memquota_type *q; + struct bch_memquota *mq; + unsigned i; + + BUG_ON(k.k->p.inode >= QTYP_NR); + + if (!((1U << k.k->p.inode) & enabled_qtypes(c))) + return 0; + + switch (k.k->type) { + case 
KEY_TYPE_quota: + dq = bkey_s_c_to_quota(k); + q = &c->quotas[k.k->p.inode]; + + mutex_lock(&q->lock); + mq = genradix_ptr_alloc(&q->table, k.k->p.offset, GFP_KERNEL); + if (!mq) { + mutex_unlock(&q->lock); + return -ENOMEM; + } + + for (i = 0; i < Q_COUNTERS; i++) { + mq->c[i].hardlimit = le64_to_cpu(dq.v->c[i].hardlimit); + mq->c[i].softlimit = le64_to_cpu(dq.v->c[i].softlimit); + } + + if (qdq && qdq->d_fieldmask & QC_SPC_TIMER) + mq->c[Q_SPC].timer = qdq->d_spc_timer; + if (qdq && qdq->d_fieldmask & QC_SPC_WARNS) + mq->c[Q_SPC].warns = qdq->d_spc_warns; + if (qdq && qdq->d_fieldmask & QC_INO_TIMER) + mq->c[Q_INO].timer = qdq->d_ino_timer; + if (qdq && qdq->d_fieldmask & QC_INO_WARNS) + mq->c[Q_INO].warns = qdq->d_ino_warns; + + mutex_unlock(&q->lock); + } + + return 0; +} + +void bch2_fs_quota_exit(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + genradix_free(&c->quotas[i].table); +} + +void bch2_fs_quota_init(struct bch_fs *c) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(c->quotas); i++) + mutex_init(&c->quotas[i].lock); +} + +static struct bch_sb_field_quota *bch2_sb_get_or_create_quota(struct bch_sb_handle *sb) +{ + struct bch_sb_field_quota *sb_quota = bch2_sb_get_quota(sb->sb); + + if (sb_quota) + return sb_quota; + + sb_quota = bch2_sb_resize_quota(sb, sizeof(*sb_quota) / sizeof(u64)); + if (sb_quota) { + unsigned qtype, qc; + + for (qtype = 0; qtype < QTYP_NR; qtype++) + for (qc = 0; qc < Q_COUNTERS; qc++) + sb_quota->q[qtype].c[qc].timelimit = + cpu_to_le32(7 * 24 * 60 * 60); + } + + return sb_quota; +} + +static void bch2_sb_quota_read(struct bch_fs *c) +{ + struct bch_sb_field_quota *sb_quota; + unsigned i, j; + + sb_quota = bch2_sb_get_quota(c->disk_sb.sb); + if (!sb_quota) + return; + + for (i = 0; i < QTYP_NR; i++) { + struct bch_memquota_type *q = &c->quotas[i]; + + for (j = 0; j < Q_COUNTERS; j++) { + q->limits[j].timelimit = + le32_to_cpu(sb_quota->q[i].c[j].timelimit); + q->limits[j].warnlimit = + le32_to_cpu(sb_quota->q[i].c[j].warnlimit); + } + } +} + +static int bch2_fs_quota_read_inode(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bch_inode_unpacked u; + struct bch_snapshot_tree s_t; + int ret; + + ret = bch2_snapshot_tree_lookup(trans, + bch2_snapshot_tree(c, k.k->p.snapshot), &s_t); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, + snapshot_t(c, k.k->p.snapshot)->tree); + if (ret) + return ret; + + if (!s_t.master_subvol) + goto advance; + + ret = bch2_inode_find_by_inum_trans(trans, + (subvol_inum) { + le32_to_cpu(s_t.master_subvol), + k.k->p.offset, + }, &u); + /* + * Inode might be deleted in this snapshot - the easiest way to handle + * that is to just skip it here: + */ + if (bch2_err_matches(ret, ENOENT)) + goto advance; + + if (ret) + return ret; + + bch2_quota_acct(c, bch_qid(&u), Q_SPC, u.bi_sectors, + KEY_TYPE_QUOTA_NOCHECK); + bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, + KEY_TYPE_QUOTA_NOCHECK); +advance: + bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + return 0; +} + +int bch2_fs_quota_read(struct bch_fs *c) +{ + struct bch_sb_field_quota *sb_quota; + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + mutex_unlock(&c->sb_lock); + return -BCH_ERR_ENOSPC_sb_quota; + } + + bch2_sb_quota_read(c); + mutex_unlock(&c->sb_lock); + + 
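+	/*
+	 * Two passes: first load the on-disk quota keys into the in-memory
+	 * tables, then walk the inodes btree (across all snapshots) to rebuild
+	 * per-qid space and inode usage:
+	 */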
bch2_trans_init(&trans, c, 0, 0); + + ret = for_each_btree_key2(&trans, iter, BTREE_ID_quotas, + POS_MIN, BTREE_ITER_PREFETCH, k, + __bch2_quota_set(c, k, NULL)) ?: + for_each_btree_key2(&trans, iter, BTREE_ID_inodes, + POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + bch2_fs_quota_read_inode(&trans, &iter, k)); + + bch2_trans_exit(&trans); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* Enable/disable/delete quotas for an entire filesystem: */ + +static int bch2_quota_enable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + int ret = 0; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + /* Accounting must be enabled at mount time: */ + if (uflags & (FS_QUOTA_UDQ_ACCT|FS_QUOTA_GDQ_ACCT|FS_QUOTA_PDQ_ACCT)) + return -EINVAL; + + /* Can't enable enforcement without accounting: */ + if ((uflags & FS_QUOTA_UDQ_ENFD) && !c->opts.usrquota) + return -EINVAL; + + if ((uflags & FS_QUOTA_GDQ_ENFD) && !c->opts.grpquota) + return -EINVAL; + + if (uflags & FS_QUOTA_PDQ_ENFD && !c->opts.prjquota) + return -EINVAL; + + mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; + } + + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, true); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, true); + + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, true); + + bch2_write_super(c); +unlock: + mutex_unlock(&c->sb_lock); + + return bch2_err_class(ret); +} + +static int bch2_quota_disable(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + mutex_lock(&c->sb_lock); + if (uflags & FS_QUOTA_UDQ_ENFD) + SET_BCH_SB_USRQUOTA(c->disk_sb.sb, false); + + if (uflags & FS_QUOTA_GDQ_ENFD) + SET_BCH_SB_GRPQUOTA(c->disk_sb.sb, false); + + if (uflags & FS_QUOTA_PDQ_ENFD) + SET_BCH_SB_PRJQUOTA(c->disk_sb.sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +} + +static int bch2_quota_remove(struct super_block *sb, unsigned uflags) +{ + struct bch_fs *c = sb->s_fs_info; + int ret; + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + if (uflags & FS_USER_QUOTA) { + if (c->opts.usrquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_USR, 0), + POS(QTYP_USR, U64_MAX), + 0, NULL); + if (ret) + return ret; + } + + if (uflags & FS_GROUP_QUOTA) { + if (c->opts.grpquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_GRP, 0), + POS(QTYP_GRP, U64_MAX), + 0, NULL); + if (ret) + return ret; + } + + if (uflags & FS_PROJ_QUOTA) { + if (c->opts.prjquota) + return -EINVAL; + + ret = bch2_btree_delete_range(c, BTREE_ID_quotas, + POS(QTYP_PRJ, 0), + POS(QTYP_PRJ, U64_MAX), + 0, NULL); + if (ret) + return ret; + } + + return 0; +} + +/* + * Return quota status information, such as enforcements, quota file inode + * numbers etc. 
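+ * (bcachefs keeps quotas in a btree rather than in quota files, so every
+ * quota type is reported with the QCI_SYSFILE flag below).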
+ */ +static int bch2_quota_get_state(struct super_block *sb, struct qc_state *state) +{ + struct bch_fs *c = sb->s_fs_info; + unsigned qtypes = enabled_qtypes(c); + unsigned i; + + memset(state, 0, sizeof(*state)); + + for (i = 0; i < QTYP_NR; i++) { + state->s_state[i].flags |= QCI_SYSFILE; + + if (!(qtypes & (1 << i))) + continue; + + state->s_state[i].flags |= QCI_ACCT_ENABLED; + + state->s_state[i].spc_timelimit = c->quotas[i].limits[Q_SPC].timelimit; + state->s_state[i].spc_warnlimit = c->quotas[i].limits[Q_SPC].warnlimit; + + state->s_state[i].ino_timelimit = c->quotas[i].limits[Q_INO].timelimit; + state->s_state[i].ino_warnlimit = c->quotas[i].limits[Q_INO].warnlimit; + } + + return 0; +} + +/* + * Adjust quota timers & warnings + */ +static int bch2_quota_set_info(struct super_block *sb, int type, + struct qc_info *info) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_sb_field_quota *sb_quota; + struct bch_memquota_type *q; + int ret = 0; + + if (0) { + struct printbuf buf = PRINTBUF; + + qc_info_to_text(&buf, info); + pr_info("setting:\n%s", buf.buf); + printbuf_exit(&buf); + } + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + if (type >= QTYP_NR) + return -EINVAL; + + if (!((1 << type) & enabled_qtypes(c))) + return -ESRCH; + + if (info->i_fieldmask & + ~(QC_SPC_TIMER|QC_INO_TIMER|QC_SPC_WARNS|QC_INO_WARNS)) + return -EINVAL; + + q = &c->quotas[type]; + + mutex_lock(&c->sb_lock); + sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); + if (!sb_quota) { + ret = -BCH_ERR_ENOSPC_sb_quota; + goto unlock; + } + + if (info->i_fieldmask & QC_SPC_TIMER) + sb_quota->q[type].c[Q_SPC].timelimit = + cpu_to_le32(info->i_spc_timelimit); + + if (info->i_fieldmask & QC_SPC_WARNS) + sb_quota->q[type].c[Q_SPC].warnlimit = + cpu_to_le32(info->i_spc_warnlimit); + + if (info->i_fieldmask & QC_INO_TIMER) + sb_quota->q[type].c[Q_INO].timelimit = + cpu_to_le32(info->i_ino_timelimit); + + if (info->i_fieldmask & QC_INO_WARNS) + sb_quota->q[type].c[Q_INO].warnlimit = + cpu_to_le32(info->i_ino_warnlimit); + + bch2_sb_quota_read(c); + + bch2_write_super(c); +unlock: + mutex_unlock(&c->sb_lock); + + return bch2_err_class(ret); +} + +/* Get/set individual quotas: */ + +static void __bch2_quota_get(struct qc_dqblk *dst, struct bch_memquota *src) +{ + dst->d_space = src->c[Q_SPC].v << 9; + dst->d_spc_hardlimit = src->c[Q_SPC].hardlimit << 9; + dst->d_spc_softlimit = src->c[Q_SPC].softlimit << 9; + dst->d_spc_timer = src->c[Q_SPC].timer; + dst->d_spc_warns = src->c[Q_SPC].warns; + + dst->d_ino_count = src->c[Q_INO].v; + dst->d_ino_hardlimit = src->c[Q_INO].hardlimit; + dst->d_ino_softlimit = src->c[Q_INO].softlimit; + dst->d_ino_timer = src->c[Q_INO].timer; + dst->d_ino_warns = src->c[Q_INO].warns; +} + +static int bch2_get_quota(struct super_block *sb, struct kqid kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid.type]; + qid_t qid = from_kqid(&init_user_ns, kqid); + struct bch_memquota *mq; + + memset(qdq, 0, sizeof(*qdq)); + + mutex_lock(&q->lock); + mq = genradix_ptr(&q->table, qid); + if (mq) + __bch2_quota_get(qdq, mq); + mutex_unlock(&q->lock); + + return 0; +} + +static int bch2_get_next_quota(struct super_block *sb, struct kqid *kqid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bch_memquota_type *q = &c->quotas[kqid->type]; + qid_t qid = from_kqid(&init_user_ns, *kqid); + struct genradix_iter iter; + struct bch_memquota *mq; + int ret = 0; + + mutex_lock(&q->lock); + + 
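+	/*
+	 * Scan forward from the requested qid for the first entry with any
+	 * nonzero counters; genradix pages are zero-filled, so an entry that
+	 * compares equal to the zero page means "no quota here":
+	 */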
genradix_for_each_from(&q->table, iter, mq, qid) + if (memcmp(mq, page_address(ZERO_PAGE(0)), sizeof(*mq))) { + __bch2_quota_get(qdq, mq); + *kqid = make_kqid(current_user_ns(), kqid->type, iter.pos); + goto found; + } + + ret = -ENOENT; +found: + mutex_unlock(&q->lock); + return bch2_err_class(ret); +} + +static int bch2_set_quota_trans(struct btree_trans *trans, + struct bkey_i_quota *new_quota, + struct qc_dqblk *qdq) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_quotas, new_quota->k.p, + BTREE_ITER_SLOTS|BTREE_ITER_INTENT); + ret = bkey_err(k); + if (unlikely(ret)) + return ret; + + if (k.k->type == KEY_TYPE_quota) + new_quota->v = *bkey_s_c_to_quota(k).v; + + if (qdq->d_fieldmask & QC_SPC_SOFT) + new_quota->v.c[Q_SPC].softlimit = cpu_to_le64(qdq->d_spc_softlimit >> 9); + if (qdq->d_fieldmask & QC_SPC_HARD) + new_quota->v.c[Q_SPC].hardlimit = cpu_to_le64(qdq->d_spc_hardlimit >> 9); + + if (qdq->d_fieldmask & QC_INO_SOFT) + new_quota->v.c[Q_INO].softlimit = cpu_to_le64(qdq->d_ino_softlimit); + if (qdq->d_fieldmask & QC_INO_HARD) + new_quota->v.c[Q_INO].hardlimit = cpu_to_le64(qdq->d_ino_hardlimit); + + ret = bch2_trans_update(trans, &iter, &new_quota->k_i, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_set_quota(struct super_block *sb, struct kqid qid, + struct qc_dqblk *qdq) +{ + struct bch_fs *c = sb->s_fs_info; + struct bkey_i_quota new_quota; + int ret; + + if (0) { + struct printbuf buf = PRINTBUF; + + qc_dqblk_to_text(&buf, qdq); + pr_info("setting:\n%s", buf.buf); + printbuf_exit(&buf); + } + + if (sb->s_flags & SB_RDONLY) + return -EROFS; + + bkey_quota_init(&new_quota.k_i); + new_quota.k.p = POS(qid.type, from_kqid(&init_user_ns, qid)); + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_set_quota_trans(&trans, &new_quota, qdq)) ?: + __bch2_quota_set(c, bkey_i_to_s_c(&new_quota.k_i), qdq); + + return bch2_err_class(ret); +} + +const struct quotactl_ops bch2_quotactl_operations = { + .quota_enable = bch2_quota_enable, + .quota_disable = bch2_quota_disable, + .rm_xquota = bch2_quota_remove, + + .get_state = bch2_quota_get_state, + .set_info = bch2_quota_set_info, + + .get_dqblk = bch2_get_quota, + .get_nextdqblk = bch2_get_next_quota, + .set_dqblk = bch2_set_quota, +}; + +#endif /* CONFIG_BCACHEFS_QUOTA */ diff --git a/fs/bcachefs/quota.h b/fs/bcachefs/quota.h new file mode 100644 index 000000000..2f463874a --- /dev/null +++ b/fs/bcachefs/quota.h @@ -0,0 +1,74 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_H +#define _BCACHEFS_QUOTA_H + +#include "inode.h" +#include "quota_types.h" + +enum bkey_invalid_flags; +extern const struct bch_sb_field_ops bch_sb_field_ops_quota; + +int bch2_quota_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_quota_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_quota ((struct bkey_ops) { \ + .key_invalid = bch2_quota_invalid, \ + .val_to_text = bch2_quota_to_text, \ + .min_val_size = 32, \ +}) + +static inline struct bch_qid bch_qid(struct bch_inode_unpacked *u) +{ + return (struct bch_qid) { + .q[QTYP_USR] = u->bi_uid, + .q[QTYP_GRP] = u->bi_gid, + .q[QTYP_PRJ] = u->bi_project ? 
u->bi_project - 1 : 0, + }; +} + +static inline unsigned enabled_qtypes(struct bch_fs *c) +{ + return ((c->opts.usrquota << QTYP_USR)| + (c->opts.grpquota << QTYP_GRP)| + (c->opts.prjquota << QTYP_PRJ)); +} + +#ifdef CONFIG_BCACHEFS_QUOTA + +int bch2_quota_acct(struct bch_fs *, struct bch_qid, enum quota_counters, + s64, enum quota_acct_mode); + +int bch2_quota_transfer(struct bch_fs *, unsigned, struct bch_qid, + struct bch_qid, u64, enum quota_acct_mode); + +void bch2_fs_quota_exit(struct bch_fs *); +void bch2_fs_quota_init(struct bch_fs *); +int bch2_fs_quota_read(struct bch_fs *); + +extern const struct quotactl_ops bch2_quotactl_operations; + +#else + +static inline int bch2_quota_acct(struct bch_fs *c, struct bch_qid qid, + enum quota_counters counter, s64 v, + enum quota_acct_mode mode) +{ + return 0; +} + +static inline int bch2_quota_transfer(struct bch_fs *c, unsigned qtypes, + struct bch_qid dst, + struct bch_qid src, u64 space, + enum quota_acct_mode mode) +{ + return 0; +} + +static inline void bch2_fs_quota_exit(struct bch_fs *c) {} +static inline void bch2_fs_quota_init(struct bch_fs *c) {} +static inline int bch2_fs_quota_read(struct bch_fs *c) { return 0; } + +#endif + +#endif /* _BCACHEFS_QUOTA_H */ diff --git a/fs/bcachefs/quota_types.h b/fs/bcachefs/quota_types.h new file mode 100644 index 000000000..6a136083d --- /dev/null +++ b/fs/bcachefs/quota_types.h @@ -0,0 +1,43 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_QUOTA_TYPES_H +#define _BCACHEFS_QUOTA_TYPES_H + +#include + +struct bch_qid { + u32 q[QTYP_NR]; +}; + +enum quota_acct_mode { + KEY_TYPE_QUOTA_PREALLOC, + KEY_TYPE_QUOTA_WARN, + KEY_TYPE_QUOTA_NOCHECK, +}; + +struct memquota_counter { + u64 v; + u64 hardlimit; + u64 softlimit; + s64 timer; + int warns; + int warning_issued; +}; + +struct bch_memquota { + struct memquota_counter c[Q_COUNTERS]; +}; + +typedef GENRADIX(struct bch_memquota) bch_memquota_table; + +struct quota_limit { + u32 timelimit; + u32 warnlimit; +}; + +struct bch_memquota_type { + struct quota_limit limits[Q_COUNTERS]; + bch_memquota_table table; + struct mutex lock; +}; + +#endif /* _BCACHEFS_QUOTA_TYPES_H */ diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 index 000000000..c3d577236 --- /dev/null +++ b/fs/bcachefs/rebalance.c @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "alloc_foreground.h" +#include "btree_iter.h" +#include "buckets.h" +#include "clock.h" +#include "compress.h" +#include "disk_groups.h" +#include "errcode.h" +#include "extents.h" +#include "io.h" +#include "move.h" +#include "rebalance.h" +#include "super-io.h" +#include "trace.h" + +#include +#include +#include + +/* + * Check if an extent should be moved: + * returns -1 if it should not be moved, or + * device of pointer that should be moved, if known, or INT_MAX if unknown + */ +static bool rebalance_pred(struct bch_fs *c, void *arg, + struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned i; + + data_opts->rewrite_ptrs = 0; + data_opts->target = io_opts->background_target; + data_opts->extra_replicas = 0; + data_opts->btree_insert_flags = 0; + + if (io_opts->background_compression && + !bch2_bkey_is_incompressible(k)) { + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + i = 0; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (!p.ptr.cached && + p.crc.compression_type != + 
bch2_compression_opt_to_type(io_opts->background_compression)) + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + } + + if (io_opts->background_target) { + const struct bch_extent_ptr *ptr; + + i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (!ptr->cached && + !bch2_dev_in_target(c, ptr->dev, io_opts->background_target) && + bch2_target_accepts_data(c, BCH_DATA_user, io_opts->background_target)) + data_opts->rewrite_ptrs |= 1U << i; + i++; + } + } + + return data_opts->rewrite_ptrs != 0; +} + +void bch2_rebalance_add_key(struct bch_fs *c, + struct bkey_s_c k, + struct bch_io_opts *io_opts) +{ + struct data_update_opts update_opts = { 0 }; + struct bkey_ptrs_c ptrs; + const struct bch_extent_ptr *ptr; + unsigned i; + + if (!rebalance_pred(c, NULL, k, io_opts, &update_opts)) + return; + + i = 0; + ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) { + if ((1U << i) && update_opts.rewrite_ptrs) + if (atomic64_add_return(k.k->size, + &bch_dev_bkey_exists(c, ptr->dev)->rebalance_work) == + k.k->size) + rebalance_wakeup(c); + i++; + } +} + +void bch2_rebalance_add_work(struct bch_fs *c, u64 sectors) +{ + if (atomic64_add_return(sectors, &c->rebalance.work_unknown_dev) == + sectors) + rebalance_wakeup(c); +} + +struct rebalance_work { + int dev_most_full_idx; + unsigned dev_most_full_percent; + u64 dev_most_full_work; + u64 dev_most_full_capacity; + u64 total_work; +}; + +static void rebalance_work_accumulate(struct rebalance_work *w, + u64 dev_work, u64 unknown_dev, u64 capacity, int idx) +{ + unsigned percent_full; + u64 work = dev_work + unknown_dev; + + if (work < dev_work || work < unknown_dev) + work = U64_MAX; + work = min(work, capacity); + + percent_full = div64_u64(work * 100, capacity); + + if (percent_full >= w->dev_most_full_percent) { + w->dev_most_full_idx = idx; + w->dev_most_full_percent = percent_full; + w->dev_most_full_work = work; + w->dev_most_full_capacity = capacity; + } + + if (w->total_work + dev_work >= w->total_work && + w->total_work + dev_work >= dev_work) + w->total_work += dev_work; +} + +static struct rebalance_work rebalance_work(struct bch_fs *c) +{ + struct bch_dev *ca; + struct rebalance_work ret = { .dev_most_full_idx = -1 }; + u64 unknown_dev = atomic64_read(&c->rebalance.work_unknown_dev); + unsigned i; + + for_each_online_member(ca, c, i) + rebalance_work_accumulate(&ret, + atomic64_read(&ca->rebalance_work), + unknown_dev, + bucket_to_sector(ca, ca->mi.nbuckets - + ca->mi.first_bucket), + i); + + rebalance_work_accumulate(&ret, + unknown_dev, 0, c->capacity, -1); + + return ret; +} + +static void rebalance_work_reset(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + for_each_online_member(ca, c, i) + atomic64_set(&ca->rebalance_work, 0); + + atomic64_set(&c->rebalance.work_unknown_dev, 0); +} + +static unsigned long curr_cputime(void) +{ + u64 utime, stime; + + task_cputime_adjusted(current, &utime, &stime); + return nsecs_to_jiffies(utime + stime); +} + +static int bch2_rebalance_thread(void *arg) +{ + struct bch_fs *c = arg; + struct bch_fs_rebalance *r = &c->rebalance; + struct io_clock *clock = &c->io_clock[WRITE]; + struct rebalance_work w, p; + struct bch_move_stats move_stats; + unsigned long start, prev_start; + unsigned long prev_run_time, prev_run_cputime; + unsigned long cputime, prev_cputime; + u64 io_start; + long throttle; + + set_freezable(); + + io_start = atomic64_read(&clock->now); + p = rebalance_work(c); + prev_start = jiffies; + prev_cputime = curr_cputime(); + + bch2_move_stats_init(&move_stats, "rebalance"); + while 
(!kthread_wait_freezable(r->enabled)) { + cond_resched(); + + start = jiffies; + cputime = curr_cputime(); + + prev_run_time = start - prev_start; + prev_run_cputime = cputime - prev_cputime; + + w = rebalance_work(c); + BUG_ON(!w.dev_most_full_capacity); + + if (!w.total_work) { + r->state = REBALANCE_WAITING; + kthread_wait_freezable(rebalance_work(c).total_work); + continue; + } + + /* + * If there isn't much work to do, throttle cpu usage: + */ + throttle = prev_run_cputime * 100 / + max(1U, w.dev_most_full_percent) - + prev_run_time; + + if (w.dev_most_full_percent < 20 && throttle > 0) { + r->throttled_until_iotime = io_start + + div_u64(w.dev_most_full_capacity * + (20 - w.dev_most_full_percent), + 50); + + if (atomic64_read(&clock->now) + clock->max_slop < + r->throttled_until_iotime) { + r->throttled_until_cputime = start + throttle; + r->state = REBALANCE_THROTTLED; + + bch2_kthread_io_clock_wait(clock, + r->throttled_until_iotime, + throttle); + continue; + } + } + + /* minimum 1 mb/sec: */ + r->pd.rate.rate = + max_t(u64, 1 << 11, + r->pd.rate.rate * + max(p.dev_most_full_percent, 1U) / + max(w.dev_most_full_percent, 1U)); + + io_start = atomic64_read(&clock->now); + p = w; + prev_start = start; + prev_cputime = cputime; + + r->state = REBALANCE_RUNNING; + memset(&move_stats, 0, sizeof(move_stats)); + rebalance_work_reset(c); + + bch2_move_data(c, + 0, POS_MIN, + BTREE_ID_NR, POS_MAX, + /* ratelimiting disabled for now */ + NULL, /* &r->pd.rate, */ + &move_stats, + writepoint_ptr(&c->rebalance_write_point), + true, + rebalance_pred, NULL); + } + + return 0; +} + +void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_rebalance *r = &c->rebalance; + struct rebalance_work w = rebalance_work(c); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); + + prt_printf(out, "fullest_dev (%i):", w.dev_most_full_idx); + prt_tab(out); + + prt_human_readable_u64(out, w.dev_most_full_work << 9); + prt_printf(out, "/"); + prt_human_readable_u64(out, w.dev_most_full_capacity << 9); + prt_newline(out); + + prt_printf(out, "total work:"); + prt_tab(out); + + prt_human_readable_u64(out, w.total_work << 9); + prt_printf(out, "/"); + prt_human_readable_u64(out, c->capacity << 9); + prt_newline(out); + + prt_printf(out, "rate:"); + prt_tab(out); + prt_printf(out, "%u", r->pd.rate.rate); + prt_newline(out); + + switch (r->state) { + case REBALANCE_WAITING: + prt_printf(out, "waiting"); + break; + case REBALANCE_THROTTLED: + prt_printf(out, "throttled for %lu sec or ", + (r->throttled_until_cputime - jiffies) / HZ); + prt_human_readable_u64(out, + (r->throttled_until_iotime - + atomic64_read(&c->io_clock[WRITE].now)) << 9); + prt_printf(out, " io"); + break; + case REBALANCE_RUNNING: + prt_printf(out, "running"); + break; + } + prt_newline(out); +} + +void bch2_rebalance_stop(struct bch_fs *c) +{ + struct task_struct *p; + + c->rebalance.pd.rate.rate = UINT_MAX; + bch2_ratelimit_reset(&c->rebalance.pd.rate); + + p = rcu_dereference_protected(c->rebalance.thread, 1); + c->rebalance.thread = NULL; + + if (p) { + /* for sychronizing with rebalance_wakeup() */ + synchronize_rcu(); + + kthread_stop(p); + put_task_struct(p); + } +} + +int bch2_rebalance_start(struct bch_fs *c) +{ + struct task_struct *p; + int ret; + + if (c->rebalance.thread) + return 0; + + if (c->opts.nochanges) + return 0; + + p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name); + ret = PTR_ERR_OR_ZERO(p); + if (ret) { + bch_err(c, "error creating rebalance thread: %s", 
bch2_err_str(ret)); + return ret; + } + + get_task_struct(p); + rcu_assign_pointer(c->rebalance.thread, p); + wake_up_process(p); + return 0; +} + +void bch2_fs_rebalance_init(struct bch_fs *c) +{ + bch2_pd_controller_init(&c->rebalance.pd); + + atomic64_set(&c->rebalance.work_unknown_dev, S64_MAX); +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h new file mode 100644 index 000000000..7ade0bb81 --- /dev/null +++ b/fs/bcachefs/rebalance.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_H +#define _BCACHEFS_REBALANCE_H + +#include "rebalance_types.h" + +static inline void rebalance_wakeup(struct bch_fs *c) +{ + struct task_struct *p; + + rcu_read_lock(); + p = rcu_dereference(c->rebalance.thread); + if (p) + wake_up_process(p); + rcu_read_unlock(); +} + +void bch2_rebalance_add_key(struct bch_fs *, struct bkey_s_c, + struct bch_io_opts *); +void bch2_rebalance_add_work(struct bch_fs *, u64); + +void bch2_rebalance_work_to_text(struct printbuf *, struct bch_fs *); + +void bch2_rebalance_stop(struct bch_fs *); +int bch2_rebalance_start(struct bch_fs *); +void bch2_fs_rebalance_init(struct bch_fs *); + +#endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h new file mode 100644 index 000000000..7462a92e9 --- /dev/null +++ b/fs/bcachefs/rebalance_types.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REBALANCE_TYPES_H +#define _BCACHEFS_REBALANCE_TYPES_H + +#include "move_types.h" + +enum rebalance_state { + REBALANCE_WAITING, + REBALANCE_THROTTLED, + REBALANCE_RUNNING, +}; + +struct bch_fs_rebalance { + struct task_struct __rcu *thread; + struct bch_pd_controller pd; + + atomic64_t work_unknown_dev; + + enum rebalance_state state; + u64 throttled_until_iotime; + unsigned long throttled_until_cputime; + + unsigned enabled:1; +}; + +#endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c new file mode 100644 index 000000000..63b385d88 --- /dev/null +++ b/fs/bcachefs/recovery.c @@ -0,0 +1,1669 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "backpointers.h" +#include "bkey_buf.h" +#include "alloc_background.h" +#include "btree_gc.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "buckets.h" +#include "dirent.h" +#include "ec.h" +#include "errcode.h" +#include "error.h" +#include "fs-common.h" +#include "fsck.h" +#include "journal_io.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "lru.h" +#include "move.h" +#include "quota.h" +#include "recovery.h" +#include "replicas.h" +#include "subvolume.h" +#include "super-io.h" + +#include +#include + +#define QSTR(n) { { { .len = strlen(n) } }, .name = n } + +/* for -o reconstruct_alloc: */ +static void drop_alloc_keys(struct journal_keys *keys) +{ + size_t src, dst; + + for (src = 0, dst = 0; src < keys->nr; src++) + if (keys->d[src].btree_id != BTREE_ID_alloc) + keys->d[dst++] = keys->d[src]; + + keys->nr = dst; +} + +/* + * Btree node pointers have a field to stack a pointer to the in memory btree + * node; we need to zero out this field when reading in btree nodes, or when + * reading in keys from the journal: + */ +static void zero_out_btree_mem_ptr(struct journal_keys *keys) +{ + struct journal_key *i; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->k->k.type == KEY_TYPE_btree_ptr_v2) + bkey_i_to_btree_ptr_v2(i->k)->v.mem_ptr = 0; +} + +/* iterate over 
keys read from the journal: */ + +static int __journal_key_cmp(enum btree_id l_btree_id, + unsigned l_level, + struct bpos l_pos, + const struct journal_key *r) +{ + return (cmp_int(l_btree_id, r->btree_id) ?: + cmp_int(l_level, r->level) ?: + bpos_cmp(l_pos, r->k->k.p)); +} + +static int journal_key_cmp(const struct journal_key *l, const struct journal_key *r) +{ + return __journal_key_cmp(l->btree_id, l->level, l->k->k.p, r); +} + +static inline size_t idx_to_pos(struct journal_keys *keys, size_t idx) +{ + size_t gap_size = keys->size - keys->nr; + + if (idx >= keys->gap) + idx += gap_size; + return idx; +} + +static inline struct journal_key *idx_to_key(struct journal_keys *keys, size_t idx) +{ + return keys->d + idx_to_pos(keys, idx); +} + +static size_t __bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + size_t l = 0, r = keys->nr, m; + + while (l < r) { + m = l + ((r - l) >> 1); + if (__journal_key_cmp(id, level, pos, idx_to_key(keys, m)) > 0) + l = m + 1; + else + r = m; + } + + BUG_ON(l < keys->nr && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l)) > 0); + + BUG_ON(l && + __journal_key_cmp(id, level, pos, idx_to_key(keys, l - 1)) <= 0); + + return l; +} + +static size_t bch2_journal_key_search(struct journal_keys *keys, + enum btree_id id, unsigned level, + struct bpos pos) +{ + return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos)); +} + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos, + struct bpos end_pos, size_t *idx) +{ + struct journal_keys *keys = &c->journal_keys; + unsigned iters = 0; + struct journal_key *k; +search: + if (!*idx) + *idx = __bch2_journal_key_search(keys, btree_id, level, pos); + + while ((k = *idx < keys->nr ? 
idx_to_key(keys, *idx) : NULL)) { + if (__journal_key_cmp(btree_id, level, end_pos, k) < 0) + return NULL; + + if (__journal_key_cmp(btree_id, level, pos, k) <= 0 && + !k->overwritten) + return k->k; + + (*idx)++; + iters++; + if (iters == 10) { + *idx = 0; + goto search; + } + } + + return NULL; +} + +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *c, enum btree_id btree_id, + unsigned level, struct bpos pos) +{ + size_t idx = 0; + + return bch2_journal_keys_peek_upto(c, btree_id, level, pos, pos, &idx); +} + +static void journal_iters_fix(struct bch_fs *c) +{ + struct journal_keys *keys = &c->journal_keys; + /* The key we just inserted is immediately before the gap: */ + size_t gap_end = keys->gap + (keys->size - keys->nr); + struct btree_and_journal_iter *iter; + + /* + * If an iterator points one after the key we just inserted, decrement + * the iterator so it points at the key we just inserted - if the + * decrement was unnecessary, bch2_btree_and_journal_iter_peek() will + * handle that: + */ + list_for_each_entry(iter, &c->journal_iters, journal.list) + if (iter->journal.idx == gap_end) + iter->journal.idx = keys->gap - 1; +} + +static void journal_iters_move_gap(struct bch_fs *c, size_t old_gap, size_t new_gap) +{ + struct journal_keys *keys = &c->journal_keys; + struct journal_iter *iter; + size_t gap_size = keys->size - keys->nr; + + list_for_each_entry(iter, &c->journal_iters, list) { + if (iter->idx > old_gap) + iter->idx -= gap_size; + if (iter->idx >= new_gap) + iter->idx += gap_size; + } +} + +int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct journal_key n = { + .btree_id = id, + .level = level, + .k = k, + .allocated = true, + /* + * Ensure these keys are done last by journal replay, to unblock + * journal reclaim: + */ + .journal_seq = U32_MAX, + }; + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, id, level, k->k.p); + + BUG_ON(test_bit(BCH_FS_RW, &c->flags)); + + if (idx < keys->size && + journal_key_cmp(&n, &keys->d[idx]) == 0) { + if (keys->d[idx].allocated) + kfree(keys->d[idx].k); + keys->d[idx] = n; + return 0; + } + + if (idx > keys->gap) + idx -= keys->size - keys->nr; + + if (keys->nr == keys->size) { + struct journal_keys new_keys = { + .nr = keys->nr, + .size = max_t(size_t, keys->size, 8) * 2, + }; + + new_keys.d = kvmalloc_array(new_keys.size, sizeof(new_keys.d[0]), GFP_KERNEL); + if (!new_keys.d) { + bch_err(c, "%s: error allocating new key array (size %zu)", + __func__, new_keys.size); + return -BCH_ERR_ENOMEM_journal_key_insert; + } + + /* Since @keys was full, there was no gap: */ + memcpy(new_keys.d, keys->d, sizeof(keys->d[0]) * keys->nr); + kvfree(keys->d); + *keys = new_keys; + + /* And now the gap is at the end: */ + keys->gap = keys->nr; + } + + journal_iters_move_gap(c, keys->gap, idx); + + move_gap(keys->d, keys->nr, keys->size, keys->gap, idx); + keys->gap = idx; + + keys->nr++; + keys->d[keys->gap++] = n; + + journal_iters_fix(c); + + return 0; +} + +/* + * Can only be used from the recovery thread while we're still RO - can't be + * used once we've got RW, as journal_keys is at that point used by multiple + * threads: + */ +int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, + unsigned level, struct bkey_i *k) +{ + struct bkey_i *n; + int ret; + + n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_journal_key_insert; + + bkey_copy(n, k); + ret = bch2_journal_key_insert_take(c, id, level, 
n); + if (ret) + kfree(n); + return ret; +} + +int bch2_journal_key_delete(struct bch_fs *c, enum btree_id id, + unsigned level, struct bpos pos) +{ + struct bkey_i whiteout; + + bkey_init(&whiteout.k); + whiteout.k.p = pos; + + return bch2_journal_key_insert(c, id, level, &whiteout); +} + +void bch2_journal_key_overwritten(struct bch_fs *c, enum btree_id btree, + unsigned level, struct bpos pos) +{ + struct journal_keys *keys = &c->journal_keys; + size_t idx = bch2_journal_key_search(keys, btree, level, pos); + + if (idx < keys->size && + keys->d[idx].btree_id == btree && + keys->d[idx].level == level && + bpos_eq(keys->d[idx].k->k.p, pos)) + keys->d[idx].overwritten = true; +} + +static void bch2_journal_iter_advance(struct journal_iter *iter) +{ + if (iter->idx < iter->keys->size) { + iter->idx++; + if (iter->idx == iter->keys->gap) + iter->idx += iter->keys->size - iter->keys->nr; + } +} + +static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) +{ + struct journal_key *k = iter->keys->d + iter->idx; + + while (k < iter->keys->d + iter->keys->size && + k->btree_id == iter->btree_id && + k->level == iter->level) { + if (!k->overwritten) + return bkey_i_to_s_c(k->k); + + bch2_journal_iter_advance(iter); + k = iter->keys->d + iter->idx; + } + + return bkey_s_c_null; +} + +static void bch2_journal_iter_exit(struct journal_iter *iter) +{ + list_del(&iter->list); +} + +static void bch2_journal_iter_init(struct bch_fs *c, + struct journal_iter *iter, + enum btree_id id, unsigned level, + struct bpos pos) +{ + iter->btree_id = id; + iter->level = level; + iter->keys = &c->journal_keys; + iter->idx = bch2_journal_key_search(&c->journal_keys, id, level, pos); +} + +static struct bkey_s_c bch2_journal_iter_peek_btree(struct btree_and_journal_iter *iter) +{ + return bch2_btree_node_iter_peek_unpack(&iter->node_iter, + iter->b, &iter->unpacked); +} + +static void bch2_journal_iter_advance_btree(struct btree_and_journal_iter *iter) +{ + bch2_btree_node_iter_advance(&iter->node_iter, iter->b); +} + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *iter) +{ + if (bpos_eq(iter->pos, SPOS_MAX)) + iter->at_end = true; + else + iter->pos = bpos_successor(iter->pos); +} + +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *iter) +{ + struct bkey_s_c btree_k, journal_k, ret; +again: + if (iter->at_end) + return bkey_s_c_null; + + while ((btree_k = bch2_journal_iter_peek_btree(iter)).k && + bpos_lt(btree_k.k->p, iter->pos)) + bch2_journal_iter_advance_btree(iter); + + while ((journal_k = bch2_journal_iter_peek(&iter->journal)).k && + bpos_lt(journal_k.k->p, iter->pos)) + bch2_journal_iter_advance(&iter->journal); + + ret = journal_k.k && + (!btree_k.k || bpos_le(journal_k.k->p, btree_k.k->p)) + ? 
journal_k + : btree_k; + + if (ret.k && iter->b && bpos_gt(ret.k->p, iter->b->data->max_key)) + ret = bkey_s_c_null; + + if (ret.k) { + iter->pos = ret.k->p; + if (bkey_deleted(ret.k)) { + bch2_btree_and_journal_iter_advance(iter); + goto again; + } + } else { + iter->pos = SPOS_MAX; + iter->at_end = true; + } + + return ret; +} + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *iter) +{ + bch2_journal_iter_exit(&iter->journal); +} + +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b, + struct btree_node_iter node_iter, + struct bpos pos) +{ + memset(iter, 0, sizeof(*iter)); + + iter->b = b; + iter->node_iter = node_iter; + bch2_journal_iter_init(c, &iter->journal, b->c.btree_id, b->c.level, pos); + INIT_LIST_HEAD(&iter->journal.list); + iter->pos = b->data->min_key; + iter->at_end = false; +} + +/* + * this version is used by btree_gc before filesystem has gone RW and + * multithreaded, so uses the journal_iters list: + */ +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *iter, + struct bch_fs *c, + struct btree *b) +{ + struct btree_node_iter node_iter; + + bch2_btree_node_iter_init_from_start(&node_iter, b); + __bch2_btree_and_journal_iter_init_node_iter(iter, c, b, node_iter, b->data->min_key); + list_add(&iter->journal.list, &c->journal_iters); +} + +/* sort and dedup all keys in the journal: */ + +void bch2_journal_entries_free(struct bch_fs *c) +{ + struct journal_replay **i; + struct genradix_iter iter; + + genradix_for_each(&c->journal_entries, iter, i) + if (*i) + kvpfree(*i, offsetof(struct journal_replay, j) + + vstruct_bytes(&(*i)->j)); + genradix_free(&c->journal_entries); +} + +/* + * When keys compare equal, oldest compares first: + */ +static int journal_sort_key_cmp(const void *_l, const void *_r) +{ + const struct journal_key *l = _l; + const struct journal_key *r = _r; + + return journal_key_cmp(l, r) ?: + cmp_int(l->journal_seq, r->journal_seq) ?: + cmp_int(l->journal_offset, r->journal_offset); +} + +void bch2_journal_keys_free(struct journal_keys *keys) +{ + struct journal_key *i; + + move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr); + keys->gap = keys->nr; + + for (i = keys->d; i < keys->d + keys->nr; i++) + if (i->allocated) + kfree(i->k); + + kvfree(keys->d); + keys->d = NULL; + keys->nr = keys->gap = keys->size = 0; +} + +static void __journal_keys_sort(struct journal_keys *keys) +{ + struct journal_key *src, *dst; + + sort(keys->d, keys->nr, sizeof(keys->d[0]), journal_sort_key_cmp, NULL); + + src = dst = keys->d; + while (src < keys->d + keys->nr) { + while (src + 1 < keys->d + keys->nr && + src[0].btree_id == src[1].btree_id && + src[0].level == src[1].level && + bpos_eq(src[0].k->k.p, src[1].k->k.p)) + src++; + + *dst++ = *src++; + } + + keys->nr = dst - keys->d; +} + +static int journal_keys_sort(struct bch_fs *c) +{ + struct genradix_iter iter; + struct journal_replay *i, **_i; + struct jset_entry *entry; + struct bkey_i *k; + struct journal_keys *keys = &c->journal_keys; + size_t nr_keys = 0, nr_read = 0; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + for_each_jset_key(k, entry, &i->j) + nr_keys++; + } + + if (!nr_keys) + return 0; + + keys->size = roundup_pow_of_two(nr_keys); + + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + if (!keys->d) { + bch_err(c, "Failed to allocate buffer for sorted journal keys (%zu keys); trying slowpath", + 
nr_keys); + + do { + keys->size >>= 1; + keys->d = kvmalloc_array(keys->size, sizeof(keys->d[0]), GFP_KERNEL); + } while (!keys->d && keys->size > nr_keys / 8); + + if (!keys->d) { + bch_err(c, "Failed to allocate %zu size buffer for sorted journal keys; exiting", + keys->size); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + cond_resched(); + + for_each_jset_key(k, entry, &i->j) { + if (keys->nr == keys->size) { + __journal_keys_sort(keys); + + if (keys->nr > keys->size * 7 / 8) { + bch_err(c, "Too many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu/%zu", + keys->nr, keys->size, nr_read, nr_keys); + return -BCH_ERR_ENOMEM_journal_keys_sort; + } + } + + keys->d[keys->nr++] = (struct journal_key) { + .btree_id = entry->btree_id, + .level = entry->level, + .k = k, + .journal_seq = le64_to_cpu(i->j.seq), + .journal_offset = k->_data - i->j._data, + }; + + nr_read++; + } + } + + __journal_keys_sort(keys); + keys->gap = keys->nr; + + bch_verbose(c, "Journal keys: %zu read, %zu after sorting and compacting", nr_keys, keys->nr); + return 0; +} + +/* journal replay: */ + +static void replay_now_at(struct journal *j, u64 seq) +{ + BUG_ON(seq < j->replay_journal_seq); + + seq = min(seq, j->replay_journal_seq_end); + + while (j->replay_journal_seq < seq) + bch2_journal_pin_put(j, j->replay_journal_seq++); +} + +static int bch2_journal_replay_key(struct btree_trans *trans, + struct journal_key *k) +{ + struct btree_iter iter; + unsigned iter_flags = + BTREE_ITER_INTENT| + BTREE_ITER_NOT_EXTENTS; + unsigned update_flags = BTREE_TRIGGER_NORUN; + int ret; + + /* + * BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to + * keep the key cache coherent with the underlying btree. Nothing + * besides the allocator is doing updates yet so we don't need key cache + * coherency for non-alloc btrees, and key cache fills for snapshots + * btrees use BTREE_ITER_FILTER_SNAPSHOTS, which isn't available until + * the snapshots recovery pass runs. 
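+	 *
+	 * Hence the split below: leaf alloc keys are replayed through the key
+	 * cache (BTREE_ITER_CACHED); everything else bypasses it with
+	 * BTREE_UPDATE_KEY_CACHE_RECLAIM.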
+	 */
+	if (!k->level && k->btree_id == BTREE_ID_alloc)
+		iter_flags |= BTREE_ITER_CACHED;
+	else
+		update_flags |= BTREE_UPDATE_KEY_CACHE_RECLAIM;
+
+	bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p,
+				  BTREE_MAX_DEPTH, k->level,
+				  iter_flags);
+	ret = bch2_btree_iter_traverse(&iter);
+	if (ret)
+		goto out;
+
+	/* Must be checked with btree locked: */
+	if (k->overwritten)
+		goto out;
+
+	ret = bch2_trans_update(trans, &iter, k->k, update_flags);
+out:
+	bch2_trans_iter_exit(trans, &iter);
+	return ret;
+}
+
+static int journal_sort_seq_cmp(const void *_l, const void *_r)
+{
+	const struct journal_key *l = *((const struct journal_key **)_l);
+	const struct journal_key *r = *((const struct journal_key **)_r);
+
+	return cmp_int(l->journal_seq, r->journal_seq);
+}
+
+static int bch2_journal_replay(struct bch_fs *c)
+{
+	struct journal_keys *keys = &c->journal_keys;
+	struct journal_key **keys_sorted, *k;
+	struct journal *j = &c->journal;
+	u64 start_seq	= c->journal_replay_seq_start;
+	u64 end_seq	= c->journal_replay_seq_end;
+	size_t i;
+	int ret;
+
+	move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+	keys->gap = keys->nr;
+
+	keys_sorted = kvmalloc_array(sizeof(*keys_sorted), keys->nr, GFP_KERNEL);
+	if (!keys_sorted)
+		return -BCH_ERR_ENOMEM_journal_replay;
+
+	for (i = 0; i < keys->nr; i++)
+		keys_sorted[i] = &keys->d[i];
+
+	sort(keys_sorted, keys->nr,
+	     sizeof(keys_sorted[0]),
+	     journal_sort_seq_cmp, NULL);
+
+	if (keys->nr) {
+		ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
+					   keys->nr, start_seq, end_seq);
+		if (ret)
+			goto err;
+	}
+
+	for (i = 0; i < keys->nr; i++) {
+		k = keys_sorted[i];
+
+		cond_resched();
+
+		replay_now_at(j, k->journal_seq);
+
+		ret = bch2_trans_do(c, NULL, NULL,
+				    BTREE_INSERT_LAZY_RW|
+				    BTREE_INSERT_NOFAIL|
+				    (!k->allocated
+				     ?
BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim + : 0), + bch2_journal_replay_key(&trans, k)); + if (ret) { + bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s", + bch2_btree_ids[k->btree_id], k->level, bch2_err_str(ret)); + goto err; + } + } + + replay_now_at(j, j->replay_journal_seq_end); + j->replay_journal_seq = 0; + + bch2_journal_set_replay_done(j); + bch2_journal_flush_all_pins(j); + ret = bch2_journal_error(j); + + if (keys->nr && !ret) + bch2_journal_log_msg(c, "journal replay finished"); +err: + kvfree(keys_sorted); + + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* journal replay early: */ + +static int journal_replay_entry_early(struct bch_fs *c, + struct jset_entry *entry) +{ + int ret = 0; + + switch (entry->type) { + case BCH_JSET_ENTRY_btree_root: { + struct btree_root *r; + + while (entry->btree_id >= c->btree_roots_extra.nr + BTREE_ID_NR) { + ret = darray_push(&c->btree_roots_extra, (struct btree_root) { NULL }); + if (ret) + return ret; + } + + r = bch2_btree_id_root(c, entry->btree_id); + + if (entry->u64s) { + r->level = entry->level; + bkey_copy(&r->key, &entry->start[0]); + r->error = 0; + } else { + r->error = -EIO; + } + r->alive = true; + break; + } + case BCH_JSET_ENTRY_usage: { + struct jset_entry_usage *u = + container_of(entry, struct jset_entry_usage, entry); + + switch (entry->btree_id) { + case BCH_FS_USAGE_reserved: + if (entry->level < BCH_REPLICAS_MAX) + c->usage_base->persistent_reserved[entry->level] = + le64_to_cpu(u->v); + break; + case BCH_FS_USAGE_inodes: + c->usage_base->nr_inodes = le64_to_cpu(u->v); + break; + case BCH_FS_USAGE_key_version: + atomic64_set(&c->key_version, + le64_to_cpu(u->v)); + break; + } + + break; + } + case BCH_JSET_ENTRY_data_usage: { + struct jset_entry_data_usage *u = + container_of(entry, struct jset_entry_data_usage, entry); + + ret = bch2_replicas_set_usage(c, &u->r, + le64_to_cpu(u->v)); + break; + } + case BCH_JSET_ENTRY_dev_usage: { + struct jset_entry_dev_usage *u = + container_of(entry, struct jset_entry_dev_usage, entry); + struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev)); + unsigned i, nr_types = jset_entry_dev_usage_nr_types(u); + + ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec); + + for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) { + ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets); + ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors); + ca->usage_base->d[i].fragmented = le64_to_cpu(u->d[i].fragmented); + } + + break; + } + case BCH_JSET_ENTRY_blacklist: { + struct jset_entry_blacklist *bl_entry = + container_of(entry, struct jset_entry_blacklist, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->seq), + le64_to_cpu(bl_entry->seq) + 1); + break; + } + case BCH_JSET_ENTRY_blacklist_v2: { + struct jset_entry_blacklist_v2 *bl_entry = + container_of(entry, struct jset_entry_blacklist_v2, entry); + + ret = bch2_journal_seq_blacklist_add(c, + le64_to_cpu(bl_entry->start), + le64_to_cpu(bl_entry->end) + 1); + break; + } + case BCH_JSET_ENTRY_clock: { + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + atomic64_set(&c->io_clock[clock->rw].now, le64_to_cpu(clock->time)); + } + } + + return ret; +} + +static int journal_replay_early(struct bch_fs *c, + struct bch_sb_field_clean *clean) +{ + struct jset_entry *entry; + int ret; + + if (clean) { + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = 
journal_replay_entry_early(c, entry); + if (ret) + return ret; + } + } else { + struct genradix_iter iter; + struct journal_replay *i, **_i; + + genradix_for_each(&c->journal_entries, iter, _i) { + i = *_i; + + if (!i || i->ignore) + continue; + + vstruct_for_each(&i->j, entry) { + ret = journal_replay_entry_early(c, entry); + if (ret) + return ret; + } + } + } + + bch2_fs_usage_initialize(c); + + return 0; +} + +/* sb clean section: */ + +static struct bkey_i *btree_root_find(struct bch_fs *c, + struct bch_sb_field_clean *clean, + struct jset *j, + enum btree_id id, unsigned *level) +{ + struct bkey_i *k; + struct jset_entry *entry, *start, *end; + + if (clean) { + start = clean->start; + end = vstruct_end(&clean->field); + } else { + start = j->start; + end = vstruct_last(j); + } + + for (entry = start; entry < end; entry = vstruct_next(entry)) + if (entry->type == BCH_JSET_ENTRY_btree_root && + entry->btree_id == id) + goto found; + + return NULL; +found: + if (!entry->u64s) + return ERR_PTR(-EINVAL); + + k = entry->start; + *level = entry->level; + return k; +} + +static int verify_superblock_clean(struct bch_fs *c, + struct bch_sb_field_clean **cleanp, + struct jset *j) +{ + unsigned i; + struct bch_sb_field_clean *clean = *cleanp; + struct printbuf buf1 = PRINTBUF; + struct printbuf buf2 = PRINTBUF; + int ret = 0; + + if (mustfix_fsck_err_on(j->seq != clean->journal_seq, c, + "superblock journal seq (%llu) doesn't match journal (%llu) after clean shutdown", + le64_to_cpu(clean->journal_seq), + le64_to_cpu(j->seq))) { + kfree(clean); + *cleanp = NULL; + return 0; + } + + for (i = 0; i < BTREE_ID_NR; i++) { + struct bkey_i *k1, *k2; + unsigned l1 = 0, l2 = 0; + + k1 = btree_root_find(c, clean, NULL, i, &l1); + k2 = btree_root_find(c, NULL, j, i, &l2); + + if (!k1 && !k2) + continue; + + printbuf_reset(&buf1); + printbuf_reset(&buf2); + + if (k1) + bch2_bkey_val_to_text(&buf1, c, bkey_i_to_s_c(k1)); + else + prt_printf(&buf1, "(none)"); + + if (k2) + bch2_bkey_val_to_text(&buf2, c, bkey_i_to_s_c(k2)); + else + prt_printf(&buf2, "(none)"); + + mustfix_fsck_err_on(!k1 || !k2 || + IS_ERR(k1) || + IS_ERR(k2) || + k1->k.u64s != k2->k.u64s || + memcmp(k1, k2, bkey_bytes(&k1->k)) || + l1 != l2, c, + "superblock btree root %u doesn't match journal after clean shutdown\n" + "sb: l=%u %s\n" + "journal: l=%u %s\n", i, + l1, buf1.buf, + l2, buf2.buf); + } +fsck_err: + printbuf_exit(&buf2); + printbuf_exit(&buf1); + return ret; +} + +static struct bch_sb_field_clean *read_superblock_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean, *sb_clean; + int ret; + + mutex_lock(&c->sb_lock); + sb_clean = bch2_sb_get_clean(c->disk_sb.sb); + + if (fsck_err_on(!sb_clean, c, + "superblock marked clean but clean section not present")) { + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + mutex_unlock(&c->sb_lock); + return NULL; + } + + clean = kmemdup(sb_clean, vstruct_bytes(&sb_clean->field), + GFP_KERNEL); + if (!clean) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(-BCH_ERR_ENOMEM_read_superblock_clean); + } + + ret = bch2_sb_clean_validate_late(c, clean, READ); + if (ret) { + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); + } + + mutex_unlock(&c->sb_lock); + + return clean; +fsck_err: + mutex_unlock(&c->sb_lock); + return ERR_PTR(ret); +} + +static bool btree_id_is_alloc(enum btree_id id) +{ + switch (id) { + case BTREE_ID_alloc: + case BTREE_ID_backpointers: + case BTREE_ID_need_discard: + case BTREE_ID_freespace: + case BTREE_ID_bucket_gens: + return true; + default: + return 
false; + } +} + +static int read_btree_roots(struct bch_fs *c) +{ + unsigned i; + int ret = 0; + + for (i = 0; i < btree_id_nr_alive(c); i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->alive) + continue; + + if (btree_id_is_alloc(i) && + c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + continue; + } + + if (r->error) { + __fsck_err(c, btree_id_is_alloc(i) + ? FSCK_CAN_IGNORE : 0, + "invalid btree root %s", + bch2_btree_ids[i]); + if (i == BTREE_ID_alloc) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + } + + ret = bch2_btree_root_read(c, i, &r->key, r->level); + if (ret) { + __fsck_err(c, + btree_id_is_alloc(i) + ? FSCK_CAN_IGNORE : 0, + "error reading btree root %s", + bch2_btree_ids[i]); + if (btree_id_is_alloc(i)) + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + } + } + + for (i = 0; i < BTREE_ID_NR; i++) { + struct btree_root *r = bch2_btree_id_root(c, i); + + if (!r->b) { + r->alive = false; + r->level = 0; + bch2_btree_root_alloc(c, i); + } + } +fsck_err: + return ret; +} + +static int bch2_initialize_subvolumes(struct bch_fs *c) +{ + struct bkey_i_snapshot_tree root_tree; + struct bkey_i_snapshot root_snapshot; + struct bkey_i_subvolume root_volume; + int ret; + + bkey_snapshot_tree_init(&root_tree.k_i); + root_tree.k.p.offset = 1; + root_tree.v.master_subvol = cpu_to_le32(1); + root_tree.v.root_snapshot = cpu_to_le32(U32_MAX); + + bkey_snapshot_init(&root_snapshot.k_i); + root_snapshot.k.p.offset = U32_MAX; + root_snapshot.v.flags = 0; + root_snapshot.v.parent = 0; + root_snapshot.v.subvol = cpu_to_le32(BCACHEFS_ROOT_SUBVOL); + root_snapshot.v.tree = cpu_to_le32(1); + SET_BCH_SNAPSHOT_SUBVOL(&root_snapshot.v, true); + + bkey_subvolume_init(&root_volume.k_i); + root_volume.k.p.offset = BCACHEFS_ROOT_SUBVOL; + root_volume.v.flags = 0; + root_volume.v.snapshot = cpu_to_le32(U32_MAX); + root_volume.v.inode = cpu_to_le64(BCACHEFS_ROOT_INO); + + ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, + &root_tree.k_i, + NULL, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_snapshots, + &root_snapshot.k_i, + NULL, NULL, 0) ?: + bch2_btree_insert(c, BTREE_ID_subvolumes, + &root_volume.k_i, + NULL, NULL, 0); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_inode_unpacked inode; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, BCACHEFS_ROOT_INO, U32_MAX), 0); + ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_is_inode(k.k)) { + bch_err(trans->c, "root inode not found"); + ret = -BCH_ERR_ENOENT_inode; + goto err; + } + + ret = bch2_inode_unpack(k, &inode); + BUG_ON(ret); + + inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + + ret = bch2_inode_write(trans, &iter, &inode); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* set bi_subvol on root inode */ +noinline_for_stack +static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) +{ + int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW, + __bch2_fs_upgrade_for_subvolumes(&trans)); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static void check_version_upgrade(struct bch_fs *c) +{ + unsigned latest_compatible = bch2_version_compatible(c->sb.version); + unsigned latest_version = bcachefs_metadata_version_current; + unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version; + unsigned new_version = 0; + u64 recovery_passes; + + if (old_version < 
bcachefs_metadata_required_upgrade_below) { + if (c->opts.version_upgrade == BCH_VERSION_UPGRADE_incompatible || + latest_compatible < bcachefs_metadata_required_upgrade_below) + new_version = latest_version; + else + new_version = latest_compatible; + } else { + switch (c->opts.version_upgrade) { + case BCH_VERSION_UPGRADE_compatible: + new_version = latest_compatible; + break; + case BCH_VERSION_UPGRADE_incompatible: + new_version = latest_version; + break; + case BCH_VERSION_UPGRADE_none: + new_version = old_version; + break; + } + } + + if (new_version > old_version) { + struct printbuf buf = PRINTBUF; + + if (old_version < bcachefs_metadata_required_upgrade_below) + prt_str(&buf, "Version upgrade required:\n"); + + if (old_version != c->sb.version) { + prt_str(&buf, "Version upgrade from "); + bch2_version_to_text(&buf, c->sb.version_upgrade_complete); + prt_str(&buf, " to "); + bch2_version_to_text(&buf, c->sb.version); + prt_str(&buf, " incomplete\n"); + } + + prt_printf(&buf, "Doing %s version upgrade from ", + BCH_VERSION_MAJOR(old_version) != BCH_VERSION_MAJOR(new_version) + ? "incompatible" : "compatible"); + bch2_version_to_text(&buf, old_version); + prt_str(&buf, " to "); + bch2_version_to_text(&buf, new_version); + prt_newline(&buf); + + recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version); + if (recovery_passes) { + prt_str(&buf, "fsck required"); + + c->recovery_passes_explicit |= recovery_passes; + c->opts.fix_errors = FSCK_FIX_yes; + } + + bch_info(c, "%s", buf.buf); + + mutex_lock(&c->sb_lock); + bch2_sb_upgrade(c, new_version); + mutex_unlock(&c->sb_lock); + + printbuf_exit(&buf); + } +} + +static int bch2_check_allocations(struct bch_fs *c) +{ + return bch2_gc(c, true, c->opts.norecovery); +} + +static int bch2_set_may_go_rw(struct bch_fs *c) +{ + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + return 0; +} + +struct recovery_pass_fn { + int (*fn)(struct bch_fs *); + const char *name; + unsigned when; +}; + +static struct recovery_pass_fn recovery_passes[] = { +#define x(_fn, _when) { .fn = bch2_##_fn, .name = #_fn, .when = _when }, + BCH_RECOVERY_PASSES() +#undef x +}; + +u64 bch2_fsck_recovery_passes(void) +{ + u64 ret = 0; + + for (unsigned i = 0; i < ARRAY_SIZE(recovery_passes); i++) + if (recovery_passes[i].when & PASS_FSCK) + ret |= BIT_ULL(i); + return ret; +} + +static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + struct recovery_pass_fn *p = recovery_passes + c->curr_recovery_pass; + + if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read) + return false; + if (c->recovery_passes_explicit & BIT_ULL(pass)) + return true; + if ((p->when & PASS_FSCK) && c->opts.fsck) + return true; + if ((p->when & PASS_UNCLEAN) && !c->sb.clean) + return true; + if (p->when & PASS_ALWAYS) + return true; + return false; +} + +static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +{ + int ret; + + c->curr_recovery_pass = pass; + + if (should_run_recovery_pass(c, pass)) { + struct recovery_pass_fn *p = recovery_passes + pass; + + if (!(p->when & PASS_SILENT)) + printk(KERN_INFO bch2_log_msg(c, "%s..."), p->name); + ret = p->fn(c); + if (ret) + return ret; + if (!(p->when & PASS_SILENT)) + printk(KERN_CONT " done\n"); + } + + return 0; +} + +static int bch2_run_recovery_passes(struct bch_fs *c) +{ + int ret = 0; +again: + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_passes)) { + ret = bch2_run_recovery_pass(c, c->curr_recovery_pass); + if (ret) + break; + c->curr_recovery_pass++; + } + + 
if (bch2_err_matches(ret, BCH_ERR_need_snapshot_cleanup)) { + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + c->curr_recovery_pass = BCH_RECOVERY_PASS_delete_dead_snapshots; + goto again; + } + + return ret; +} + +int bch2_fs_recovery(struct bch_fs *c) +{ + struct bch_sb_field_clean *clean = NULL; + struct jset *last_journal_entry = NULL; + u64 last_seq, blacklist_seq, journal_seq; + bool write_sb = false; + int ret = 0; + + if (c->sb.clean) + clean = read_superblock_clean(c); + ret = PTR_ERR_OR_ZERO(clean); + if (ret) + goto err; + + if (c->sb.clean) + bch_info(c, "recovering from clean shutdown, journal seq %llu", + le64_to_cpu(clean->journal_seq)); + else + bch_info(c, "recovering from unclean shutdown"); + + if (!(c->sb.features & (1ULL << BCH_FEATURE_new_extent_overwrite))) { + bch_err(c, "feature new_extent_overwrite not set, filesystem no longer supported"); + ret = -EINVAL; + goto err; + } + + if (!c->sb.clean && + !(c->sb.features & (1ULL << BCH_FEATURE_extents_above_btree_updates))) { + bch_err(c, "filesystem needs recovery from older version; run fsck from older bcachefs-tools to fix"); + ret = -EINVAL; + goto err; + } + + if (!(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done))) { + bch_err(c, "filesystem may have incompatible bkey formats; run fsck from the compat branch to fix"); + ret = -EINVAL; + goto err; + } + + if (c->opts.fsck || !(c->opts.nochanges && c->opts.norecovery)) + check_version_upgrade(c); + + if (c->opts.fsck && c->opts.norecovery) { + bch_err(c, "cannot select both norecovery and fsck"); + ret = -EINVAL; + goto err; + } + + ret = bch2_blacklist_table_initialize(c); + if (ret) { + bch_err(c, "error initializing blacklist table"); + goto err; + } + + if (!c->sb.clean || c->opts.fsck || c->opts.keep_journal) { + struct genradix_iter iter; + struct journal_replay **i; + + bch_verbose(c, "starting journal read"); + ret = bch2_journal_read(c, &last_seq, &blacklist_seq, &journal_seq); + if (ret) + goto err; + + /* + * note: cmd_list_journal needs the blacklist table fully up to date so + * it can asterisk ignored journal entries: + */ + if (c->opts.read_journal_only) + goto out; + + genradix_for_each_reverse(&c->journal_entries, iter, i) + if (*i && !(*i)->ignore) { + last_journal_entry = &(*i)->j; + break; + } + + if (mustfix_fsck_err_on(c->sb.clean && + last_journal_entry && + !journal_entry_empty(last_journal_entry), c, + "filesystem marked clean but journal not empty")) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + c->sb.clean = false; + } + + if (!last_journal_entry) { + fsck_err_on(!c->sb.clean, c, "no journal entries found"); + if (clean) + goto use_clean; + + genradix_for_each_reverse(&c->journal_entries, iter, i) + if (*i) { + last_journal_entry = &(*i)->j; + (*i)->ignore = false; + break; + } + } + + ret = journal_keys_sort(c); + if (ret) + goto err; + + if (c->sb.clean && last_journal_entry) { + ret = verify_superblock_clean(c, &clean, + last_journal_entry); + if (ret) + goto err; + } + } else { +use_clean: + if (!clean) { + bch_err(c, "no superblock clean section found"); + ret = -BCH_ERR_fsck_repair_impossible; + goto err; + + } + blacklist_seq = journal_seq = le64_to_cpu(clean->journal_seq) + 1; + } + + c->journal_replay_seq_start = last_seq; + c->journal_replay_seq_end = blacklist_seq - 1;; + + if (c->opts.reconstruct_alloc) { + c->sb.compat &= ~(1ULL << BCH_COMPAT_alloc_info); + drop_alloc_keys(&c->journal_keys); + } + + zero_out_btree_mem_ptr(&c->journal_keys); + + ret = 
journal_replay_early(c, clean);
+	if (ret)
+		goto err;
+
+	/*
+	 * After an unclean shutdown, skip the next few journal sequence
+	 * numbers as they may have been referenced by btree writes that
+	 * happened before their corresponding journal writes - those btree
+	 * writes need to be ignored, by skipping and blacklisting the next few
+	 * journal sequence numbers:
+	 */
+	if (!c->sb.clean)
+		journal_seq += 8;
+
+	if (blacklist_seq != journal_seq) {
+		ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu",
+					   blacklist_seq, journal_seq) ?:
+			bch2_journal_seq_blacklist_add(c,
+					blacklist_seq, journal_seq);
+		if (ret) {
+			bch_err(c, "error creating new journal seq blacklist entry");
+			goto err;
+		}
+	}
+
+	ret = bch2_journal_log_msg(c, "starting journal at entry %llu, replaying %llu-%llu",
+				   journal_seq, last_seq, blacklist_seq - 1) ?:
+		bch2_fs_journal_start(&c->journal, journal_seq);
+	if (ret)
+		goto err;
+
+	if (c->opts.reconstruct_alloc)
+		bch2_journal_log_msg(c, "dropping alloc info");
+
+	/*
+	 * Skip past versions that might have possibly been used (as nonces),
+	 * but hadn't had their pointers written:
+	 */
+	if (c->sb.encryption_type && !c->sb.clean)
+		atomic64_add(1 << 16, &c->key_version);
+
+	ret = read_btree_roots(c);
+	if (ret)
+		goto err;
+
+	ret = bch2_run_recovery_passes(c);
+	if (ret)
+		goto err;
+
+	if (enabled_qtypes(c)) {
+		bch_verbose(c, "reading quotas");
+		ret = bch2_fs_quota_read(c);
+		if (ret)
+			goto err;
+		bch_verbose(c, "quotas done");
+	}
+
+	mutex_lock(&c->sb_lock);
+	if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) != c->sb.version) {
+		SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, c->sb.version);
+		write_sb = true;
+	}
+
+	if (!test_bit(BCH_FS_ERROR, &c->flags)) {
+		c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
+		write_sb = true;
+	}
+
+	if (c->opts.fsck &&
+	    !test_bit(BCH_FS_ERROR, &c->flags) &&
+	    !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+		SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
+		SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
+		write_sb = true;
+	}
+
+	if (write_sb)
+		bch2_write_super(c);
+	mutex_unlock(&c->sb_lock);
+
+	if (!(c->sb.compat & (1ULL << BCH_COMPAT_extents_above_btree_updates_done)) ||
+	    !(c->sb.compat & (1ULL << BCH_COMPAT_bformat_overflow_done)) ||
+	    c->sb.version_min < bcachefs_metadata_version_btree_ptr_sectors_written) {
+		struct bch_move_stats stats;
+
+		bch2_move_stats_init(&stats, "recovery");
+
+		bch_info(c, "scanning for old btree nodes");
+		ret = bch2_fs_read_write(c) ?:
+			bch2_scan_old_btree_nodes(c, &stats);
+		if (ret)
+			goto err;
+		bch_info(c, "scanning for old btree nodes done");
+	}
+
+	if (c->journal_seq_blacklist_table &&
+	    c->journal_seq_blacklist_table->nr > 128)
+		queue_work(system_long_wq, &c->journal_seq_blacklist_gc_work);
+
+	ret = 0;
+out:
+	set_bit(BCH_FS_FSCK_DONE, &c->flags);
+	bch2_flush_fsck_errs(c);
+
+	if (!c->opts.keep_journal &&
+	    test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) {
+		bch2_journal_keys_free(&c->journal_keys);
+		bch2_journal_entries_free(c);
+	}
+	kfree(clean);
+
+	if (!ret && test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) {
+		bch2_fs_read_write_early(c);
+		bch2_delete_dead_snapshots_async(c);
+	}
+
+	if (ret)
+		bch_err_fn(c, ret);
+	return ret;
+err:
+fsck_err:
+	bch2_fs_emergency_read_only(c);
+	goto out;
+}
+
+int bch2_fs_initialize(struct bch_fs *c)
+{
+	struct bch_inode_unpacked root_inode, lostfound_inode;
+	struct bkey_inode_buf packed_inode;
+	struct qstr lostfound = QSTR("lost+found");
+	struct bch_dev *ca;
+	unsigned i;
+	int ret;
+
+ bch_notice(c, "initializing new filesystem"); + + mutex_lock(&c->sb_lock); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_extents_above_btree_updates_done); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_bformat_overflow_done); + + bch2_sb_maybe_downgrade(c); + + if (c->opts.version_upgrade != BCH_VERSION_UPGRADE_none) { + bch2_sb_upgrade(c, bcachefs_metadata_version_current); + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + + c->curr_recovery_pass = ARRAY_SIZE(recovery_passes); + set_bit(BCH_FS_MAY_GO_RW, &c->flags); + set_bit(BCH_FS_FSCK_DONE, &c->flags); + + for (i = 0; i < BTREE_ID_NR; i++) + bch2_btree_root_alloc(c, i); + + for_each_online_member(ca, c, i) + bch2_dev_usage_init(ca); + + for_each_online_member(ca, c, i) { + ret = bch2_dev_journal_alloc(ca); + if (ret) { + percpu_ref_put(&ca->io_ref); + goto err; + } + } + + /* + * journal_res_get() will crash if called before this has + * set up the journal.pin FIFO and journal.cur pointer: + */ + bch2_fs_journal_start(&c->journal, 1); + bch2_journal_set_replay_done(&c->journal); + + ret = bch2_fs_read_write_early(c); + if (ret) + goto err; + + /* + * Write out the superblock and journal buckets, now that we can do + * btree updates + */ + bch_verbose(c, "marking superblocks"); + for_each_member_device(ca, c, i) { + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + percpu_ref_put(&ca->ref); + goto err; + } + + ca->new_fs_bucket_idx = 0; + } + + ret = bch2_fs_freespace_init(c); + if (ret) + goto err; + + ret = bch2_initialize_subvolumes(c); + if (ret) + goto err; + + bch_verbose(c, "reading snapshots table"); + ret = bch2_snapshots_read(c); + if (ret) + goto err; + bch_verbose(c, "reading snapshots done"); + + bch2_inode_init(c, &root_inode, 0, 0, S_IFDIR|0755, 0, NULL); + root_inode.bi_inum = BCACHEFS_ROOT_INO; + root_inode.bi_subvol = BCACHEFS_ROOT_SUBVOL; + bch2_inode_pack(&packed_inode, &root_inode); + packed_inode.inode.k.p.snapshot = U32_MAX; + + ret = bch2_btree_insert(c, BTREE_ID_inodes, + &packed_inode.inode.k_i, + NULL, NULL, 0); + if (ret) { + bch_err_msg(c, ret, "creating root directory"); + goto err; + } + + bch2_inode_init_early(c, &lostfound_inode); + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_create_trans(&trans, + BCACHEFS_ROOT_SUBVOL_INUM, + &root_inode, &lostfound_inode, + &lostfound, + 0, 0, S_IFDIR|0700, 0, + NULL, NULL, (subvol_inum) { 0 }, 0)); + if (ret) { + bch_err_msg(c, ret, "creating lost+found"); + goto err; + } + + if (enabled_qtypes(c)) { + ret = bch2_fs_quota_read(c); + if (ret) + goto err; + } + + ret = bch2_journal_flush(&c->journal); + if (ret) { + bch_err_msg(c, ret, "writing first journal entry"); + goto err; + } + + mutex_lock(&c->sb_lock); + SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return 0; +err: + bch_err_fn(ca, ret); + return ret; +} diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h new file mode 100644 index 000000000..f8e796c0f --- /dev/null +++ b/fs/bcachefs/recovery.h @@ -0,0 +1,60 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_H +#define _BCACHEFS_RECOVERY_H + +struct journal_iter { + struct list_head list; + enum btree_id btree_id; + unsigned level; + size_t idx; + struct journal_keys *keys; +}; + +/* + * Iterate over keys in the btree, with keys from the journal overlaid on top: + */ + +struct btree_and_journal_iter { + 
struct btree *b; + struct btree_node_iter node_iter; + struct bkey unpacked; + + struct journal_iter journal; + struct bpos pos; + bool at_end; +}; + +struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *, enum btree_id, + unsigned, struct bpos, struct bpos, size_t *); +struct bkey_i *bch2_journal_keys_peek_slot(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +int bch2_journal_key_insert_take(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_insert(struct bch_fs *, enum btree_id, + unsigned, struct bkey_i *); +int bch2_journal_key_delete(struct bch_fs *, enum btree_id, + unsigned, struct bpos); +void bch2_journal_key_overwritten(struct bch_fs *, enum btree_id, + unsigned, struct bpos); + +void bch2_btree_and_journal_iter_advance(struct btree_and_journal_iter *); +struct bkey_s_c bch2_btree_and_journal_iter_peek(struct btree_and_journal_iter *); + +void bch2_btree_and_journal_iter_exit(struct btree_and_journal_iter *); +void __bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, struct btree *, + struct btree_node_iter, struct bpos); +void bch2_btree_and_journal_iter_init_node_iter(struct btree_and_journal_iter *, + struct bch_fs *, + struct btree *); + +void bch2_journal_keys_free(struct journal_keys *); +void bch2_journal_entries_free(struct bch_fs *); + +u64 bch2_fsck_recovery_passes(void); + +int bch2_fs_recovery(struct bch_fs *); +int bch2_fs_initialize(struct bch_fs *); + +#endif /* _BCACHEFS_RECOVERY_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c new file mode 100644 index 000000000..39f711d50 --- /dev/null +++ b/fs/bcachefs/reflink.c @@ -0,0 +1,399 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bkey_buf.h" +#include "btree_update.h" +#include "buckets.h" +#include "extents.h" +#include "inode.h" +#include "io.h" +#include "reflink.h" +#include "subvolume.h" + +#include + +static inline unsigned bkey_type_to_indirect(const struct bkey *k) +{ + switch (k->type) { + case KEY_TYPE_extent: + return KEY_TYPE_reflink_v; + case KEY_TYPE_inline_data: + return KEY_TYPE_indirect_inline_data; + default: + return 0; + } +} + +/* reflink pointers */ + +int bch2_reflink_p_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix && + le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) { + prt_printf(err, "idx < front_pad (%llu < %u)", + le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad)); + return -EINVAL; + } + + return 0; +} + +void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); + + prt_printf(out, "idx %llu front_pad %u back_pad %u", + le64_to_cpu(p.v->idx), + le32_to_cpu(p.v->front_pad), + le32_to_cpu(p.v->back_pad)); +} + +bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_p l = bkey_s_to_reflink_p(_l); + struct bkey_s_c_reflink_p r = bkey_s_c_to_reflink_p(_r); + + /* + * Disabled for now, the triggers code needs to be reworked for merging + * of reflink pointers to work: + */ + return false; + + if (le64_to_cpu(l.v->idx) + l.k->size != le64_to_cpu(r.v->idx)) + return false; + + bch2_key_resize(l.k, l.k->size + r.k->size); + return true; +} + +/* indirect extents */ + +int bch2_reflink_v_invalid(const struct bch_fs *c, struct 
bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + return bch2_bkey_ptrs_invalid(c, k, flags, err); +} + +void bch2_reflink_v_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(k); + + prt_printf(out, "refcount: %llu ", le64_to_cpu(r.v->refcount)); + + bch2_bkey_ptrs_to_text(out, c, k); +} + +bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r) +{ + struct bkey_s_reflink_v l = bkey_s_to_reflink_v(_l); + struct bkey_s_c_reflink_v r = bkey_s_c_to_reflink_v(_r); + + return l.v->refcount == r.v->refcount && bch2_extent_merge(c, _l, _r); +} + +int bch2_trans_mark_reflink_v(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + if (!(flags & BTREE_TRIGGER_OVERWRITE)) { + struct bkey_i_reflink_v *r = bkey_i_to_reflink_v(new); + + if (!r->v.refcount) { + r->k.type = KEY_TYPE_deleted; + r->k.size = 0; + set_bkey_val_u64s(&r->k, 0); + return 0; + } + } + + return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags); +} + +/* indirect inline data */ + +int bch2_indirect_inline_data_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + return 0; +} + +void bch2_indirect_inline_data_to_text(struct printbuf *out, + struct bch_fs *c, struct bkey_s_c k) +{ + struct bkey_s_c_indirect_inline_data d = bkey_s_c_to_indirect_inline_data(k); + unsigned datalen = bkey_inline_data_bytes(k.k); + + prt_printf(out, "refcount %llu datalen %u: %*phN", + le64_to_cpu(d.v->refcount), datalen, + min(datalen, 32U), d.v->data); +} + +int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans, + enum btree_id btree_id, unsigned level, + struct bkey_s_c old, struct bkey_i *new, + unsigned flags) +{ + if (!(flags & BTREE_TRIGGER_OVERWRITE)) { + struct bkey_i_indirect_inline_data *r = + bkey_i_to_indirect_inline_data(new); + + if (!r->v.refcount) { + r->k.type = KEY_TYPE_deleted; + r->k.size = 0; + set_bkey_val_u64s(&r->k, 0); + } + } + + return 0; +} + +static int bch2_make_extent_indirect(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct bkey_i *orig) +{ + struct bch_fs *c = trans->c; + struct btree_iter reflink_iter = { NULL }; + struct bkey_s_c k; + struct bkey_i *r_v; + struct bkey_i_reflink_p *r_p; + __le64 *refcount; + int ret; + + if (orig->k.type == KEY_TYPE_inline_data) + bch2_check_set_feature(c, BCH_FEATURE_reflink_inline_data); + + bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek_prev(&reflink_iter); + ret = bkey_err(k); + if (ret) + goto err; + + r_v = bch2_trans_kmalloc(trans, sizeof(__le64) + bkey_bytes(&orig->k)); + ret = PTR_ERR_OR_ZERO(r_v); + if (ret) + goto err; + + bkey_init(&r_v->k); + r_v->k.type = bkey_type_to_indirect(&orig->k); + r_v->k.p = reflink_iter.pos; + bch2_key_resize(&r_v->k, orig->k.size); + r_v->k.version = orig->k.version; + + set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k)); + + refcount = bkey_refcount(r_v); + *refcount = 0; + memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k)); + + ret = bch2_trans_update(trans, &reflink_iter, r_v, 0); + if (ret) + goto err; + + /* + * orig is in a bkey_buf which statically allocates 5 64s for the val, + * so we know it will be big enough: + */ + orig->k.type = KEY_TYPE_reflink_p; + r_p = bkey_i_to_reflink_p(orig); + set_bkey_val_bytes(&r_p->k, 
sizeof(r_p->v)); + + /* FORTIFY_SOURCE is broken here, and doesn't provide unsafe_memset() */ +#if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + __underlying_memset(&r_p->v, 0, sizeof(r_p->v)); +#else + memset(&r_p->v, 0, sizeof(r_p->v)); +#endif + + r_p->v.idx = cpu_to_le64(bkey_start_offset(&r_v->k)); + + ret = bch2_trans_update(trans, extent_iter, &r_p->k_i, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); +err: + bch2_trans_iter_exit(trans, &reflink_iter); + + return ret; +} + +static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) +{ + struct bkey_s_c k; + int ret; + + for_each_btree_key_upto_continue_norestart(*iter, end, 0, k, ret) { + if (bkey_extent_is_unwritten(k)) + continue; + + if (bkey_extent_is_data(k.k)) + return k; + } + + if (bkey_ge(iter->pos, end)) + bch2_btree_iter_set_pos(iter, end); + return ret ? bkey_s_c_err(ret) : bkey_s_c_null; +} + +s64 bch2_remap_range(struct bch_fs *c, + subvol_inum dst_inum, u64 dst_offset, + subvol_inum src_inum, u64 src_offset, + u64 remap_sectors, + u64 new_i_size, s64 *i_sectors_delta) +{ + struct btree_trans trans; + struct btree_iter dst_iter, src_iter; + struct bkey_s_c src_k; + struct bkey_buf new_dst, new_src; + struct bpos dst_start = POS(dst_inum.inum, dst_offset); + struct bpos src_start = POS(src_inum.inum, src_offset); + struct bpos dst_end = dst_start, src_end = src_start; + struct bpos src_want; + u64 dst_done; + u32 dst_snapshot, src_snapshot; + int ret = 0, ret2 = 0; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) + return -BCH_ERR_erofs_no_writes; + + bch2_check_set_feature(c, BCH_FEATURE_reflink); + + dst_end.offset += remap_sectors; + src_end.offset += remap_sectors; + + bch2_bkey_buf_init(&new_dst); + bch2_bkey_buf_init(&new_src); + bch2_trans_init(&trans, c, BTREE_ITER_MAX, 4096); + + bch2_trans_iter_init(&trans, &src_iter, BTREE_ID_extents, src_start, + BTREE_ITER_INTENT); + bch2_trans_iter_init(&trans, &dst_iter, BTREE_ID_extents, dst_start, + BTREE_ITER_INTENT); + + while ((ret == 0 || + bch2_err_matches(ret, BCH_ERR_transaction_restart)) && + bkey_lt(dst_iter.pos, dst_end)) { + struct disk_reservation disk_res = { 0 }; + + bch2_trans_begin(&trans); + + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + ret = bch2_subvolume_get_snapshot(&trans, src_inum.subvol, + &src_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + + ret = bch2_subvolume_get_snapshot(&trans, dst_inum.subvol, + &dst_snapshot); + if (ret) + continue; + + bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + + dst_done = dst_iter.pos.offset - dst_start.offset; + src_want = POS(src_start.inode, src_start.offset + dst_done); + bch2_btree_iter_set_pos(&src_iter, src_want); + + src_k = get_next_src(&src_iter, src_end); + ret = bkey_err(src_k); + if (ret) + continue; + + if (bkey_lt(src_want, src_iter.pos)) { + ret = bch2_fpunch_at(&trans, &dst_iter, dst_inum, + min(dst_end.offset, + dst_iter.pos.offset + + src_iter.pos.offset - src_want.offset), + i_sectors_delta); + continue; + } + + if (src_k.k->type != KEY_TYPE_reflink_p) { + bch2_btree_iter_set_pos_to_extent_start(&src_iter); + + bch2_bkey_buf_reassemble(&new_src, c, src_k); + src_k = bkey_i_to_s_c(new_src.k); + + ret = bch2_make_extent_indirect(&trans, &src_iter, + new_src.k); + if (ret) + continue; + + BUG_ON(src_k.k->type != KEY_TYPE_reflink_p); + } + + if (src_k.k->type == KEY_TYPE_reflink_p) { + struct bkey_s_c_reflink_p src_p = + bkey_s_c_to_reflink_p(src_k); + struct 
bkey_i_reflink_p *dst_p = + bkey_reflink_p_init(new_dst.k); + + u64 offset = le64_to_cpu(src_p.v->idx) + + (src_want.offset - + bkey_start_offset(src_k.k)); + + dst_p->v.idx = cpu_to_le64(offset); + } else { + BUG(); + } + + new_dst.k->k.p = dst_iter.pos; + bch2_key_resize(&new_dst.k->k, + min(src_k.k->p.offset - src_want.offset, + dst_end.offset - dst_iter.pos.offset)); + + ret = bch2_extent_update(&trans, dst_inum, &dst_iter, + new_dst.k, &disk_res, + new_i_size, i_sectors_delta, + true); + bch2_disk_reservation_put(c, &disk_res); + } + bch2_trans_iter_exit(&trans, &dst_iter); + bch2_trans_iter_exit(&trans, &src_iter); + + BUG_ON(!ret && !bkey_eq(dst_iter.pos, dst_end)); + BUG_ON(bkey_gt(dst_iter.pos, dst_end)); + + dst_done = dst_iter.pos.offset - dst_start.offset; + new_i_size = min(dst_iter.pos.offset << 9, new_i_size); + + do { + struct bch_inode_unpacked inode_u; + struct btree_iter inode_iter = { NULL }; + + bch2_trans_begin(&trans); + + ret2 = bch2_inode_peek(&trans, &inode_iter, &inode_u, + dst_inum, BTREE_ITER_INTENT); + + if (!ret2 && + inode_u.bi_size < new_i_size) { + inode_u.bi_size = new_i_size; + ret2 = bch2_inode_write(&trans, &inode_iter, &inode_u) ?: + bch2_trans_commit(&trans, NULL, NULL, + BTREE_INSERT_NOFAIL); + } + + bch2_trans_iter_exit(&trans, &inode_iter); + } while (bch2_err_matches(ret2, BCH_ERR_transaction_restart)); + + bch2_trans_exit(&trans); + bch2_bkey_buf_exit(&new_src, c); + bch2_bkey_buf_exit(&new_dst, c); + + bch2_write_ref_put(c, BCH_WRITE_REF_reflink); + + return dst_done ?: ret ?: ret2; +} diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h new file mode 100644 index 000000000..fe52538ef --- /dev/null +++ b/fs/bcachefs/reflink.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REFLINK_H +#define _BCACHEFS_REFLINK_H + +enum bkey_invalid_flags; + +int bch2_reflink_p_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c); + +#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \ + .key_invalid = bch2_reflink_p_invalid, \ + .val_to_text = bch2_reflink_p_to_text, \ + .key_merge = bch2_reflink_p_merge, \ + .trans_trigger = bch2_trans_mark_reflink_p, \ + .atomic_trigger = bch2_mark_reflink_p, \ + .min_val_size = 16, \ +}) + +int bch2_reflink_v_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *, + struct bkey_s_c); +int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, unsigned); + +#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \ + .key_invalid = bch2_reflink_v_invalid, \ + .val_to_text = bch2_reflink_v_to_text, \ + .swab = bch2_ptr_swab, \ + .trans_trigger = bch2_trans_mark_reflink_v, \ + .atomic_trigger = bch2_mark_extent, \ + .min_val_size = 8, \ +}) + +int bch2_indirect_inline_data_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_indirect_inline_data_to_text(struct printbuf *, + struct bch_fs *, struct bkey_s_c); +int bch2_trans_mark_indirect_inline_data(struct btree_trans *, + enum btree_id, unsigned, + struct bkey_s_c, struct bkey_i *, + unsigned); + +#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \ + .key_invalid = bch2_indirect_inline_data_invalid, \ + .val_to_text = 
bch2_indirect_inline_data_to_text, \ + .trans_trigger = bch2_trans_mark_indirect_inline_data, \ + .min_val_size = 8, \ +}) + +static inline const __le64 *bkey_refcount_c(struct bkey_s_c k) +{ + switch (k.k->type) { + case KEY_TYPE_reflink_v: + return &bkey_s_c_to_reflink_v(k).v->refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_s_c_to_indirect_inline_data(k).v->refcount; + default: + return NULL; + } +} + +static inline __le64 *bkey_refcount(struct bkey_i *k) +{ + switch (k->k.type) { + case KEY_TYPE_reflink_v: + return &bkey_i_to_reflink_v(k)->v.refcount; + case KEY_TYPE_indirect_inline_data: + return &bkey_i_to_indirect_inline_data(k)->v.refcount; + default: + return NULL; + } +} + +s64 bch2_remap_range(struct bch_fs *, subvol_inum, u64, + subvol_inum, u64, u64, u64, s64 *); + +#endif /* _BCACHEFS_REFLINK_H */ diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c new file mode 100644 index 000000000..5b591c59b --- /dev/null +++ b/fs/bcachefs/replicas.c @@ -0,0 +1,1059 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "buckets.h" +#include "journal.h" +#include "replicas.h" +#include "super-io.h" + +static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *, + struct bch_replicas_cpu *); + +/* Replicas tracking - in memory: */ + +static void verify_replicas_entry(struct bch_replicas_entry *e) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned i; + + BUG_ON(e->data_type >= BCH_DATA_NR); + BUG_ON(!e->nr_devs); + BUG_ON(e->nr_required > 1 && + e->nr_required >= e->nr_devs); + + for (i = 0; i + 1 < e->nr_devs; i++) + BUG_ON(e->devs[i] >= e->devs[i + 1]); +#endif +} + +void bch2_replicas_entry_sort(struct bch_replicas_entry *e) +{ + bubble_sort(e->devs, e->nr_devs, u8_cmp); +} + +static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r) +{ + eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL); +} + +static void bch2_replicas_entry_v0_to_text(struct printbuf *out, + struct bch_replicas_entry_v0 *e) +{ + unsigned i; + + if (e->data_type < BCH_DATA_NR) + prt_printf(out, "%s", bch2_data_types[e->data_type]); + else + prt_printf(out, "(invalid data type %u)", e->data_type); + + prt_printf(out, ": %u [", e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + prt_printf(out, i ? " %u" : "%u", e->devs[i]); + prt_printf(out, "]"); +} + +void bch2_replicas_entry_to_text(struct printbuf *out, + struct bch_replicas_entry *e) +{ + unsigned i; + + if (e->data_type < BCH_DATA_NR) + prt_printf(out, "%s", bch2_data_types[e->data_type]); + else + prt_printf(out, "(invalid data type %u)", e->data_type); + + prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs); + for (i = 0; i < e->nr_devs; i++) + prt_printf(out, i ? 
" %u" : "%u", e->devs[i]); + prt_printf(out, "]"); +} + +void bch2_cpu_replicas_to_text(struct printbuf *out, + struct bch_replicas_cpu *r) +{ + struct bch_replicas_entry *e; + bool first = true; + + for_each_cpu_replicas_entry(r, e) { + if (!first) + prt_printf(out, " "); + first = false; + + bch2_replicas_entry_to_text(out, e); + } +} + +static void extent_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + r->nr_required = 1; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + if (p.ptr.cached) + continue; + + if (!p.has_ec) + r->devs[r->nr_devs++] = p.ptr.dev; + else + r->nr_required = 0; + } +} + +static void stripe_to_replicas(struct bkey_s_c k, + struct bch_replicas_entry *r) +{ + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + const struct bch_extent_ptr *ptr; + + r->nr_required = s.v->nr_blocks - s.v->nr_redundant; + + for (ptr = s.v->ptrs; + ptr < s.v->ptrs + s.v->nr_blocks; + ptr++) + r->devs[r->nr_devs++] = ptr->dev; +} + +void bch2_bkey_to_replicas(struct bch_replicas_entry *e, + struct bkey_s_c k) +{ + e->nr_devs = 0; + + switch (k.k->type) { + case KEY_TYPE_btree_ptr: + case KEY_TYPE_btree_ptr_v2: + e->data_type = BCH_DATA_btree; + extent_to_replicas(k, e); + break; + case KEY_TYPE_extent: + case KEY_TYPE_reflink_v: + e->data_type = BCH_DATA_user; + extent_to_replicas(k, e); + break; + case KEY_TYPE_stripe: + e->data_type = BCH_DATA_parity; + stripe_to_replicas(k, e); + break; + } + + bch2_replicas_entry_sort(e); +} + +void bch2_devlist_to_replicas(struct bch_replicas_entry *e, + enum bch_data_type data_type, + struct bch_devs_list devs) +{ + unsigned i; + + BUG_ON(!data_type || + data_type == BCH_DATA_sb || + data_type >= BCH_DATA_NR); + + e->data_type = data_type; + e->nr_devs = 0; + e->nr_required = 1; + + for (i = 0; i < devs.nr; i++) + e->devs[e->nr_devs++] = devs.devs[i]; + + bch2_replicas_entry_sort(e); +} + +static struct bch_replicas_cpu +cpu_replicas_add_entry(struct bch_replicas_cpu *old, + struct bch_replicas_entry *new_entry) +{ + unsigned i; + struct bch_replicas_cpu new = { + .nr = old->nr + 1, + .entry_size = max_t(unsigned, old->entry_size, + replicas_entry_bytes(new_entry)), + }; + + BUG_ON(!new_entry->data_type); + verify_replicas_entry(new_entry); + + new.entries = kcalloc(new.nr, new.entry_size, GFP_KERNEL); + if (!new.entries) + return new; + + for (i = 0; i < old->nr; i++) + memcpy(cpu_replicas_entry(&new, i), + cpu_replicas_entry(old, i), + old->entry_size); + + memcpy(cpu_replicas_entry(&new, old->nr), + new_entry, + replicas_entry_bytes(new_entry)); + + bch2_cpu_replicas_sort(&new); + return new; +} + +static inline int __replicas_entry_idx(struct bch_replicas_cpu *r, + struct bch_replicas_entry *search) +{ + int idx, entry_size = replicas_entry_bytes(search); + + if (unlikely(entry_size > r->entry_size)) + return -1; + + verify_replicas_entry(search); + +#define entry_cmp(_l, _r, size) memcmp(_l, _r, entry_size) + idx = eytzinger0_find(r->entries, r->nr, r->entry_size, + entry_cmp, search); +#undef entry_cmp + + return idx < r->nr ? 
idx : -1; +} + +int bch2_replicas_entry_idx(struct bch_fs *c, + struct bch_replicas_entry *search) +{ + bch2_replicas_entry_sort(search); + + return __replicas_entry_idx(&c->replicas, search); +} + +static bool __replicas_has_entry(struct bch_replicas_cpu *r, + struct bch_replicas_entry *search) +{ + return __replicas_entry_idx(r, search) >= 0; +} + +bool bch2_replicas_marked(struct bch_fs *c, + struct bch_replicas_entry *search) +{ + bool marked; + + if (!search->nr_devs) + return true; + + verify_replicas_entry(search); + + percpu_down_read(&c->mark_lock); + marked = __replicas_has_entry(&c->replicas, search) && + (likely((!c->replicas_gc.entries)) || + __replicas_has_entry(&c->replicas_gc, search)); + percpu_up_read(&c->mark_lock); + + return marked; +} + +static void __replicas_table_update(struct bch_fs_usage *dst, + struct bch_replicas_cpu *dst_r, + struct bch_fs_usage *src, + struct bch_replicas_cpu *src_r) +{ + int src_idx, dst_idx; + + *dst = *src; + + for (src_idx = 0; src_idx < src_r->nr; src_idx++) { + if (!src->replicas[src_idx]) + continue; + + dst_idx = __replicas_entry_idx(dst_r, + cpu_replicas_entry(src_r, src_idx)); + BUG_ON(dst_idx < 0); + + dst->replicas[dst_idx] = src->replicas[src_idx]; + } +} + +static void __replicas_table_update_pcpu(struct bch_fs_usage __percpu *dst_p, + struct bch_replicas_cpu *dst_r, + struct bch_fs_usage __percpu *src_p, + struct bch_replicas_cpu *src_r) +{ + unsigned src_nr = sizeof(struct bch_fs_usage) / sizeof(u64) + src_r->nr; + struct bch_fs_usage *dst, *src = (void *) + bch2_acc_percpu_u64s((u64 __percpu *) src_p, src_nr); + + preempt_disable(); + dst = this_cpu_ptr(dst_p); + preempt_enable(); + + __replicas_table_update(dst, dst_r, src, src_r); +} + +/* + * Resize filesystem accounting: + */ +static int replicas_table_update(struct bch_fs *c, + struct bch_replicas_cpu *new_r) +{ + struct bch_fs_usage __percpu *new_usage[JOURNAL_BUF_NR]; + struct bch_fs_usage_online *new_scratch = NULL; + struct bch_fs_usage __percpu *new_gc = NULL; + struct bch_fs_usage *new_base = NULL; + unsigned i, bytes = sizeof(struct bch_fs_usage) + + sizeof(u64) * new_r->nr; + unsigned scratch_bytes = sizeof(struct bch_fs_usage_online) + + sizeof(u64) * new_r->nr; + int ret = 0; + + memset(new_usage, 0, sizeof(new_usage)); + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (!(new_usage[i] = __alloc_percpu_gfp(bytes, + sizeof(u64), GFP_KERNEL))) + goto err; + + if (!(new_base = kzalloc(bytes, GFP_KERNEL)) || + !(new_scratch = kmalloc(scratch_bytes, GFP_KERNEL)) || + (c->usage_gc && + !(new_gc = __alloc_percpu_gfp(bytes, sizeof(u64), GFP_KERNEL)))) + goto err; + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + if (c->usage[i]) + __replicas_table_update_pcpu(new_usage[i], new_r, + c->usage[i], &c->replicas); + if (c->usage_base) + __replicas_table_update(new_base, new_r, + c->usage_base, &c->replicas); + if (c->usage_gc) + __replicas_table_update_pcpu(new_gc, new_r, + c->usage_gc, &c->replicas); + + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + swap(c->usage[i], new_usage[i]); + swap(c->usage_base, new_base); + swap(c->usage_scratch, new_scratch); + swap(c->usage_gc, new_gc); + swap(c->replicas, *new_r); +out: + free_percpu(new_gc); + kfree(new_scratch); + for (i = 0; i < ARRAY_SIZE(new_usage); i++) + free_percpu(new_usage[i]); + kfree(new_base); + return ret; +err: + bch_err(c, "error updating replicas table: memory allocation failure"); + ret = -BCH_ERR_ENOMEM_replicas_table; + goto out; +} + +static unsigned reserve_journal_replicas(struct bch_fs *c, + struct 
bch_replicas_cpu *r) +{ + struct bch_replicas_entry *e; + unsigned journal_res_u64s = 0; + + /* nr_inodes: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); + + /* key_version: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)); + + /* persistent_reserved: */ + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_usage), sizeof(u64)) * + BCH_REPLICAS_MAX; + + for_each_cpu_replicas_entry(r, e) + journal_res_u64s += + DIV_ROUND_UP(sizeof(struct jset_entry_data_usage) + + e->nr_devs, sizeof(u64)); + return journal_res_u64s; +} + +noinline +static int bch2_mark_replicas_slowpath(struct bch_fs *c, + struct bch_replicas_entry *new_entry) +{ + struct bch_replicas_cpu new_r, new_gc; + int ret = 0; + + verify_replicas_entry(new_entry); + + memset(&new_r, 0, sizeof(new_r)); + memset(&new_gc, 0, sizeof(new_gc)); + + mutex_lock(&c->sb_lock); + + if (c->replicas_gc.entries && + !__replicas_has_entry(&c->replicas_gc, new_entry)) { + new_gc = cpu_replicas_add_entry(&c->replicas_gc, new_entry); + if (!new_gc.entries) { + ret = -BCH_ERR_ENOMEM_cpu_replicas; + goto err; + } + } + + if (!__replicas_has_entry(&c->replicas, new_entry)) { + new_r = cpu_replicas_add_entry(&c->replicas, new_entry); + if (!new_r.entries) { + ret = -BCH_ERR_ENOMEM_cpu_replicas; + goto err; + } + + ret = bch2_cpu_replicas_to_sb_replicas(c, &new_r); + if (ret) + goto err; + + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &new_r)); + } + + if (!new_r.entries && + !new_gc.entries) + goto out; + + /* allocations done, now commit: */ + + if (new_r.entries) + bch2_write_super(c); + + /* don't update in memory replicas until changes are persistent */ + percpu_down_write(&c->mark_lock); + if (new_r.entries) + ret = replicas_table_update(c, &new_r); + if (new_gc.entries) + swap(new_gc, c->replicas_gc); + percpu_up_write(&c->mark_lock); +out: + mutex_unlock(&c->sb_lock); + + kfree(new_r.entries); + kfree(new_gc.entries); + + return ret; +err: + bch_err(c, "error adding replicas entry: %s", bch2_err_str(ret)); + goto out; +} + +int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r) +{ + return likely(bch2_replicas_marked(c, r)) + ? 
0 : bch2_mark_replicas_slowpath(c, r); +} + +/* replicas delta list: */ + +int bch2_replicas_delta_list_mark(struct bch_fs *c, + struct replicas_delta_list *r) +{ + struct replicas_delta *d = r->d; + struct replicas_delta *top = (void *) r->d + r->used; + int ret = 0; + + for (d = r->d; !ret && d != top; d = replicas_delta_next(d)) + ret = bch2_mark_replicas(c, &d->r); + return ret; +} + +/* + * Old replicas_gc mechanism: only used for journal replicas entries now, should + * die at some point: + */ + +int bch2_replicas_gc_end(struct bch_fs *c, int ret) +{ + lockdep_assert_held(&c->replicas_gc_lock); + + if (ret) + goto err; + + mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); + + ret = bch2_cpu_replicas_to_sb_replicas(c, &c->replicas_gc); + if (ret) + goto err; + + ret = replicas_table_update(c, &c->replicas_gc); +err: + kfree(c->replicas_gc.entries); + c->replicas_gc.entries = NULL; + + percpu_up_write(&c->mark_lock); + + if (!ret) + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) +{ + struct bch_replicas_entry *e; + unsigned i = 0; + + lockdep_assert_held(&c->replicas_gc_lock); + + mutex_lock(&c->sb_lock); + BUG_ON(c->replicas_gc.entries); + + c->replicas_gc.nr = 0; + c->replicas_gc.entry_size = 0; + + for_each_cpu_replicas_entry(&c->replicas, e) + if (!((1 << e->data_type) & typemask)) { + c->replicas_gc.nr++; + c->replicas_gc.entry_size = + max_t(unsigned, c->replicas_gc.entry_size, + replicas_entry_bytes(e)); + } + + c->replicas_gc.entries = kcalloc(c->replicas_gc.nr, + c->replicas_gc.entry_size, + GFP_KERNEL); + if (!c->replicas_gc.entries) { + mutex_unlock(&c->sb_lock); + bch_err(c, "error allocating c->replicas_gc"); + return -BCH_ERR_ENOMEM_replicas_gc; + } + + for_each_cpu_replicas_entry(&c->replicas, e) + if (!((1 << e->data_type) & typemask)) + memcpy(cpu_replicas_entry(&c->replicas_gc, i++), + e, c->replicas_gc.entry_size); + + bch2_cpu_replicas_sort(&c->replicas_gc); + mutex_unlock(&c->sb_lock); + + return 0; +} + +/* + * New much simpler mechanism for clearing out unneeded replicas entries - drop + * replicas entries that have 0 sectors used. + * + * However, we don't track sector counts for journal usage, so this doesn't drop + * any BCH_DATA_journal entries; the old bch2_replicas_gc_(start|end) mechanism + * is retained for that. 
+ */ +int bch2_replicas_gc2(struct bch_fs *c) +{ + struct bch_replicas_cpu new = { 0 }; + unsigned i, nr; + int ret = 0; + + bch2_journal_meta(&c->journal); +retry: + nr = READ_ONCE(c->replicas.nr); + new.entry_size = READ_ONCE(c->replicas.entry_size); + new.entries = kcalloc(nr, new.entry_size, GFP_KERNEL); + if (!new.entries) { + bch_err(c, "error allocating c->replicas_gc"); + return -BCH_ERR_ENOMEM_replicas_gc; + } + + mutex_lock(&c->sb_lock); + percpu_down_write(&c->mark_lock); + + if (nr != c->replicas.nr || + new.entry_size != c->replicas.entry_size) { + percpu_up_write(&c->mark_lock); + mutex_unlock(&c->sb_lock); + kfree(new.entries); + goto retry; + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + + if (e->data_type == BCH_DATA_journal || + c->usage_base->replicas[i] || + percpu_u64_get(&c->usage[0]->replicas[i]) || + percpu_u64_get(&c->usage[1]->replicas[i]) || + percpu_u64_get(&c->usage[2]->replicas[i]) || + percpu_u64_get(&c->usage[3]->replicas[i])) + memcpy(cpu_replicas_entry(&new, new.nr++), + e, new.entry_size); + } + + bch2_cpu_replicas_sort(&new); + + ret = bch2_cpu_replicas_to_sb_replicas(c, &new); + if (ret) + goto err; + + ret = replicas_table_update(c, &new); +err: + kfree(new.entries); + + percpu_up_write(&c->mark_lock); + + if (!ret) + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); + + return ret; +} + +int bch2_replicas_set_usage(struct bch_fs *c, + struct bch_replicas_entry *r, + u64 sectors) +{ + int ret, idx = bch2_replicas_entry_idx(c, r); + + if (idx < 0) { + struct bch_replicas_cpu n; + + n = cpu_replicas_add_entry(&c->replicas, r); + if (!n.entries) + return -BCH_ERR_ENOMEM_cpu_replicas; + + ret = replicas_table_update(c, &n); + if (ret) + return ret; + + kfree(n.entries); + + idx = bch2_replicas_entry_idx(c, r); + BUG_ON(ret < 0); + } + + c->usage_base->replicas[idx] = sectors; + + return 0; +} + +/* Replicas tracking - superblock: */ + +static int +__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r, + struct bch_replicas_cpu *cpu_r) +{ + struct bch_replicas_entry *e, *dst; + unsigned nr = 0, entry_size = 0, idx = 0; + + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } + + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); + if (!cpu_r->entries) + return -BCH_ERR_ENOMEM_cpu_replicas; + + cpu_r->nr = nr; + cpu_r->entry_size = entry_size; + + for_each_replicas_entry(sb_r, e) { + dst = cpu_replicas_entry(cpu_r, idx++); + memcpy(dst, e, replicas_entry_bytes(e)); + bch2_replicas_entry_sort(dst); + } + + return 0; +} + +static int +__bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r, + struct bch_replicas_cpu *cpu_r) +{ + struct bch_replicas_entry_v0 *e; + unsigned nr = 0, entry_size = 0, idx = 0; + + for_each_replicas_entry(sb_r, e) { + entry_size = max_t(unsigned, entry_size, + replicas_entry_bytes(e)); + nr++; + } + + entry_size += sizeof(struct bch_replicas_entry) - + sizeof(struct bch_replicas_entry_v0); + + cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL); + if (!cpu_r->entries) + return -BCH_ERR_ENOMEM_cpu_replicas; + + cpu_r->nr = nr; + cpu_r->entry_size = entry_size; + + for_each_replicas_entry(sb_r, e) { + struct bch_replicas_entry *dst = + cpu_replicas_entry(cpu_r, idx++); + + dst->data_type = e->data_type; + dst->nr_devs = e->nr_devs; + dst->nr_required = 1; + memcpy(dst->devs, e->devs, e->nr_devs); + bch2_replicas_entry_sort(dst); + } + + return 0; +} + +int 
bch2_sb_replicas_to_cpu_replicas(struct bch_fs *c) +{ + struct bch_sb_field_replicas *sb_v1; + struct bch_sb_field_replicas_v0 *sb_v0; + struct bch_replicas_cpu new_r = { 0, 0, NULL }; + int ret = 0; + + if ((sb_v1 = bch2_sb_get_replicas(c->disk_sb.sb))) + ret = __bch2_sb_replicas_to_cpu_replicas(sb_v1, &new_r); + else if ((sb_v0 = bch2_sb_get_replicas_v0(c->disk_sb.sb))) + ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_v0, &new_r); + if (ret) + return ret; + + bch2_cpu_replicas_sort(&new_r); + + percpu_down_write(&c->mark_lock); + + ret = replicas_table_update(c, &new_r); + percpu_up_write(&c->mark_lock); + + kfree(new_r.entries); + + return 0; +} + +static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_sb_field_replicas_v0 *sb_r; + struct bch_replicas_entry_v0 *dst; + struct bch_replicas_entry *src; + size_t bytes; + + bytes = sizeof(struct bch_sb_field_replicas); + + for_each_cpu_replicas_entry(r, src) + bytes += replicas_entry_bytes(src) - 1; + + sb_r = bch2_sb_resize_replicas_v0(&c->disk_sb, + DIV_ROUND_UP(bytes, sizeof(u64))); + if (!sb_r) + return -BCH_ERR_ENOSPC_sb_replicas; + + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); + sb_r = bch2_sb_get_replicas_v0(c->disk_sb.sb); + + memset(&sb_r->entries, 0, + vstruct_end(&sb_r->field) - + (void *) &sb_r->entries); + + dst = sb_r->entries; + for_each_cpu_replicas_entry(r, src) { + dst->data_type = src->data_type; + dst->nr_devs = src->nr_devs; + memcpy(dst->devs, src->devs, src->nr_devs); + + dst = replicas_entry_next(dst); + + BUG_ON((void *) dst > vstruct_end(&sb_r->field)); + } + + return 0; +} + +static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, + struct bch_replicas_cpu *r) +{ + struct bch_sb_field_replicas *sb_r; + struct bch_replicas_entry *dst, *src; + bool need_v1 = false; + size_t bytes; + + bytes = sizeof(struct bch_sb_field_replicas); + + for_each_cpu_replicas_entry(r, src) { + bytes += replicas_entry_bytes(src); + if (src->nr_required != 1) + need_v1 = true; + } + + if (!need_v1) + return bch2_cpu_replicas_to_sb_replicas_v0(c, r); + + sb_r = bch2_sb_resize_replicas(&c->disk_sb, + DIV_ROUND_UP(bytes, sizeof(u64))); + if (!sb_r) + return -BCH_ERR_ENOSPC_sb_replicas; + + bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); + sb_r = bch2_sb_get_replicas(c->disk_sb.sb); + + memset(&sb_r->entries, 0, + vstruct_end(&sb_r->field) - + (void *) &sb_r->entries); + + dst = sb_r->entries; + for_each_cpu_replicas_entry(r, src) { + memcpy(dst, src, replicas_entry_bytes(src)); + + dst = replicas_entry_next(dst); + + BUG_ON((void *) dst > vstruct_end(&sb_r->field)); + } + + return 0; +} + +static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r, + struct bch_sb *sb, + struct printbuf *err) +{ + struct bch_sb_field_members *mi = bch2_sb_get_members(sb); + unsigned i, j; + + sort_cmp_size(cpu_r->entries, + cpu_r->nr, + cpu_r->entry_size, + memcmp, NULL); + + for (i = 0; i < cpu_r->nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(cpu_r, i); + + if (e->data_type >= BCH_DATA_NR) { + prt_printf(err, "invalid data type in entry "); + bch2_replicas_entry_to_text(err, e); + return -BCH_ERR_invalid_sb_replicas; + } + + if (!e->nr_devs) { + prt_printf(err, "no devices in entry "); + bch2_replicas_entry_to_text(err, e); + return -BCH_ERR_invalid_sb_replicas; + } + + if (e->nr_required > 1 && + e->nr_required >= e->nr_devs) { + prt_printf(err, "bad nr_required in entry "); + bch2_replicas_entry_to_text(err, e); + return 
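/*
 * Illustrative sketch (not part of the patch): how variable-length replicas
 * entries are walked.  Each entry is a small header followed by nr_devs
 * one-byte device indices, packed back to back, so iteration advances by the
 * per-entry byte count — the same idea as replicas_entry_bytes() and
 * replicas_entry_next() used above.  The struct layout here is simplified
 * (no nr_required field) and the buffer contents are invented.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct entry {
	uint8_t data_type;
	uint8_t nr_devs;
	uint8_t devs[];		/* nr_devs bytes follow the header */
};

static size_t entry_bytes(const struct entry *e)
{
	return sizeof(*e) + e->nr_devs;
}

int main(void)
{
	/* two packed entries: type 1 on devs {0,2}, type 2 on dev {1} */
	uint8_t buf[] = { 1, 2, 0, 2,   2, 1, 1 };
	const void *p = buf, *end = buf + sizeof(buf);

	while (p < end) {
		const struct entry *e = p;

		printf("type %u on %u device(s)\n",
		       (unsigned) e->data_type, (unsigned) e->nr_devs);
		p = (const char *) p + entry_bytes(e);
	}
	return 0;
}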
-BCH_ERR_invalid_sb_replicas; + } + + for (j = 0; j < e->nr_devs; j++) + if (!bch2_dev_exists(sb, mi, e->devs[j])) { + prt_printf(err, "invalid device %u in entry ", e->devs[j]); + bch2_replicas_entry_to_text(err, e); + return -BCH_ERR_invalid_sb_replicas; + } + + if (i + 1 < cpu_r->nr) { + struct bch_replicas_entry *n = + cpu_replicas_entry(cpu_r, i + 1); + + BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0); + + if (!memcmp(e, n, cpu_r->entry_size)) { + prt_printf(err, "duplicate replicas entry "); + bch2_replicas_entry_to_text(err, e); + return -BCH_ERR_invalid_sb_replicas; + } + } + } + + return 0; +} + +static int bch2_sb_replicas_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_replicas *sb_r = field_to_type(f, replicas); + struct bch_replicas_cpu cpu_r; + int ret; + + ret = __bch2_sb_replicas_to_cpu_replicas(sb_r, &cpu_r); + if (ret) + return ret; + + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); + return ret; +} + +static void bch2_sb_replicas_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_replicas *r = field_to_type(f, replicas); + struct bch_replicas_entry *e; + bool first = true; + + for_each_replicas_entry(r, e) { + if (!first) + prt_printf(out, " "); + first = false; + + bch2_replicas_entry_to_text(out, e); + } + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas = { + .validate = bch2_sb_replicas_validate, + .to_text = bch2_sb_replicas_to_text, +}; + +static int bch2_sb_replicas_v0_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); + struct bch_replicas_cpu cpu_r; + int ret; + + ret = __bch2_sb_replicas_v0_to_cpu_replicas(sb_r, &cpu_r); + if (ret) + return ret; + + ret = bch2_cpu_replicas_validate(&cpu_r, sb, err); + kfree(cpu_r.entries); + return ret; +} + +static void bch2_sb_replicas_v0_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_replicas_v0 *sb_r = field_to_type(f, replicas_v0); + struct bch_replicas_entry_v0 *e; + bool first = true; + + for_each_replicas_entry(sb_r, e) { + if (!first) + prt_printf(out, " "); + first = false; + + bch2_replicas_entry_v0_to_text(out, e); + } + prt_newline(out); +} + +const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = { + .validate = bch2_sb_replicas_v0_validate, + .to_text = bch2_sb_replicas_v0_to_text, +}; + +/* Query replicas: */ + +bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, + unsigned flags, bool print) +{ + struct bch_replicas_entry *e; + bool ret = true; + + percpu_down_read(&c->mark_lock); + for_each_cpu_replicas_entry(&c->replicas, e) { + unsigned i, nr_online = 0, nr_failed = 0, dflags = 0; + bool metadata = e->data_type < BCH_DATA_user; + + if (e->data_type == BCH_DATA_cached) + continue; + + for (i = 0; i < e->nr_devs; i++) { + struct bch_dev *ca = bch_dev_bkey_exists(c, e->devs[i]); + + nr_online += test_bit(e->devs[i], devs.d); + nr_failed += ca->mi.state == BCH_MEMBER_STATE_failed; + } + + if (nr_failed == e->nr_devs) + continue; + + if (nr_online < e->nr_required) + dflags |= metadata + ? BCH_FORCE_IF_METADATA_LOST + : BCH_FORCE_IF_DATA_LOST; + + if (nr_online < e->nr_devs) + dflags |= metadata + ? 
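/*
 * Illustrative sketch (not part of the patch): the duplicate check in
 * bch2_cpu_replicas_validate() relies on the entry array being sorted with
 * memcmp as the comparator, so any duplicates end up adjacent and a single
 * pass over neighbours finds them.  A userspace miniature of that pattern,
 * with made-up 4-byte entries:
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define ENTRY_SIZE 4

static int cmp(const void *l, const void *r)
{
	return memcmp(l, r, ENTRY_SIZE);
}

int main(void)
{
	unsigned char entries[][ENTRY_SIZE] = {
		{ 2, 1, 0, 3 }, { 1, 2, 0, 0 }, { 2, 1, 0, 3 },
	};
	unsigned nr = 3, i;

	qsort(entries, nr, ENTRY_SIZE, cmp);

	for (i = 0; i + 1 < nr; i++)
		if (!memcmp(entries[i], entries[i + 1], ENTRY_SIZE))
			printf("duplicate entry at %u\n", i);	/* reported here */
	return 0;
}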
BCH_FORCE_IF_METADATA_DEGRADED + : BCH_FORCE_IF_DATA_DEGRADED; + + if (dflags & ~flags) { + if (print) { + struct printbuf buf = PRINTBUF; + + bch2_replicas_entry_to_text(&buf, e); + bch_err(c, "insufficient devices online (%u) for replicas entry %s", + nr_online, buf.buf); + printbuf_exit(&buf); + } + ret = false; + break; + } + + } + percpu_up_read(&c->mark_lock); + + return ret; +} + +unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev) +{ + struct bch_sb_field_replicas *replicas; + struct bch_sb_field_replicas_v0 *replicas_v0; + unsigned i, data_has = 0; + + replicas = bch2_sb_get_replicas(sb); + replicas_v0 = bch2_sb_get_replicas_v0(sb); + + if (replicas) { + struct bch_replicas_entry *r; + + for_each_replicas_entry(replicas, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 1 << r->data_type; + } else if (replicas_v0) { + struct bch_replicas_entry_v0 *r; + + for_each_replicas_entry_v0(replicas_v0, r) + for (i = 0; i < r->nr_devs; i++) + if (r->devs[i] == dev) + data_has |= 1 << r->data_type; + } + + + return data_has; +} + +unsigned bch2_dev_has_data(struct bch_fs *c, struct bch_dev *ca) +{ + unsigned ret; + + mutex_lock(&c->sb_lock); + ret = bch2_sb_dev_has_data(c->disk_sb.sb, ca->dev_idx); + mutex_unlock(&c->sb_lock); + + return ret; +} + +void bch2_fs_replicas_exit(struct bch_fs *c) +{ + unsigned i; + + kfree(c->usage_scratch); + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + free_percpu(c->usage[i]); + kfree(c->usage_base); + kfree(c->replicas.entries); + kfree(c->replicas_gc.entries); + + mempool_exit(&c->replicas_delta_pool); +} + +int bch2_fs_replicas_init(struct bch_fs *c) +{ + bch2_journal_entry_res_resize(&c->journal, + &c->replicas_journal_res, + reserve_journal_replicas(c, &c->replicas)); + + return mempool_init_kmalloc_pool(&c->replicas_delta_pool, 1, + REPLICAS_DELTA_LIST_MAX) ?: + replicas_table_update(c, &c->replicas); +} diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h new file mode 100644 index 000000000..4887675a8 --- /dev/null +++ b/fs/bcachefs/replicas.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_H +#define _BCACHEFS_REPLICAS_H + +#include "bkey.h" +#include "eytzinger.h" +#include "replicas_types.h" + +void bch2_replicas_entry_sort(struct bch_replicas_entry *); +void bch2_replicas_entry_to_text(struct printbuf *, + struct bch_replicas_entry *); +void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *); + +static inline struct bch_replicas_entry * +cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i) +{ + return (void *) r->entries + r->entry_size * i; +} + +int bch2_replicas_entry_idx(struct bch_fs *, + struct bch_replicas_entry *); + +void bch2_devlist_to_replicas(struct bch_replicas_entry *, + enum bch_data_type, + struct bch_devs_list); +bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *); +int bch2_mark_replicas(struct bch_fs *, + struct bch_replicas_entry *); + +static inline struct replicas_delta * +replicas_delta_next(struct replicas_delta *d) +{ + return (void *) d + replicas_entry_bytes(&d->r) + 8; +} + +int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *); + +void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c); + +static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e, + unsigned dev) +{ + e->data_type = BCH_DATA_cached; + e->nr_devs = 1; + e->nr_required = 1; + e->devs[0] = dev; +} + +bool bch2_have_enough_devs(struct bch_fs *, struct bch_devs_mask, + 
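/*
 * Illustrative sketch (not part of the patch): the per-entry decision made by
 * bch2_have_enough_devs() above — fewer online devices than nr_required means
 * the data is lost, fewer than nr_devs means it is merely degraded, and the
 * operation may proceed only if the caller passed the matching "force" flags.
 * The flag names below mirror the BCH_FORCE_IF_* flags, but this is a
 * standalone example with invented numbers.
 */
#include <stdio.h>

#define FORCE_DATA_DEGRADED	(1 << 0)
#define FORCE_DATA_LOST		(1 << 1)

static int have_enough(unsigned nr_online, unsigned nr_devs,
		       unsigned nr_required, unsigned allowed_flags)
{
	unsigned needed_flags = 0;

	if (nr_online < nr_required)
		needed_flags |= FORCE_DATA_LOST;
	if (nr_online < nr_devs)
		needed_flags |= FORCE_DATA_DEGRADED;

	/* refuse if this entry needs a flag the caller didn't allow: */
	return !(needed_flags & ~allowed_flags);
}

int main(void)
{
	/* 2x replicated entry with one device offline: degraded, not lost */
	printf("%d\n", have_enough(1, 2, 1, 0));			/* 0: refuse */
	printf("%d\n", have_enough(1, 2, 1, FORCE_DATA_DEGRADED));	/* 1: allow */
	return 0;
}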
unsigned, bool); + +unsigned bch2_sb_dev_has_data(struct bch_sb *, unsigned); +unsigned bch2_dev_has_data(struct bch_fs *, struct bch_dev *); + +int bch2_replicas_gc_end(struct bch_fs *, int); +int bch2_replicas_gc_start(struct bch_fs *, unsigned); +int bch2_replicas_gc2(struct bch_fs *); + +int bch2_replicas_set_usage(struct bch_fs *, + struct bch_replicas_entry *, + u64); + +#define for_each_cpu_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < (void *) (_r)->entries + (_r)->nr * (_r)->entry_size;\ + _i = (void *) (_i) + (_r)->entry_size) + +/* iterate over superblock replicas - used by userspace tools: */ + +#define replicas_entry_next(_i) \ + ((typeof(_i)) ((void *) (_i) + replicas_entry_bytes(_i))) + +#define for_each_replicas_entry(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + +#define for_each_replicas_entry_v0(_r, _i) \ + for (_i = (_r)->entries; \ + (void *) (_i) < vstruct_end(&(_r)->field) && (_i)->data_type;\ + (_i) = replicas_entry_next(_i)) + +int bch2_sb_replicas_to_cpu_replicas(struct bch_fs *); + +extern const struct bch_sb_field_ops bch_sb_field_ops_replicas; +extern const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0; + +void bch2_fs_replicas_exit(struct bch_fs *); +int bch2_fs_replicas_init(struct bch_fs *); + +#endif /* _BCACHEFS_REPLICAS_H */ diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h new file mode 100644 index 000000000..5cfff489b --- /dev/null +++ b/fs/bcachefs/replicas_types.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_REPLICAS_TYPES_H +#define _BCACHEFS_REPLICAS_TYPES_H + +struct bch_replicas_cpu { + unsigned nr; + unsigned entry_size; + struct bch_replicas_entry *entries; +}; + +struct replicas_delta { + s64 delta; + struct bch_replicas_entry r; +} __packed; + +struct replicas_delta_list { + unsigned size; + unsigned used; + + struct {} memset_start; + u64 nr_inodes; + u64 persistent_reserved[BCH_REPLICAS_MAX]; + struct {} memset_end; + struct replicas_delta d[0]; +}; + +#endif /* _BCACHEFS_REPLICAS_TYPES_H */ diff --git a/fs/bcachefs/seqmutex.h b/fs/bcachefs/seqmutex.h new file mode 100644 index 000000000..c1860d816 --- /dev/null +++ b/fs/bcachefs/seqmutex.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SEQMUTEX_H +#define _BCACHEFS_SEQMUTEX_H + +#include + +struct seqmutex { + struct mutex lock; + u32 seq; +}; + +#define seqmutex_init(_lock) mutex_init(&(_lock)->lock) + +static inline bool seqmutex_trylock(struct seqmutex *lock) +{ + return mutex_trylock(&lock->lock); +} + +static inline void seqmutex_lock(struct seqmutex *lock) +{ + mutex_lock(&lock->lock); +} + +static inline void seqmutex_unlock(struct seqmutex *lock) +{ + lock->seq++; + mutex_unlock(&lock->lock); +} + +static inline u32 seqmutex_seq(struct seqmutex *lock) +{ + return lock->seq; +} + +static inline bool seqmutex_relock(struct seqmutex *lock, u32 seq) +{ + if (lock->seq != seq || !mutex_trylock(&lock->lock)) + return false; + + if (lock->seq != seq) { + mutex_unlock(&lock->lock); + return false; + } + + return true; +} + +#endif /* _BCACHEFS_SEQMUTEX_H */ diff --git a/fs/bcachefs/siphash.c b/fs/bcachefs/siphash.c new file mode 100644 index 000000000..dc1a27cc3 --- /dev/null +++ b/fs/bcachefs/siphash.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: BSD-3-Clause +/* $OpenBSD: siphash.c,v 1.3 2015/02/20 11:51:03 tedu Exp $ */ + +/*- + * Copyright (c) 2013 Andre Oppermann + * All 
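/*
 * Illustrative sketch (not part of the patch): the seqmutex pattern from
 * seqmutex.h above, rebuilt with pthreads so it can be run in userspace
 * (compile with -pthread).  The sequence number is bumped on every unlock;
 * seqmutex_relock() only succeeds if the lock has not been released since the
 * saved sequence, which lets a caller drop the lock, do slow work, and find
 * out afterwards whether its cached view might be stale.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seqmutex {
	pthread_mutex_t lock;
	uint32_t seq;
};

static void seqmutex_lock(struct seqmutex *l)
{
	pthread_mutex_lock(&l->lock);
}

static void seqmutex_unlock(struct seqmutex *l)
{
	l->seq++;
	pthread_mutex_unlock(&l->lock);
}

static bool seqmutex_relock(struct seqmutex *l, uint32_t seq)
{
	if (l->seq != seq || pthread_mutex_trylock(&l->lock))
		return false;
	if (l->seq != seq) {		/* raced between the check and trylock */
		pthread_mutex_unlock(&l->lock);
		return false;
	}
	return true;
}

int main(void)
{
	struct seqmutex l = { .lock = PTHREAD_MUTEX_INITIALIZER };
	uint32_t seq;

	seqmutex_lock(&l);
	seqmutex_unlock(&l);
	seq = l.seq;		/* remember where we left off (single-threaded demo) */

	printf("relock: %d\n", seqmutex_relock(&l, seq));	/* 1: seq unchanged */
	seqmutex_unlock(&l);

	seqmutex_lock(&l);	/* someone else takes and releases the lock... */
	seqmutex_unlock(&l);

	printf("relock: %d\n", seqmutex_relock(&l, seq));	/* 0: view may be stale */
	return 0;
}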
rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * SipHash is a family of PRFs SipHash-c-d where the integer parameters c and d + * are the number of compression rounds and the number of finalization rounds. + * A compression round is identical to a finalization round and this round + * function is called SipRound. Given a 128-bit key k and a (possibly empty) + * byte string m, SipHash-c-d returns a 64-bit value SipHash-c-d(k; m). + * + * Implemented from the paper "SipHash: a fast short-input PRF", 2012.09.18, + * by Jean-Philippe Aumasson and Daniel J. 
Bernstein, + * Permanent Document ID b9a943a805fbfc6fde808af9fc0ecdfa + * https://131002.net/siphash/siphash.pdf + * https://131002.net/siphash/ + */ + +#include +#include +#include +#include + +#include "siphash.h" + +static void SipHash_Rounds(SIPHASH_CTX *ctx, int rounds) +{ + while (rounds--) { + ctx->v[0] += ctx->v[1]; + ctx->v[2] += ctx->v[3]; + ctx->v[1] = rol64(ctx->v[1], 13); + ctx->v[3] = rol64(ctx->v[3], 16); + + ctx->v[1] ^= ctx->v[0]; + ctx->v[3] ^= ctx->v[2]; + ctx->v[0] = rol64(ctx->v[0], 32); + + ctx->v[2] += ctx->v[1]; + ctx->v[0] += ctx->v[3]; + ctx->v[1] = rol64(ctx->v[1], 17); + ctx->v[3] = rol64(ctx->v[3], 21); + + ctx->v[1] ^= ctx->v[2]; + ctx->v[3] ^= ctx->v[0]; + ctx->v[2] = rol64(ctx->v[2], 32); + } +} + +static void SipHash_CRounds(SIPHASH_CTX *ctx, const void *ptr, int rounds) +{ + u64 m = get_unaligned_le64(ptr); + + ctx->v[3] ^= m; + SipHash_Rounds(ctx, rounds); + ctx->v[0] ^= m; +} + +void SipHash_Init(SIPHASH_CTX *ctx, const SIPHASH_KEY *key) +{ + u64 k0, k1; + + k0 = le64_to_cpu(key->k0); + k1 = le64_to_cpu(key->k1); + + ctx->v[0] = 0x736f6d6570736575ULL ^ k0; + ctx->v[1] = 0x646f72616e646f6dULL ^ k1; + ctx->v[2] = 0x6c7967656e657261ULL ^ k0; + ctx->v[3] = 0x7465646279746573ULL ^ k1; + + memset(ctx->buf, 0, sizeof(ctx->buf)); + ctx->bytes = 0; +} + +void SipHash_Update(SIPHASH_CTX *ctx, int rc, int rf, + const void *src, size_t len) +{ + const u8 *ptr = src; + size_t left, used; + + if (len == 0) + return; + + used = ctx->bytes % sizeof(ctx->buf); + ctx->bytes += len; + + if (used > 0) { + left = sizeof(ctx->buf) - used; + + if (len >= left) { + memcpy(&ctx->buf[used], ptr, left); + SipHash_CRounds(ctx, ctx->buf, rc); + len -= left; + ptr += left; + } else { + memcpy(&ctx->buf[used], ptr, len); + return; + } + } + + while (len >= sizeof(ctx->buf)) { + SipHash_CRounds(ctx, ptr, rc); + len -= sizeof(ctx->buf); + ptr += sizeof(ctx->buf); + } + + if (len > 0) + memcpy(&ctx->buf[used], ptr, len); +} + +void SipHash_Final(void *dst, SIPHASH_CTX *ctx, int rc, int rf) +{ + u64 r; + + r = SipHash_End(ctx, rc, rf); + + *((__le64 *) dst) = cpu_to_le64(r); +} + +u64 SipHash_End(SIPHASH_CTX *ctx, int rc, int rf) +{ + u64 r; + size_t left, used; + + used = ctx->bytes % sizeof(ctx->buf); + left = sizeof(ctx->buf) - used; + memset(&ctx->buf[used], 0, left - 1); + ctx->buf[7] = ctx->bytes; + + SipHash_CRounds(ctx, ctx->buf, rc); + ctx->v[2] ^= 0xff; + SipHash_Rounds(ctx, rf); + + r = (ctx->v[0] ^ ctx->v[1]) ^ (ctx->v[2] ^ ctx->v[3]); + memset(ctx, 0, sizeof(*ctx)); + return r; +} + +u64 SipHash(const SIPHASH_KEY *key, int rc, int rf, const void *src, size_t len) +{ + SIPHASH_CTX ctx; + + SipHash_Init(&ctx, key); + SipHash_Update(&ctx, rc, rf, src, len); + return SipHash_End(&ctx, rc, rf); +} diff --git a/fs/bcachefs/siphash.h b/fs/bcachefs/siphash.h new file mode 100644 index 000000000..3dfaf34a4 --- /dev/null +++ b/fs/bcachefs/siphash.h @@ -0,0 +1,87 @@ +/* SPDX-License-Identifier: BSD-3-Clause */ +/* $OpenBSD: siphash.h,v 1.5 2015/02/20 11:51:03 tedu Exp $ */ +/*- + * Copyright (c) 2013 Andre Oppermann + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
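/*
 * Illustrative sketch (not part of the patch): a tiny standalone SipHash-2-4
 * (two compression rounds, four finalization rounds, as described above) that
 * can be compiled in userspace to see the algorithm end to end.  It assumes a
 * little-endian host for the memcpy-based loads; the kernel code above uses
 * the proper unaligned/endian helpers instead.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ROTL64(x, b) (((x) << (b)) | ((x) >> (64 - (b))))

static void sipround(uint64_t v[4])
{
	v[0] += v[1]; v[1] = ROTL64(v[1], 13); v[1] ^= v[0]; v[0] = ROTL64(v[0], 32);
	v[2] += v[3]; v[3] = ROTL64(v[3], 16); v[3] ^= v[2];
	v[0] += v[3]; v[3] = ROTL64(v[3], 21); v[3] ^= v[0];
	v[2] += v[1]; v[1] = ROTL64(v[1], 17); v[1] ^= v[2]; v[2] = ROTL64(v[2], 32);
}

static uint64_t siphash24(const void *in, size_t len, const uint8_t key[16])
{
	const uint8_t *p = in;
	uint64_t k0, k1, m, v[4];
	size_t i, tail = len & 7;

	memcpy(&k0, key, 8);
	memcpy(&k1, key + 8, 8);

	v[0] = 0x736f6d6570736575ULL ^ k0;	/* same constants as SipHash_Init() */
	v[1] = 0x646f72616e646f6dULL ^ k1;
	v[2] = 0x6c7967656e657261ULL ^ k0;
	v[3] = 0x7465646279746573ULL ^ k1;

	for (i = 0; i + 8 <= len; i += 8) {
		memcpy(&m, p + i, 8);
		v[3] ^= m; sipround(v); sipround(v); v[0] ^= m;	/* c = 2 */
	}

	m = (uint64_t)(len & 0xff) << 56;	/* final block: length byte + tail */
	memcpy(&m, p + i, tail);
	v[3] ^= m; sipround(v); sipround(v); v[0] ^= m;

	v[2] ^= 0xff;
	sipround(v); sipround(v); sipround(v); sipround(v);	/* d = 4 */
	return v[0] ^ v[1] ^ v[2] ^ v[3];
}

int main(void)
{
	uint8_t key[16], msg[15];
	size_t i;

	for (i = 0; i < 16; i++) key[i] = i;
	for (i = 0; i < 15; i++) msg[i] = i;

	/* the SipHash paper's test vector for this input should be a129ca6149be45e5 */
	printf("%016llx\n", (unsigned long long) siphash24(msg, sizeof(msg), key));
	return 0;
}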
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * SipHash is a family of pseudorandom functions (a.k.a. keyed hash functions) + * optimized for speed on short messages returning a 64bit hash/digest value. + * + * The number of rounds is defined during the initialization: + * SipHash24_Init() for the fast and resonable strong version + * SipHash48_Init() for the strong version (half as fast) + * + * struct SIPHASH_CTX ctx; + * SipHash24_Init(&ctx); + * SipHash_SetKey(&ctx, "16bytes long key"); + * SipHash_Update(&ctx, pointer_to_string, length_of_string); + * SipHash_Final(output, &ctx); + */ + +#ifndef _SIPHASH_H_ +#define _SIPHASH_H_ + +#include + +#define SIPHASH_BLOCK_LENGTH 8 +#define SIPHASH_KEY_LENGTH 16 +#define SIPHASH_DIGEST_LENGTH 8 + +typedef struct _SIPHASH_CTX { + u64 v[4]; + u8 buf[SIPHASH_BLOCK_LENGTH]; + u32 bytes; +} SIPHASH_CTX; + +typedef struct { + __le64 k0; + __le64 k1; +} SIPHASH_KEY; + +void SipHash_Init(SIPHASH_CTX *, const SIPHASH_KEY *); +void SipHash_Update(SIPHASH_CTX *, int, int, const void *, size_t); +u64 SipHash_End(SIPHASH_CTX *, int, int); +void SipHash_Final(void *, SIPHASH_CTX *, int, int); +u64 SipHash(const SIPHASH_KEY *, int, int, const void *, size_t); + +#define SipHash24_Init(_c, _k) SipHash_Init((_c), (_k)) +#define SipHash24_Update(_c, _p, _l) SipHash_Update((_c), 2, 4, (_p), (_l)) +#define SipHash24_End(_d) SipHash_End((_d), 2, 4) +#define SipHash24_Final(_d, _c) SipHash_Final((_d), (_c), 2, 4) +#define SipHash24(_k, _p, _l) SipHash((_k), 2, 4, (_p), (_l)) + +#define SipHash48_Init(_c, _k) SipHash_Init((_c), (_k)) +#define SipHash48_Update(_c, _p, _l) SipHash_Update((_c), 4, 8, (_p), (_l)) +#define SipHash48_End(_d) SipHash_End((_d), 4, 8) +#define SipHash48_Final(_d, _c) SipHash_Final((_d), (_c), 4, 8) +#define SipHash48(_k, _p, _l) SipHash((_k), 4, 8, (_p), (_l)) + +#endif /* _SIPHASH_H_ */ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h new file mode 100644 index 000000000..ae21a8cca --- /dev/null +++ b/fs/bcachefs/str_hash.h @@ -0,0 +1,370 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_STR_HASH_H +#define _BCACHEFS_STR_HASH_H + +#include "btree_iter.h" +#include "btree_update.h" +#include "checksum.h" +#include "error.h" +#include "inode.h" +#include "siphash.h" +#include "subvolume.h" +#include "super.h" + +#include +#include +#include + +static inline enum bch_str_hash_type 
+bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) +{ + switch (opt) { + case BCH_STR_HASH_OPT_crc32c: + return BCH_STR_HASH_crc32c; + case BCH_STR_HASH_OPT_crc64: + return BCH_STR_HASH_crc64; + case BCH_STR_HASH_OPT_siphash: + return c->sb.features & (1ULL << BCH_FEATURE_new_siphash) + ? BCH_STR_HASH_siphash + : BCH_STR_HASH_siphash_old; + default: + BUG(); + } +} + +struct bch_hash_info { + u8 type; + /* + * For crc32 or crc64 string hashes the first key value of + * the siphash_key (k0) is used as the key. + */ + SIPHASH_KEY siphash_key; +}; + +static inline struct bch_hash_info +bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) +{ + /* XXX ick */ + struct bch_hash_info info = { + .type = (bi->bi_flags >> INODE_STR_HASH_OFFSET) & + ~(~0U << INODE_STR_HASH_BITS), + .siphash_key = { .k0 = bi->bi_hash_seed } + }; + + if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { + SHASH_DESC_ON_STACK(desc, c->sha256); + u8 digest[SHA256_DIGEST_SIZE]; + + desc->tfm = c->sha256; + + crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, + sizeof(bi->bi_hash_seed), digest); + memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); + } + + return info; +} + +struct bch_str_hash_ctx { + union { + u32 crc32c; + u64 crc64; + SIPHASH_CTX siphash; + }; +}; + +static inline void bch2_str_hash_init(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) +{ + switch (info->type) { + case BCH_STR_HASH_crc32c: + ctx->crc32c = crc32c(~0, &info->siphash_key.k0, + sizeof(info->siphash_key.k0)); + break; + case BCH_STR_HASH_crc64: + ctx->crc64 = crc64_be(~0, &info->siphash_key.k0, + sizeof(info->siphash_key.k0)); + break; + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: + SipHash24_Init(&ctx->siphash, &info->siphash_key); + break; + default: + BUG(); + } +} + +static inline void bch2_str_hash_update(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info, + const void *data, size_t len) +{ + switch (info->type) { + case BCH_STR_HASH_crc32c: + ctx->crc32c = crc32c(ctx->crc32c, data, len); + break; + case BCH_STR_HASH_crc64: + ctx->crc64 = crc64_be(ctx->crc64, data, len); + break; + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: + SipHash24_Update(&ctx->siphash, data, len); + break; + default: + BUG(); + } +} + +static inline u64 bch2_str_hash_end(struct bch_str_hash_ctx *ctx, + const struct bch_hash_info *info) +{ + switch (info->type) { + case BCH_STR_HASH_crc32c: + return ctx->crc32c; + case BCH_STR_HASH_crc64: + return ctx->crc64 >> 1; + case BCH_STR_HASH_siphash_old: + case BCH_STR_HASH_siphash: + return SipHash24_End(&ctx->siphash) >> 1; + default: + BUG(); + } +} + +struct bch_hash_desc { + enum btree_id btree_id; + u8 key_type; + + u64 (*hash_key)(const struct bch_hash_info *, const void *); + u64 (*hash_bkey)(const struct bch_hash_info *, struct bkey_s_c); + bool (*cmp_key)(struct bkey_s_c, const void *); + bool (*cmp_bkey)(struct bkey_s_c, struct bkey_s_c); + bool (*is_visible)(subvol_inum inum, struct bkey_s_c); +}; + +static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, struct bkey_s_c k) +{ + return k.k->type == desc.key_type && + (!desc.is_visible || + !inum.inum || + desc.is_visible(inum, k)); +} + +static __always_inline int +bch2_hash_lookup(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key, + unsigned flags) +{ + struct bkey_s_c k; + u32 snapshot; + int ret; + + ret = 
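/*
 * Illustrative sketch (not part of the patch): how the helpers above fit
 * together to hash a name with whatever hash type an inode uses.  This is a
 * kernel-context sketch, not a standalone program; the wrapper name
 * example_name_hash() and its arguments are made up, only the bch2_* calls
 * and types come from the declarations above.
 */
static u64 example_name_hash(struct bch_fs *c,
			     const struct bch_inode_unpacked *bi,
			     const char *name, size_t len)
{
	/* per-inode choice of crc32c, crc64 or siphash, plus per-inode seed: */
	struct bch_hash_info info = bch2_hash_info_init(c, bi);
	struct bch_str_hash_ctx ctx;

	bch2_str_hash_init(&ctx, &info);
	bch2_str_hash_update(&ctx, &info, name, len);
	return bch2_str_hash_end(&ctx, &info);	/* becomes the btree key offset */
}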
bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), + POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|flags, k, ret) { + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_key(k, key)) + return 0; + } else if (k.k->type == KEY_TYPE_hash_whiteout) { + ; + } else { + /* hole, not found */ + break; + } + } + bch2_trans_iter_exit(trans, iter); + + return ret ?: -BCH_ERR_ENOENT_str_hash_lookup; +} + +static __always_inline int +bch2_hash_hole(struct btree_trans *trans, + struct btree_iter *iter, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key) +{ + struct bkey_s_c k; + u32 snapshot; + int ret; + + ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id, + SPOS(inum.inum, desc.hash_key(info, key), snapshot), + POS(inum.inum, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) + if (!is_visible_key(desc, inum, k)) + return 0; + bch2_trans_iter_exit(trans, iter); + + return ret ?: -BCH_ERR_ENOSPC_str_hash_create; +} + +static __always_inline +int bch2_hash_needs_whiteout(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *start) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + bch2_trans_copy_iter(&iter, start); + + bch2_btree_iter_advance(&iter); + + for_each_btree_key_continue_norestart(iter, BTREE_ITER_SLOTS, k, ret) { + if (k.k->type != desc.key_type && + k.k->type != KEY_TYPE_hash_whiteout) + break; + + if (k.k->type == desc.key_type && + desc.hash_bkey(info, k) <= start->pos.offset) { + ret = 1; + break; + } + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static __always_inline +int bch2_hash_set_snapshot(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, u32 snapshot, + struct bkey_i *insert, + int flags, + int update_flags) +{ + struct btree_iter iter, slot = { NULL }; + struct bkey_s_c k; + bool found = false; + int ret; + + for_each_btree_key_upto_norestart(trans, iter, desc.btree_id, + SPOS(insert->k.p.inode, + desc.hash_bkey(info, bkey_i_to_s_c(insert)), + snapshot), + POS(insert->k.p.inode, U64_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, ret) { + if (is_visible_key(desc, inum, k)) { + if (!desc.cmp_bkey(k, bkey_i_to_s_c(insert))) + goto found; + + /* hash collision: */ + continue; + } + + if (!slot.path && + !(flags & BCH_HASH_SET_MUST_REPLACE)) + bch2_trans_copy_iter(&slot, &iter); + + if (k.k->type != KEY_TYPE_hash_whiteout) + goto not_found; + } + + if (!ret) + ret = -BCH_ERR_ENOSPC_str_hash_create; +out: + bch2_trans_iter_exit(trans, &slot); + bch2_trans_iter_exit(trans, &iter); + + return ret; +found: + found = true; +not_found: + + if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) { + ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; + } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) { + ret = -EEXIST; + } else { + if (!found && slot.path) + swap(iter, slot); + + insert->k.p = iter.pos; + ret = bch2_trans_update(trans, &iter, insert, 0); + } + + goto out; +} + +static __always_inline +int bch2_hash_set(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, + struct bkey_i *insert, int flags) +{ + u32 snapshot; + int ret; + + ret = 
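/*
 * Illustrative sketch (not part of the patch): the str_hash code above is an
 * open-addressed (linearly probed) hash table stored in btree slots, with
 * "whiteouts" playing the role of tombstones.  A lookup probes forward from
 * the hash position until it finds the key, a whiteout (keep going), or an
 * empty slot (definitely absent).  This toy version shows why deleting an
 * entry mid-chain must leave a whiteout: without one, later entries of the
 * same chain become unreachable — the case bch2_hash_needs_whiteout()
 * checks for.
 */
#include <stdio.h>

#define NR_SLOTS 8
enum slot_type { EMPTY, WHITEOUT, USED };

struct slot { enum slot_type type; int key; };

static unsigned hashfn(int key) { return key % NR_SLOTS; }

static void insert(struct slot *t, int key)
{
	unsigned i;

	for (i = hashfn(key); ; i = (i + 1) % NR_SLOTS)
		if (t[i].type != USED) {
			t[i] = (struct slot) { USED, key };
			return;
		}
}

static int lookup(struct slot *t, int key)
{
	unsigned i;

	for (i = hashfn(key); t[i].type != EMPTY; i = (i + 1) % NR_SLOTS)
		if (t[i].type == USED && t[i].key == key)
			return 1;
	return 0;	/* hit a hole: not present */
}

int main(void)
{
	struct slot t[NR_SLOTS] = { 0 };

	insert(t, 1);		/* lands in slot 1 */
	insert(t, 9);		/* also hashes to 1, probes on to slot 2 */

	t[1].type = WHITEOUT;	/* delete key 1, leaving a whiteout */
	printf("9 found: %d\n", lookup(t, 9));	/* 1: chain still intact */

	t[1].type = EMPTY;	/* delete without a whiteout instead... */
	printf("9 found: %d\n", lookup(t, 9));	/* 0: 9 is now unreachable */
	return 0;
}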
bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); + if (ret) + return ret; + + insert->k.p.inode = inum.inum; + + return bch2_hash_set_snapshot(trans, desc, info, inum, + snapshot, insert, flags, 0); +} + +static __always_inline +int bch2_hash_delete_at(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + struct btree_iter *iter, + unsigned update_flags) +{ + struct bkey_i *delete; + int ret; + + delete = bch2_trans_kmalloc(trans, sizeof(*delete)); + ret = PTR_ERR_OR_ZERO(delete); + if (ret) + return ret; + + ret = bch2_hash_needs_whiteout(trans, desc, info, iter); + if (ret < 0) + return ret; + + bkey_init(&delete->k); + delete->k.p = iter->pos; + delete->k.type = ret ? KEY_TYPE_hash_whiteout : KEY_TYPE_deleted; + + return bch2_trans_update(trans, iter, delete, update_flags); +} + +static __always_inline +int bch2_hash_delete(struct btree_trans *trans, + const struct bch_hash_desc desc, + const struct bch_hash_info *info, + subvol_inum inum, const void *key) +{ + struct btree_iter iter; + int ret; + + ret = bch2_hash_lookup(trans, &iter, desc, info, inum, key, + BTREE_ITER_INTENT); + if (ret) + return ret; + + ret = bch2_hash_delete_at(trans, desc, info, &iter, 0); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +#endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c new file mode 100644 index 000000000..7e6b416d3 --- /dev/null +++ b/fs/bcachefs/subvolume.c @@ -0,0 +1,1734 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "subvolume.h" + +#include + +static int bch2_subvolume_delete(struct btree_trans *, u32); + +static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) +{ + const struct snapshot_t *s = __snapshot_t(t, id); + + if (s->skip[2] <= ancestor) + return s->skip[2]; + if (s->skip[1] <= ancestor) + return s->skip[1]; + if (s->skip[0] <= ancestor) + return s->skip[0]; + return s->parent; +} + +bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + bool ret; + + EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots); + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); + + ret = id && id < ancestor + ? test_bit(ancestor - id - 1, __snapshot_t(t, id)->is_ancestor) + : id == ancestor; + rcu_read_unlock(); + + return ret; +} + +static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) +{ + struct snapshot_table *t; + + rcu_read_lock(); + t = rcu_dereference(c->snapshots); + + while (id && id < ancestor) + id = __snapshot_t(t, id)->parent; + rcu_read_unlock(); + + return id == ancestor; +} + +static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) +{ + u32 depth; + + rcu_read_lock(); + depth = parent ? 
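/*
 * Illustrative sketch (not part of the patch): the ancestor walk used by
 * bch2_snapshot_is_ancestor() above, minus the closing is_ancestor bitmap.
 * A snapshot's parent always has a larger ID than the snapshot itself, so an
 * ancestor query can walk upward; the skip[] pointers to farther ancestors
 * (taking the farthest one that doesn't go past the candidate) keep the walk
 * short.  The table contents here are invented.
 */
#include <stdio.h>

struct node { unsigned parent, skip[3]; };

/* index == snapshot id; id 0 unused, parent/skip of 0 means "none" */
static const struct node tbl[8] = {
	[7] = { 0, { 0, 0, 0 } },	/* root */
	[5] = { 7, { 7, 7, 7 } },
	[3] = { 5, { 5, 7, 7 } },
	[2] = { 3, { 3, 5, 7 } },	/* leaf */
};

static unsigned ancestor_step(unsigned id, unsigned ancestor)
{
	const struct node *n = &tbl[id];

	/* take the farthest skip pointer that doesn't overshoot the candidate: */
	if (n->skip[2] && n->skip[2] <= ancestor) return n->skip[2];
	if (n->skip[1] && n->skip[1] <= ancestor) return n->skip[1];
	if (n->skip[0] && n->skip[0] <= ancestor) return n->skip[0];
	return n->parent;
}

static int is_ancestor(unsigned id, unsigned ancestor)
{
	while (id && id < ancestor)
		id = ancestor_step(id, ancestor);
	return id == ancestor;
}

int main(void)
{
	printf("%d %d\n", is_ancestor(2, 7), is_ancestor(2, 4));	/* 1 0 */
	return 0;
}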
snapshot_t(c, parent)->depth + 1 : 0; + rcu_read_unlock(); + + return depth; +} + +static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + size_t new_size; + struct snapshot_table *new, *old; + + new_size = max(16UL, roundup_pow_of_two(idx + 1)); + + new = kvzalloc(struct_size(new, s, new_size), GFP_KERNEL); + if (!new) + return NULL; + + old = c->snapshots; + if (old) + memcpy(new->s, + rcu_dereference_protected(c->snapshots, true)->s, + sizeof(new->s[0]) * c->snapshot_table_size); + + rcu_assign_pointer(c->snapshots, new); + c->snapshot_table_size = new_size; + if (old) + kvfree_rcu(old); + + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; +} + +static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id) +{ + size_t idx = U32_MAX - id; + + lockdep_assert_held(&c->snapshot_table_lock); + + if (likely(idx < c->snapshot_table_size)) + return &rcu_dereference_protected(c->snapshots, true)->s[idx]; + + return __snapshot_t_mut(c, id); +} + +/* Snapshot tree: */ + +void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k); + + prt_printf(out, "subvol %u root snapshot %u", + le32_to_cpu(t.v->master_subvol), + le32_to_cpu(t.v->root_snapshot)); +} + +int bch2_snapshot_tree_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + if (bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1))) { + prt_printf(err, "bad pos"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot_tree *s) +{ + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot_tree, s); + + if (bch2_err_matches(ret, ENOENT)) + ret = -BCH_ERR_ENOENT_snapshot_tree; + return ret; +} + +static struct bkey_i_snapshot_tree * +__snapshot_tree_create(struct btree_trans *trans) +{ + struct btree_iter iter; + int ret = bch2_bkey_get_empty_slot(trans, &iter, + BTREE_ID_snapshot_trees, POS(0, U32_MAX)); + struct bkey_i_snapshot_tree *s_t; + + if (ret == -BCH_ERR_ENOSPC_btree_slot) + ret = -BCH_ERR_ENOSPC_snapshot_tree; + if (ret) + return ERR_PTR(ret); + + s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + bch2_trans_iter_exit(trans, &iter); + return ret ? 
ERR_PTR(ret) : s_t; +} + +static int snapshot_tree_create(struct btree_trans *trans, + u32 root_id, u32 subvol_id, u32 *tree_id) +{ + struct bkey_i_snapshot_tree *n_tree = + __snapshot_tree_create(trans); + + if (IS_ERR(n_tree)) + return PTR_ERR(n_tree); + + n_tree->v.master_subvol = cpu_to_le32(subvol_id); + n_tree->v.root_snapshot = cpu_to_le32(root_id); + *tree_id = n_tree->k.p.offset; + return 0; +} + +/* Snapshot nodes: */ + +void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); + + prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", + BCH_SNAPSHOT_SUBVOL(s.v), + BCH_SNAPSHOT_DELETED(s.v), + le32_to_cpu(s.v->parent), + le32_to_cpu(s.v->children[0]), + le32_to_cpu(s.v->children[1]), + le32_to_cpu(s.v->subvol), + le32_to_cpu(s.v->tree)); +} + +int bch2_snapshot_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + struct bkey_s_c_snapshot s; + u32 i, id; + + if (bkey_gt(k.k->p, POS(0, U32_MAX)) || + bkey_lt(k.k->p, POS(0, 1))) { + prt_printf(err, "bad pos"); + return -BCH_ERR_invalid_bkey; + } + + s = bkey_s_c_to_snapshot(k); + + id = le32_to_cpu(s.v->parent); + if (id && id <= k.k->p.offset) { + prt_printf(err, "bad parent node (%u <= %llu)", + id, k.k->p.offset); + return -BCH_ERR_invalid_bkey; + } + + if (le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1])) { + prt_printf(err, "children not normalized"); + return -BCH_ERR_invalid_bkey; + } + + if (s.v->children[0] && + s.v->children[0] == s.v->children[1]) { + prt_printf(err, "duplicate child nodes"); + return -BCH_ERR_invalid_bkey; + } + + for (i = 0; i < 2; i++) { + id = le32_to_cpu(s.v->children[i]); + + if (id >= k.k->p.offset) { + prt_printf(err, "bad child node (%u >= %llu)", + id, k.k->p.offset); + return -BCH_ERR_invalid_bkey; + } + } + + if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) { + if (le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) || + le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2])) { + prt_printf(err, "skiplist not normalized"); + return -BCH_ERR_invalid_bkey; + } + + for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) { + id = le32_to_cpu(s.v->skip[i]); + + if (!id != !s.v->parent || + (s.v->parent && + id <= k.k->p.offset)) { + prt_printf(err, "bad skiplist node %u)", id); + return -BCH_ERR_invalid_bkey; + } + } + } + + return 0; +} + +int bch2_mark_snapshot(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c old, struct bkey_s_c new, + unsigned flags) +{ + struct bch_fs *c = trans->c; + struct snapshot_t *t; + u32 id = new.k->p.offset; + int ret = 0; + + mutex_lock(&c->snapshot_table_lock); + + t = snapshot_t_mut(c, id); + if (!t) { + ret = -BCH_ERR_ENOMEM_mark_snapshot; + goto err; + } + + if (new.k->type == KEY_TYPE_snapshot) { + struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); + u32 parent = id; + + t->parent = le32_to_cpu(s.v->parent); + t->skip[0] = le32_to_cpu(s.v->skip[0]); + t->skip[1] = le32_to_cpu(s.v->skip[1]); + t->skip[2] = le32_to_cpu(s.v->skip[2]); + t->depth = le32_to_cpu(s.v->depth); + t->children[0] = le32_to_cpu(s.v->children[0]); + t->children[1] = le32_to_cpu(s.v->children[1]); + t->subvol = BCH_SNAPSHOT_SUBVOL(s.v) ? 
le32_to_cpu(s.v->subvol) : 0; + t->tree = le32_to_cpu(s.v->tree); + + while ((parent = bch2_snapshot_parent_early(c, parent)) && + parent - id - 1 < IS_ANCESTOR_BITMAP) + __set_bit(parent - id - 1, t->is_ancestor); + + if (BCH_SNAPSHOT_DELETED(s.v)) + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + } else { + memset(t, 0, sizeof(*t)); + } +err: + mutex_unlock(&c->snapshot_table_lock); + return ret; +} + +static int snapshot_lookup(struct btree_trans *trans, u32 id, + struct bch_snapshot *s) +{ + return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_WITH_UPDATES, snapshot, s); +} + +static int snapshot_live(struct btree_trans *trans, u32 id) +{ + struct bch_snapshot v; + int ret; + + if (!id) + return 0; + + ret = snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot node %u not found", id); + if (ret) + return ret; + + return !BCH_SNAPSHOT_DELETED(&v); +} + +static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + unsigned i, nr_live = 0, live_idx = 0; + struct bkey_s_c_snapshot snap; + u32 id = k.k->p.offset, child[2]; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + + child[0] = le32_to_cpu(snap.v->children[0]); + child[1] = le32_to_cpu(snap.v->children[1]); + + for (i = 0; i < 2; i++) { + int ret = snapshot_live(trans, child[i]); + + if (ret < 0) + return ret; + + if (ret) + live_idx = i; + nr_live += ret; + } + + mutex_lock(&c->snapshot_table_lock); + + snapshot_t_mut(c, id)->equiv = nr_live == 1 + ? snapshot_t_mut(c, child[live_idx])->equiv + : id; + + mutex_unlock(&c->snapshot_table_lock); + + return 0; +} + +/* fsck: */ + +static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child) +{ + return snapshot_t(c, id)->children[child]; +} + +static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 0); +} + +static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_child(c, id, 1); +} + +static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) +{ + u32 n, parent; + + n = bch2_snapshot_left_child(c, id); + if (n) + return n; + + while ((parent = bch2_snapshot_parent(c, id))) { + n = bch2_snapshot_right_child(c, parent); + if (n && n != id) + return n; + id = parent; + } + + return 0; +} + +static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +{ + u32 id = snapshot_root; + u32 subvol = 0, s; + + while (id) { + s = snapshot_t(c, id)->subvol; + + if (s && (!subvol || s < subvol)) + subvol = s; + + id = bch2_snapshot_tree_next(c, id); + } + + return subvol; +} + +static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, + u32 snapshot_root, u32 *subvol_id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_subvolume s; + bool found = false; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN, + 0, k, ret) { + if (k.k->type != KEY_TYPE_subvolume) + continue; + + s = bkey_s_c_to_subvolume(k); + if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root)) + continue; + if (!BCH_SUBVOLUME_SNAP(s.v)) { + *subvol_id = s.k->p.offset; + found = true; + break; + } + } + + bch2_trans_iter_exit(trans, &iter); + + if (!ret && !found) { + struct bkey_i_subvolume *s; + + *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, 
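/*
 * Illustrative sketch (not part of the patch): the traversal order produced
 * by bch2_snapshot_tree_next() above, as used by
 * bch2_snapshot_tree_oldest_subvol() — descend to the left child if there is
 * one, otherwise climb until some parent offers a right child we didn't just
 * come from.  Starting from the root this visits every node of the binary
 * snapshot tree exactly once.  The table is invented; as in the snapshot
 * btree, parents have larger IDs than their children.
 */
#include <stdio.h>

struct node { unsigned parent, child[2]; };

static const struct node tbl[10] = {
	[9] = { 0, { 8, 5 } },		/* root */
	[8] = { 9, { 7, 6 } },
	[7] = { 8, { 0, 0 } },
	[6] = { 8, { 0, 0 } },
	[5] = { 9, { 0, 0 } },
};

static unsigned tree_next(unsigned id)
{
	unsigned parent, n = tbl[id].child[0];

	if (n)
		return n;

	while ((parent = tbl[id].parent)) {
		n = tbl[parent].child[1];
		if (n && n != id)
			return n;
		id = parent;
	}
	return 0;
}

int main(void)
{
	unsigned id;

	for (id = 9; id; id = tree_next(id))
		printf("%u ", id);	/* prints: 9 8 7 6 5 */
	printf("\n");
	return 0;
}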
POS(0, *subvol_id), + 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&s->v, false); + } + + return ret; +} + +static int check_snapshot_tree(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_snapshot_tree st; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct printbuf buf = PRINTBUF; + u32 root_id; + int ret; + + if (k.k->type != KEY_TYPE_snapshot_tree) + return 0; + + st = bkey_s_c_to_snapshot_tree(k); + root_id = le32_to_cpu(st.v->root_snapshot); + + ret = snapshot_lookup(trans, root_id, &s); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret || + root_id != bch2_snapshot_root(c, root_id) || + st.k->p.offset != le32_to_cpu(s.tree), + c, + "snapshot tree points to missing/incorrect snapshot:\n %s", + (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, 0); + goto err; + } + + ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol), + false, 0, &subvol); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + if (fsck_err_on(ret, c, + "snapshot tree points to missing subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(!bch2_snapshot_is_ancestor_early(c, + le32_to_cpu(subvol.snapshot), + root_id), c, + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || + fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), c, + "snapshot tree points to snapshot subvolume:\n %s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { + struct bkey_i_snapshot_tree *u; + u32 subvol_id; + + ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id); + if (ret) + goto err; + + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.master_subvol = cpu_to_le32(subvol_id); + st = snapshot_tree_i_to_s_c(u); + } +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +/* + * For each snapshot_tree, make sure it points to the root of a snapshot tree + * and that snapshot entry points back to it, or delete it. + * + * And, make sure it points to a subvolume within that snapshot tree, or correct + * it to point to the oldest subvolume within that snapshot tree. 
+ */ +int bch2_check_snapshot_trees(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_snapshot_trees, POS_MIN, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot_tree(&trans, &iter, k))); + + if (ret) + bch_err(c, "error %i checking snapshot trees", ret); + return ret; +} + +/* + * Look up snapshot tree for @tree_id and find root, + * make sure @snap_id is a descendent: + */ +static int snapshot_tree_ptr_good(struct btree_trans *trans, + u32 snap_id, u32 tree_id) +{ + struct bch_snapshot_tree s_t; + int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + + if (bch2_err_matches(ret, ENOENT)) + return 0; + if (ret) + return ret; + + return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot)); +} + +static u32 snapshot_skiplist_get(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + + if (!id) + return 0; + + rcu_read_lock(); + s = snapshot_t(c, id); + if (s->parent) + id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); + rcu_read_unlock(); + + return id; +} + +static int snapshot_skiplist_good(struct btree_trans *trans, struct bch_snapshot s) +{ + struct bch_snapshot a; + unsigned i; + int ret; + + for (i = 0; i < 3; i++) { + if (!s.parent != !s.skip[i]) + return false; + + if (!s.parent) + continue; + + ret = snapshot_lookup(trans, le32_to_cpu(s.skip[i]), &a); + if (bch2_err_matches(ret, ENOENT)) + return false; + if (ret) + return ret; + + if (a.tree != s.tree) + return false; + } + + return true; +} + +/* + * snapshot_tree pointer was incorrect: look up root snapshot node, make sure + * its snapshot_tree pointer is correct (allocate new one if necessary), then + * update this node's pointer to root node's pointer: + */ +static int snapshot_tree_ptr_repair(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + struct bch_snapshot *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter root_iter; + struct bch_snapshot_tree s_t; + struct bkey_s_c_snapshot root; + struct bkey_i_snapshot *u; + u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id; + int ret; + + root = bch2_bkey_get_iter_typed(trans, &root_iter, + BTREE_ID_snapshots, POS(0, root_id), + BTREE_ITER_WITH_UPDATES, snapshot); + ret = bkey_err(root); + if (ret) + goto err; + + tree_id = le32_to_cpu(root.v->tree); + + ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t); + if (ret && !bch2_err_matches(ret, ENOENT)) + return ret; + + if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) { + u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u) ?: + snapshot_tree_create(trans, root_id, + bch2_snapshot_tree_oldest_subvol(c, root_id), + &tree_id); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + if (k.k->p.offset == root_id) + *s = u->v; + } + + if (k.k->p.offset != root_id) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.tree = cpu_to_le32(tree_id); + *s = u->v; + } +err: + bch2_trans_iter_exit(trans, &root_iter); + return ret; +} + +static int check_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bch_snapshot s; + struct bch_subvolume subvol; + struct bch_snapshot v; + struct bkey_i_snapshot *u; + u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset); + u32 real_depth; + 
struct printbuf buf = PRINTBUF; + bool should_have_subvol; + u32 i, id; + int ret = 0; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + memset(&s, 0, sizeof(s)); + memcpy(&s, k.v, bkey_val_bytes(k.k)); + + id = le32_to_cpu(s.parent); + if (id) { + ret = snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot with nonexistent parent:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + if (le32_to_cpu(v.children[0]) != k.k->p.offset && + le32_to_cpu(v.children[1]) != k.k->p.offset) { + bch_err(c, "snapshot parent %u missing pointer to child %llu", + id, k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + + for (i = 0; i < 2 && s.children[i]; i++) { + id = le32_to_cpu(s.children[i]); + + ret = snapshot_lookup(trans, id, &v); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot node %llu has nonexistent child %u", + k.k->p.offset, id); + if (ret) + goto err; + + if (le32_to_cpu(v.parent) != k.k->p.offset) { + bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)", + id, le32_to_cpu(v.parent), k.k->p.offset); + ret = -EINVAL; + goto err; + } + } + + should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && + !BCH_SNAPSHOT_DELETED(&s); + + if (should_have_subvol) { + id = le32_to_cpu(s.subvol); + ret = bch2_subvolume_get(trans, id, 0, false, &subvol); + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "snapshot points to nonexistent subvolume:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + if (ret) + goto err; + + if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) { + bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL", + k.k->p.offset); + ret = -EINVAL; + goto err; + } + } else { + if (fsck_err_on(s.subvol, c, "snapshot should not point to subvol:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.subvol = 0; + s = u->v; + } + } + + ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree)); + if (ret < 0) + goto err; + + if (fsck_err_on(!ret, c, "snapshot points to missing/incorrect tree:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = snapshot_tree_ptr_repair(trans, iter, k, &s); + if (ret) + goto err; + } + ret = 0; + + real_depth = bch2_snapshot_depth(c, parent_id); + + if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, c, + "snapshot with incorrect depth fields, should be %u:\n %s", + real_depth, + (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + u->v.depth = cpu_to_le32(real_depth); + s = u->v; + } + + ret = snapshot_skiplist_good(trans, s); + if (ret < 0) + goto err; + + if (!ret && + (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists || + fsck_err(c, "snapshot with bad skiplist field:\n %s", + (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) { + u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); + ret = PTR_ERR_OR_ZERO(u); + if (ret) + goto err; + + for (i = 0; i < ARRAY_SIZE(u->v.skip); i++) + u->v.skip[i] = cpu_to_le32(snapshot_skiplist_get(c, parent_id)); + + bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_int); + s = u->v; + } + ret = 0; +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_snapshots(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + /* + * We iterate backwards as 
checking/fixing the depth field requires that + * the parent's depth already be correct: + */ + ret = bch2_trans_run(c, + for_each_btree_key_reverse_commit(&trans, iter, + BTREE_ID_snapshots, POS_MAX, + BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_snapshot(&trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int check_subvol(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c_subvolume subvol; + struct bch_snapshot snapshot; + unsigned snapid; + int ret = 0; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + subvol = bkey_s_c_to_subvolume(k); + snapid = le32_to_cpu(subvol.v->snapshot); + ret = snapshot_lookup(trans, snapid, &snapshot); + + if (bch2_err_matches(ret, ENOENT)) + bch_err(c, "subvolume %llu points to nonexistent snapshot %u", + k.k->p.offset, snapid); + if (ret) + return ret; + + if (BCH_SUBVOLUME_UNLINKED(subvol.v)) { + bch2_fs_lazy_rw(c); + + ret = bch2_subvolume_delete(trans, iter->pos.offset); + if (ret) + bch_err(c, "error deleting subvolume %llu: %s", + iter->pos.offset, bch2_err_str(ret)); + return ret ?: -BCH_ERR_transaction_restart_nested; + } + + if (!BCH_SUBVOLUME_SNAP(subvol.v)) { + u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); + u32 snapshot_tree; + struct bch_snapshot_tree st; + + rcu_read_lock(); + snapshot_tree = snapshot_t(c, snapshot_root)->tree; + rcu_read_unlock(); + + ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); + + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "%s: snapshot tree %u not found", __func__, snapshot_tree); + + if (ret) + return ret; + + if (fsck_err_on(le32_to_cpu(st.master_subvol) != subvol.k->p.offset, c, + "subvolume %llu is not set as snapshot but is not master subvolume", + k.k->p.offset)) { + struct bkey_i_subvolume *s = + bch2_bkey_make_mut_typed(trans, iter, &subvol.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + SET_BCH_SUBVOLUME_SNAP(&s->v, true); + } + } + +fsck_err: + return ret; +} + +int bch2_check_subvols(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + ret = bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, + check_subvol(&trans, &iter, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +void bch2_fs_snapshots_exit(struct bch_fs *c) +{ + kfree(c->snapshots); +} + +int bch2_snapshots_read(struct bch_fs *c) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + ret = bch2_trans_run(c, + for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_mark_snapshot(&trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?: + bch2_snapshot_set_equiv(&trans, k))); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +/* + * Mark a snapshot as deleted, for future cleanup: + */ +static int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) +{ + struct btree_iter iter; + struct bkey_i_snapshot *s; + int ret = 0; + + s = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(s); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), + trans->c, "missing snapshot %u", id); + return ret; + } + + /* already deleted? 
*/ + if (BCH_SNAPSHOT_DELETED(&s->v)) + goto err; + + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); + s->v.subvol = 0; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; + struct btree_iter tree_iter = (struct btree_iter) { NULL }; + struct bkey_s_c_snapshot s; + u32 parent_id; + unsigned i; + int ret = 0; + + s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_INTENT, snapshot); + ret = bkey_err(s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", id); + + if (ret) + goto err; + + BUG_ON(!BCH_SNAPSHOT_DELETED(s.v)); + parent_id = le32_to_cpu(s.v->parent); + + if (parent_id) { + struct bkey_i_snapshot *parent; + + parent = bch2_bkey_get_mut_typed(trans, &p_iter, + BTREE_ID_snapshots, POS(0, parent_id), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(parent); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "missing snapshot %u", parent_id); + goto err; + } + + for (i = 0; i < 2; i++) + if (le32_to_cpu(parent->v.children[i]) == id) + break; + + if (i == 2) + bch_err(c, "snapshot %u missing child pointer to %u", + parent_id, id); + else + parent->v.children[i] = 0; + + if (le32_to_cpu(parent->v.children[0]) < + le32_to_cpu(parent->v.children[1])) + swap(parent->v.children[0], + parent->v.children[1]); + } else { + /* + * We're deleting the root of a snapshot tree: update the + * snapshot_tree entry to point to the new root, or delete it if + * this is the last snapshot ID in this tree: + */ + struct bkey_i_snapshot_tree *s_t; + + BUG_ON(s.v->children[1]); + + s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + 0, snapshot_tree); + ret = PTR_ERR_OR_ZERO(s_t); + if (ret) + goto err; + + if (s.v->children[0]) { + s_t->v.root_snapshot = s.v->children[0]; + } else { + s_t->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s_t->k, 0); + } + } + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &tree_iter); + bch2_trans_iter_exit(trans, &p_iter); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_i_snapshot *n; + struct bkey_s_c k; + unsigned i, j; + u32 depth = bch2_snapshot_depth(c, parent); + int ret; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, + POS_MIN, BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + for (i = 0; i < nr_snapids; i++) { + k = bch2_btree_iter_prev_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k || !k.k->p.offset) { + ret = -BCH_ERR_ENOSPC_snapshot_create; + goto err; + } + + n = bch2_bkey_alloc(trans, &iter, 0, snapshot); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.flags = 0; + n->v.parent = cpu_to_le32(parent); + n->v.subvol = cpu_to_le32(snapshot_subvols[i]); + n->v.tree = cpu_to_le32(tree); + n->v.depth = cpu_to_le32(depth); + + for (j = 0; j < ARRAY_SIZE(n->v.skip); j++) + n->v.skip[j] = cpu_to_le32(snapshot_skiplist_get(c, parent)); + + bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_int); + SET_BCH_SNAPSHOT_SUBVOL(&n->v, true); + + ret = 
bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, + bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0); + if (ret) + goto err; + + new_snapids[i] = iter.pos.offset; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create new snapshot IDs as children of an existing snapshot ID: + */ +static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct btree_iter iter; + struct bkey_i_snapshot *n_parent; + int ret = 0; + + n_parent = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_snapshots, POS(0, parent), + 0, snapshot); + ret = PTR_ERR_OR_ZERO(n_parent); + if (unlikely(ret)) { + if (bch2_err_matches(ret, ENOENT)) + bch_err(trans->c, "snapshot %u not found", parent); + return ret; + } + + if (n_parent->v.children[0] || n_parent->v.children[1]) { + bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children"); + ret = -EINVAL; + goto err; + } + + ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree), + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + goto err; + + n_parent->v.children[0] = cpu_to_le32(new_snapids[0]); + n_parent->v.children[1] = cpu_to_le32(new_snapids[1]); + n_parent->v.subvol = 0; + SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +/* + * Create a snapshot node that is the root of a new tree: + */ +static int bch2_snapshot_node_create_tree(struct btree_trans *trans, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + struct bkey_i_snapshot_tree *n_tree; + int ret; + + n_tree = __snapshot_tree_create(trans); + ret = PTR_ERR_OR_ZERO(n_tree) ?: + create_snapids(trans, 0, n_tree->k.p.offset, + new_snapids, snapshot_subvols, nr_snapids); + if (ret) + return ret; + + n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]); + n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]); + return 0; +} + +int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, + u32 *new_snapids, + u32 *snapshot_subvols, + unsigned nr_snapids) +{ + BUG_ON((parent == 0) != (nr_snapids == 1)); + BUG_ON((parent != 0) != (nr_snapids == 2)); + + return parent + ? 
bch2_snapshot_node_create_children(trans, parent, + new_snapids, snapshot_subvols, nr_snapids) + : bch2_snapshot_node_create_tree(trans, + new_snapids, snapshot_subvols, nr_snapids); + +} + +static int snapshot_delete_key(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + snapshot_id_list *deleted, + snapshot_id_list *equiv_seen, + struct bpos *last_pos) +{ + struct bch_fs *c = trans->c; + u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot); + + if (!bkey_eq(k.k->p, *last_pos)) + equiv_seen->nr = 0; + *last_pos = k.k->p; + + if (snapshot_list_has_id(deleted, k.k->p.snapshot) || + snapshot_list_has_id(equiv_seen, equiv)) { + return bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE); + } else { + return snapshot_list_add(c, equiv_seen, equiv); + } +} + +static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + struct bkey_s_c_snapshot snap; + u32 children[2]; + int ret; + + if (k.k->type != KEY_TYPE_snapshot) + return 0; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v) || + BCH_SNAPSHOT_SUBVOL(snap.v)) + return 0; + + children[0] = le32_to_cpu(snap.v->children[0]); + children[1] = le32_to_cpu(snap.v->children[1]); + + ret = snapshot_live(trans, children[0]) ?: + snapshot_live(trans, children[1]); + if (ret < 0) + return ret; + + if (!ret) + return bch2_snapshot_node_set_deleted(trans, k.k->p.offset); + return 0; +} + +int bch2_delete_dead_snapshots(struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_s_c_snapshot snap; + snapshot_id_list deleted = { 0 }; + u32 i, id; + int ret = 0; + + if (!test_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags)) + return 0; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) { + ret = bch2_fs_read_write_early(c); + if (ret) { + bch_err(c, "error deleting dead snapshots: error going rw: %s", bch2_err_str(ret)); + return ret; + } + } + + bch2_trans_init(&trans, c, 0, 0); + + /* + * For every snapshot node: If we have no live children and it's not + * pointed to by a subvolume, delete it: + */ + ret = for_each_btree_key_commit(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + NULL, NULL, 0, + bch2_delete_redundant_snapshot(&trans, &iter, k)); + if (ret) { + bch_err(c, "error deleting redundant snapshots: %s", bch2_err_str(ret)); + goto err; + } + + ret = for_each_btree_key2(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, + bch2_snapshot_set_equiv(&trans, k)); + if (ret) { + bch_err(c, "error in bch2_snapshot_set_equiv: %s", bch2_err_str(ret)); + goto err; + } + + for_each_btree_key(&trans, iter, BTREE_ID_snapshots, + POS_MIN, 0, k, ret) { + if (k.k->type != KEY_TYPE_snapshot) + continue; + + snap = bkey_s_c_to_snapshot(k); + if (BCH_SNAPSHOT_DELETED(snap.v)) { + ret = snapshot_list_add(c, &deleted, k.k->p.offset); + if (ret) + break; + } + } + bch2_trans_iter_exit(&trans, &iter); + + if (ret) { + bch_err(c, "error walking snapshots: %s", bch2_err_str(ret)); + goto err; + } + + for (id = 0; id < BTREE_ID_NR; id++) { + struct bpos last_pos = POS_MIN; + snapshot_id_list equiv_seen = { 0 }; + + if (!btree_type_has_snapshots(id)) + continue; + + ret = for_each_btree_key_commit(&trans, iter, + id, POS_MIN, + BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + snapshot_delete_key(&trans, &iter, k, &deleted, &equiv_seen, &last_pos)); + + darray_exit(&equiv_seen); + + if (ret) { + bch_err(c, "error deleting snapshot keys: %s", bch2_err_str(ret)); + goto err;
+ } + } + + for (i = 0; i < deleted.nr; i++) { + ret = commit_do(&trans, NULL, NULL, 0, + bch2_snapshot_node_delete(&trans, deleted.data[i])); + if (ret) { + bch_err(c, "error deleting snapshot %u: %s", + deleted.data[i], bch2_err_str(ret)); + goto err; + } + } + + clear_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); +err: + darray_exit(&deleted); + bch2_trans_exit(&trans); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static void bch2_delete_dead_snapshots_work(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + + bch2_delete_dead_snapshots(c); + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +void bch2_delete_dead_snapshots_async(struct bch_fs *c) +{ + if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) && + !queue_work(c->write_ref_wq, &c->snapshot_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); +} + +static int bch2_delete_dead_snapshots_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *h) +{ + struct bch_fs *c = trans->c; + + set_bit(BCH_FS_HAVE_DELETED_SNAPSHOTS, &c->flags); + + if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_delete_dead_snapshots) + return 0; + + bch2_delete_dead_snapshots_async(c); + return 0; +} + +/* Subvolumes: */ + +int bch2_subvolume_invalid(const struct bch_fs *c, struct bkey_s_c k, + unsigned flags, struct printbuf *err) +{ + if (bkey_lt(k.k->p, SUBVOL_POS_MIN) || + bkey_gt(k.k->p, SUBVOL_POS_MAX)) { + prt_printf(err, "invalid pos"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k); + + prt_printf(out, "root %llu snapshot id %u", + le64_to_cpu(s.v->inode), + le32_to_cpu(s.v->snapshot)); + + if (bkey_val_bytes(s.k) > offsetof(struct bch_subvolume, parent)) + prt_printf(out, " parent %u", le32_to_cpu(s.v->parent)); +} + +static __always_inline int +bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) +{ + int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), + iter_flags, subvolume, s); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && + inconsistent_if_not_found, + trans->c, "missing subvolume %u", subvol); + return ret; +} + +int bch2_subvolume_get(struct btree_trans *trans, unsigned subvol, + bool inconsistent_if_not_found, + int iter_flags, + struct bch_subvolume *s) +{ + return bch2_subvolume_get_inlined(trans, subvol, inconsistent_if_not_found, iter_flags, s); +} + +int bch2_snapshot_get_subvol(struct btree_trans *trans, u32 snapshot, + struct bch_subvolume *subvol) +{ + struct bch_snapshot snap; + + return snapshot_lookup(trans, snapshot, &snap) ?: + bch2_subvolume_get(trans, le32_to_cpu(snap.subvol), true, 0, subvol); +} + +int bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvol, + u32 *snapid) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_subvolumes, POS(0, subvol), + BTREE_ITER_CACHED| + BTREE_ITER_WITH_UPDATES); + ret = bkey_err(k) ?: k.k->type == KEY_TYPE_subvolume ? 
0 : -BCH_ERR_ENOENT_subvolume; + + if (likely(!ret)) + *snapid = le32_to_cpu(bkey_s_c_to_subvolume(k).v->snapshot); + else if (bch2_err_matches(ret, ENOENT)) + bch2_fs_inconsistent(trans->c, "missing subvolume %u", subvol); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_subvolume_reparent(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + u32 old_parent, u32 new_parent) +{ + struct bkey_i_subvolume *s; + int ret; + + if (k.k->type != KEY_TYPE_subvolume) + return 0; + + if (bkey_val_bytes(k.k) > offsetof(struct bch_subvolume, parent) && + le32_to_cpu(bkey_s_c_to_subvolume(k).v->parent) != old_parent) + return 0; + + s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, subvolume); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + return ret; + + s->v.parent = cpu_to_le32(new_parent); + return 0; +} + +/* + * Scan for subvolumes with parent @subvolid_to_delete, reparent: + */ +static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bch_subvolume s; + + return lockrestart_do(trans, + bch2_subvolume_get(trans, subvolid_to_delete, true, + BTREE_ITER_CACHED, &s)) ?: + for_each_btree_key_commit(trans, iter, + BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k, + NULL, NULL, BTREE_INSERT_NOFAIL, + bch2_subvolume_reparent(trans, &iter, k, + subvolid_to_delete, le32_to_cpu(s.parent))); +} + +/* + * Delete subvolume, mark snapshot ID as deleted, queue up snapshot + * deletion/cleanup: + */ +static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_s_c_subvolume subvol; + struct btree_trans_commit_hook *h; + u32 snapid; + int ret = 0; + + subvol = bch2_bkey_get_iter_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED|BTREE_ITER_INTENT, + subvolume); + ret = bkey_err(subvol); + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); + if (ret) + return ret; + + snapid = le32_to_cpu(subvol.v->snapshot); + + ret = bch2_btree_delete_at(trans, &iter, 0); + if (ret) + goto err; + + ret = bch2_snapshot_node_set_deleted(trans, snapid); + if (ret) + goto err; + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + goto err; + + h->fn = bch2_delete_dead_snapshots_hook; + bch2_trans_commit_hook(trans, h); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) +{ + return bch2_subvolumes_reparent(trans, subvolid) ?: + commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL, + __bch2_subvolume_delete(trans, subvolid)); +} + +static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) +{ + struct bch_fs *c = container_of(work, struct bch_fs, + snapshot_wait_for_pagecache_and_delete_work); + snapshot_id_list s; + u32 *id; + int ret = 0; + + while (!ret) { + mutex_lock(&c->snapshots_unlinked_lock); + s = c->snapshots_unlinked; + darray_init(&c->snapshots_unlinked); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (!s.nr) + break; + + bch2_evict_subvolume_inodes(c, &s); + + for (id = s.data; id < s.data + s.nr; id++) { + ret = bch2_trans_run(c, bch2_subvolume_delete(&trans, *id)); + if (ret) { + bch_err(c, "error deleting subvolume %u: %s", *id, bch2_err_str(ret)); + break; + } + } + + darray_exit(&s); + } + + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); +} + +struct subvolume_unlink_hook { + struct 
btree_trans_commit_hook h; + u32 subvol; +}; + +static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans *trans, + struct btree_trans_commit_hook *_h) +{ + struct subvolume_unlink_hook *h = container_of(_h, struct subvolume_unlink_hook, h); + struct bch_fs *c = trans->c; + int ret = 0; + + mutex_lock(&c->snapshots_unlinked_lock); + if (!snapshot_list_has_id(&c->snapshots_unlinked, h->subvol)) + ret = snapshot_list_add(c, &c->snapshots_unlinked, h->subvol); + mutex_unlock(&c->snapshots_unlinked_lock); + + if (ret) + return ret; + + if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) + return -EROFS; + + if (!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) + bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); + return 0; +} + +int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) +{ + struct btree_iter iter; + struct bkey_i_subvolume *n; + struct subvolume_unlink_hook *h; + int ret = 0; + + h = bch2_trans_kmalloc(trans, sizeof(*h)); + ret = PTR_ERR_OR_ZERO(h); + if (ret) + return ret; + + h->h.fn = bch2_subvolume_wait_for_pagecache_and_delete_hook; + h->subvol = subvolid; + bch2_trans_commit_hook(trans, &h->h); + + n = bch2_bkey_get_mut_typed(trans, &iter, + BTREE_ID_subvolumes, POS(0, subvolid), + BTREE_ITER_CACHED, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, + "missing subvolume %u", subvolid); + return ret; + } + + SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_subvolume_create(struct btree_trans *trans, u64 inode, + u32 src_subvolid, + u32 *new_subvolid, + u32 *new_snapshotid, + bool ro) +{ + struct bch_fs *c = trans->c; + struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct bkey_i_subvolume *new_subvol = NULL; + struct bkey_i_subvolume *src_subvol = NULL; + u32 parent = 0, new_nodes[2], snapshot_subvols[2]; + int ret = 0; + + ret = bch2_bkey_get_empty_slot(trans, &dst_iter, + BTREE_ID_subvolumes, POS(0, U32_MAX)); + if (ret == -BCH_ERR_ENOSPC_btree_slot) + ret = -BCH_ERR_ENOSPC_subvolume_create; + if (ret) + return ret; + + snapshot_subvols[0] = dst_iter.pos.offset; + snapshot_subvols[1] = src_subvolid; + + if (src_subvolid) { + /* Creating a snapshot: */ + + src_subvol = bch2_bkey_get_mut_typed(trans, &src_iter, + BTREE_ID_subvolumes, POS(0, src_subvolid), + BTREE_ITER_CACHED, subvolume); + ret = PTR_ERR_OR_ZERO(src_subvol); + if (unlikely(ret)) { + bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, + "subvolume %u not found", src_subvolid); + goto err; + } + + parent = le32_to_cpu(src_subvol->v.snapshot); + } + + ret = bch2_snapshot_node_create(trans, parent, new_nodes, + snapshot_subvols, + src_subvolid ? 
2 : 1); + if (ret) + goto err; + + if (src_subvolid) { + src_subvol->v.snapshot = cpu_to_le32(new_nodes[1]); + ret = bch2_trans_update(trans, &src_iter, &src_subvol->k_i, 0); + if (ret) + goto err; + } + + new_subvol = bch2_bkey_alloc(trans, &dst_iter, 0, subvolume); + ret = PTR_ERR_OR_ZERO(new_subvol); + if (ret) + goto err; + + new_subvol->v.flags = 0; + new_subvol->v.snapshot = cpu_to_le32(new_nodes[0]); + new_subvol->v.inode = cpu_to_le64(inode); + new_subvol->v.parent = cpu_to_le32(src_subvolid); + new_subvol->v.otime.lo = cpu_to_le64(bch2_current_time(c)); + new_subvol->v.otime.hi = 0; + + SET_BCH_SUBVOLUME_RO(&new_subvol->v, ro); + SET_BCH_SUBVOLUME_SNAP(&new_subvol->v, src_subvolid != 0); + + *new_subvolid = new_subvol->k.p.offset; + *new_snapshotid = new_nodes[0]; +err: + bch2_trans_iter_exit(trans, &src_iter); + bch2_trans_iter_exit(trans, &dst_iter); + return ret; +} + +int bch2_fs_subvolumes_init(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); + INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, + bch2_subvolume_wait_for_pagecache_and_delete); + mutex_init(&c->snapshots_unlinked_lock); + return 0; +} diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h new file mode 100644 index 000000000..12a08a34e --- /dev/null +++ b/fs/bcachefs/subvolume.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_H +#define _BCACHEFS_SUBVOLUME_H + +#include "darray.h" +#include "subvolume_types.h" + +enum bkey_invalid_flags; + +void bch2_snapshot_tree_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_tree_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); + +#define bch2_bkey_ops_snapshot_tree ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_tree_invalid, \ + .val_to_text = bch2_snapshot_tree_to_text, \ + .min_val_size = 8, \ +}) + +int bch2_snapshot_tree_lookup(struct btree_trans *, u32, struct bch_snapshot_tree *); + +void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +int bch2_snapshot_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, struct bkey_s_c, unsigned); + +#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \ + .key_invalid = bch2_snapshot_invalid, \ + .val_to_text = bch2_snapshot_to_text, \ + .atomic_trigger = bch2_mark_snapshot, \ + .min_val_size = 24, \ +}) + +static inline struct snapshot_t *__snapshot_t(struct snapshot_table *t, u32 id) +{ + return &t->s[U32_MAX - id]; +} + +static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) +{ + return __snapshot_t(rcu_dereference(c->snapshots), id); +} + +static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = snapshot_t(c, id)->tree; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->parent; +} + +static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent_early(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ +#ifdef CONFIG_BCACHEFS_DEBUG + u32 parent = snapshot_t(c, id)->parent; + + if (parent && + snapshot_t(c, id)->depth != snapshot_t(c, parent)->depth + 1) + panic("id %u depth=%u parent %u depth=%u\n", + id, 
snapshot_t(c, id)->depth, + parent, snapshot_t(c, parent)->depth); + + return parent; +#else + return snapshot_t(c, id)->parent; +#endif +} + +static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) +{ + rcu_read_lock(); + while (n--) + id = __bch2_snapshot_parent(c, id); + rcu_read_unlock(); + + return id; +} + +static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) +{ + u32 parent; + + rcu_read_lock(); + while ((parent = __bch2_snapshot_parent(c, id))) + id = parent; + rcu_read_unlock(); + + return id; +} + +static inline u32 __bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + return snapshot_t(c, id)->equiv; +} + +static inline u32 bch2_snapshot_equiv(struct bch_fs *c, u32 id) +{ + rcu_read_lock(); + id = __bch2_snapshot_equiv(c, id); + rcu_read_unlock(); + + return id; +} + +static inline bool bch2_snapshot_is_equiv(struct bch_fs *c, u32 id) +{ + return id == bch2_snapshot_equiv(c, id); +} + +static inline bool bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + bool ret; + + rcu_read_lock(); + s = snapshot_t(c, id); + ret = s->children[0]; + rcu_read_unlock(); + + return ret; +} + +static inline u32 bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) +{ + return !bch2_snapshot_is_internal_node(c, id); +} + +static inline u32 bch2_snapshot_sibling(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *s; + u32 parent = __bch2_snapshot_parent(c, id); + + if (!parent) + return 0; + + s = snapshot_t(c, __bch2_snapshot_parent(c, id)); + if (id == s->children[0]) + return s->children[1]; + if (id == s->children[1]) + return s->children[0]; + return 0; +} + +bool bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); + +static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) +{ + const struct snapshot_t *t; + bool ret; + + rcu_read_lock(); + t = snapshot_t(c, id); + ret = (t->children[0]|t->children[1]) != 0; + rcu_read_unlock(); + + return ret; +} + +static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) +{ + u32 *i; + + darray_for_each(*s, i) + if (*i == id) + return true; + return false; +} + +static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + u32 *i; + + darray_for_each(*s, i) + if (bch2_snapshot_is_ancestor(c, id, *i)) + return true; + return false; +} + +static inline int snapshot_list_add(struct bch_fs *c, snapshot_id_list *s, u32 id) +{ + int ret; + + BUG_ON(snapshot_list_has_id(s, id)); + ret = darray_push(s, id); + if (ret) + bch_err(c, "error reallocating snapshot_id_list (size %zu)", s->size); + return ret; +} + +int bch2_check_snapshot_trees(struct bch_fs *); +int bch2_check_snapshots(struct bch_fs *); +int bch2_check_subvols(struct bch_fs *); + +void bch2_fs_snapshots_exit(struct bch_fs *); +int bch2_snapshots_read(struct bch_fs *); + +int bch2_subvolume_invalid(const struct bch_fs *, struct bkey_s_c, + unsigned, struct printbuf *); +void bch2_subvolume_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_subvolume ((struct bkey_ops) { \ + .key_invalid = bch2_subvolume_invalid, \ + .val_to_text = bch2_subvolume_to_text, \ + .min_val_size = 16, \ +}) + +int bch2_subvolume_get(struct btree_trans *, unsigned, + bool, int, struct bch_subvolume *); +int bch2_snapshot_get_subvol(struct btree_trans *, u32, + struct bch_subvolume *); +int 
bch2_subvolume_get_snapshot(struct btree_trans *, u32, u32 *); + +/* only exported for tests: */ +int bch2_snapshot_node_create(struct btree_trans *, u32, + u32 *, u32 *, unsigned); + +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); + +int bch2_subvolume_unlink(struct btree_trans *, u32); +int bch2_subvolume_create(struct btree_trans *, u64, u32, + u32 *, u32 *, bool); + +int bch2_fs_subvolumes_init(struct bch_fs *); + +#endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h new file mode 100644 index 000000000..86833445a --- /dev/null +++ b/fs/bcachefs/subvolume_types.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUBVOLUME_TYPES_H +#define _BCACHEFS_SUBVOLUME_TYPES_H + +#include "darray.h" + +typedef DARRAY(u32) snapshot_id_list; + +#define IS_ANCESTOR_BITMAP 128 + +struct snapshot_t { + u32 parent; + u32 skip[3]; + u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + u32 equiv; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +}; + +struct snapshot_table { + struct snapshot_t s[0]; +}; + +typedef struct { + u32 subvol; + u64 inum; +} subvol_inum; + +#endif /* _BCACHEFS_SUBVOLUME_TYPES_H */ diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c new file mode 100644 index 000000000..e9ce3f332 --- /dev/null +++ b/fs/bcachefs/super-io.c @@ -0,0 +1,1711 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "btree_update_interior.h" +#include "buckets.h" +#include "checksum.h" +#include "counters.h" +#include "disk_groups.h" +#include "ec.h" +#include "error.h" +#include "io.h" +#include "journal.h" +#include "journal_io.h" +#include "journal_sb.h" +#include "journal_seq_blacklist.h" +#include "recovery.h" +#include "replicas.h" +#include "quota.h" +#include "super-io.h" +#include "super.h" +#include "trace.h" +#include "vstructs.h" + +#include +#include + +struct bch2_metadata_version { + u16 version; + const char *name; + u64 recovery_passes; +}; + +static const struct bch2_metadata_version bch2_metadata_versions[] = { +#define x(n, v, _recovery_passes) { \ + .version = v, \ + .name = #n, \ + .recovery_passes = _recovery_passes, \ +}, + BCH_METADATA_VERSIONS() +#undef x +}; + +void bch2_version_to_text(struct printbuf *out, unsigned v) +{ + const char *str = "(unknown version)"; + + for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) + if (bch2_metadata_versions[i].version == v) { + str = bch2_metadata_versions[i].name; + break; + } + + prt_printf(out, "%u.%u: %s", BCH_VERSION_MAJOR(v), BCH_VERSION_MINOR(v), str); +} + +unsigned bch2_latest_compatible_version(unsigned v) +{ + if (!BCH_VERSION_MAJOR(v)) + return v; + + for (unsigned i = 0; i < ARRAY_SIZE(bch2_metadata_versions); i++) + if (bch2_metadata_versions[i].version > v && + BCH_VERSION_MAJOR(bch2_metadata_versions[i].version) == + BCH_VERSION_MAJOR(v)) + v = bch2_metadata_versions[i].version; + + return v; +} + +u64 bch2_upgrade_recovery_passes(struct bch_fs *c, + unsigned old_version, + unsigned new_version) +{ + u64 ret = 0; + + for (const struct bch2_metadata_version *i = bch2_metadata_versions; + i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions); + i++) + if (i->version > old_version && i->version <= new_version) { + if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK) + ret |= bch2_fsck_recovery_passes(); + ret |= i->recovery_passes; + } + + return ret &= 
~RECOVERY_PASS_ALL_FSCK; +} + +const char * const bch2_sb_fields[] = { +#define x(name, nr) #name, + BCH_SB_FIELDS() +#undef x + NULL +}; + +static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *, + struct printbuf *); + +struct bch_sb_field *bch2_sb_field_get(struct bch_sb *sb, + enum bch_sb_field_type type) +{ + struct bch_sb_field *f; + + /* XXX: need locking around superblock to access optional fields */ + + vstruct_for_each(sb, f) + if (le32_to_cpu(f->type) == type) + return f; + return NULL; +} + +static struct bch_sb_field *__bch2_sb_field_resize(struct bch_sb_handle *sb, + struct bch_sb_field *f, + unsigned u64s) +{ + unsigned old_u64s = f ? le32_to_cpu(f->u64s) : 0; + unsigned sb_u64s = le32_to_cpu(sb->sb->u64s) + u64s - old_u64s; + + BUG_ON(__vstruct_bytes(struct bch_sb, sb_u64s) > sb->buffer_size); + + if (!f && !u64s) { + /* nothing to do: */ + } else if (!f) { + f = vstruct_last(sb->sb); + memset(f, 0, sizeof(u64) * u64s); + f->u64s = cpu_to_le32(u64s); + f->type = 0; + } else { + void *src, *dst; + + src = vstruct_end(f); + + if (u64s) { + f->u64s = cpu_to_le32(u64s); + dst = vstruct_end(f); + } else { + dst = f; + } + + memmove(dst, src, vstruct_end(sb->sb) - src); + + if (dst > src) + memset(src, 0, dst - src); + } + + sb->sb->u64s = cpu_to_le32(sb_u64s); + + return u64s ? f : NULL; +} + +void bch2_sb_field_delete(struct bch_sb_handle *sb, + enum bch_sb_field_type type) +{ + struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); + + if (f) + __bch2_sb_field_resize(sb, f, 0); +} + +/* Superblock realloc/free: */ + +void bch2_free_super(struct bch_sb_handle *sb) +{ + kfree(sb->bio); + if (!IS_ERR_OR_NULL(sb->bdev)) + blkdev_put(sb->bdev, sb->mode); + + kfree(sb->sb); + memset(sb, 0, sizeof(*sb)); +} + +int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s) +{ + size_t new_bytes = __vstruct_bytes(struct bch_sb, u64s); + size_t new_buffer_size; + struct bch_sb *new_sb; + struct bio *bio; + + if (sb->bdev) + new_bytes = max_t(size_t, new_bytes, bdev_logical_block_size(sb->bdev)); + + new_buffer_size = roundup_pow_of_two(new_bytes); + + if (sb->sb && sb->buffer_size >= new_buffer_size) + return 0; + + if (sb->have_layout) { + u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits; + + if (new_bytes > max_bytes) { + pr_err("%pg: superblock too big: want %zu but have %llu", + sb->bdev, new_bytes, max_bytes); + return -BCH_ERR_ENOSPC_sb; + } + } + + if (sb->buffer_size >= new_buffer_size && sb->sb) + return 0; + + if (dynamic_fault("bcachefs:add:super_realloc")) + return -BCH_ERR_ENOMEM_sb_realloc_injected; + + if (sb->have_bio) { + unsigned nr_bvecs = DIV_ROUND_UP(new_buffer_size, PAGE_SIZE); + + bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); + if (!bio) + return -BCH_ERR_ENOMEM_sb_bio_realloc; + + bio_init(bio, NULL, bio->bi_inline_vecs, nr_bvecs, 0); + + kfree(sb->bio); + sb->bio = bio; + } + + new_sb = krealloc(sb->sb, new_buffer_size, GFP_NOFS|__GFP_ZERO); + if (!new_sb) + return -BCH_ERR_ENOMEM_sb_buf_realloc; + + sb->sb = new_sb; + sb->buffer_size = new_buffer_size; + + return 0; +} + +struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *sb, + enum bch_sb_field_type type, + unsigned u64s) +{ + struct bch_sb_field *f = bch2_sb_field_get(sb->sb, type); + ssize_t old_u64s = f ? 
le32_to_cpu(f->u64s) : 0; + ssize_t d = -old_u64s + u64s; + + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) + return NULL; + + if (sb->fs_sb) { + struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb); + struct bch_dev *ca; + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + /* XXX: we're not checking that offline device have enough space */ + + for_each_online_member(ca, c, i) { + struct bch_sb_handle *sb = &ca->disk_sb; + + if (bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s) + d)) { + percpu_ref_put(&ca->ref); + return NULL; + } + } + } + + f = bch2_sb_field_get(sb->sb, type); + f = __bch2_sb_field_resize(sb, f, u64s); + if (f) + f->type = cpu_to_le32(type); + return f; +} + +/* Superblock validate: */ + +static inline void __bch2_sb_layout_size_assert(void) +{ + BUILD_BUG_ON(sizeof(struct bch_sb_layout) != 512); +} + +static int validate_sb_layout(struct bch_sb_layout *layout, struct printbuf *out) +{ + u64 offset, prev_offset, max_sectors; + unsigned i; + + if (!uuid_equal(&layout->magic, &BCACHE_MAGIC) && + !uuid_equal(&layout->magic, &BCHFS_MAGIC)) { + prt_printf(out, "Not a bcachefs superblock layout"); + return -BCH_ERR_invalid_sb_layout; + } + + if (layout->layout_type != 0) { + prt_printf(out, "Invalid superblock layout type %u", + layout->layout_type); + return -BCH_ERR_invalid_sb_layout_type; + } + + if (!layout->nr_superblocks) { + prt_printf(out, "Invalid superblock layout: no superblocks"); + return -BCH_ERR_invalid_sb_layout_nr_superblocks; + } + + if (layout->nr_superblocks > ARRAY_SIZE(layout->sb_offset)) { + prt_printf(out, "Invalid superblock layout: too many superblocks"); + return -BCH_ERR_invalid_sb_layout_nr_superblocks; + } + + max_sectors = 1 << layout->sb_max_size_bits; + + prev_offset = le64_to_cpu(layout->sb_offset[0]); + + for (i = 1; i < layout->nr_superblocks; i++) { + offset = le64_to_cpu(layout->sb_offset[i]); + + if (offset < prev_offset + max_sectors) { + prt_printf(out, "Invalid superblock layout: superblocks overlap\n" + " (sb %u ends at %llu next starts at %llu", + i - 1, prev_offset + max_sectors, offset); + return -BCH_ERR_invalid_sb_layout_superblocks_overlap; + } + prev_offset = offset; + } + + return 0; +} + +static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) +{ + u16 version = le16_to_cpu(sb->version); + u16 version_min = le16_to_cpu(sb->version_min); + + if (!bch2_version_compatible(version)) { + prt_str(out, "Unsupported superblock version "); + bch2_version_to_text(out, version); + prt_str(out, " (min "); + bch2_version_to_text(out, bcachefs_metadata_version_min); + prt_str(out, ", max "); + bch2_version_to_text(out, bcachefs_metadata_version_current); + prt_str(out, ")"); + return -BCH_ERR_invalid_sb_version; + } + + if (!bch2_version_compatible(version_min)) { + prt_str(out, "Unsupported superblock version_min "); + bch2_version_to_text(out, version_min); + prt_str(out, " (min "); + bch2_version_to_text(out, bcachefs_metadata_version_min); + prt_str(out, ", max "); + bch2_version_to_text(out, bcachefs_metadata_version_current); + prt_str(out, ")"); + return -BCH_ERR_invalid_sb_version; + } + + if (version_min > version) { + prt_str(out, "Bad minimum version "); + bch2_version_to_text(out, version_min); + prt_str(out, ", greater than version field "); + bch2_version_to_text(out, version); + return -BCH_ERR_invalid_sb_version; + } + + return 0; +} + +static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out, + int rw) +{ + struct bch_sb *sb = disk_sb->sb; + struct bch_sb_field *f; + 
struct bch_sb_field_members *mi; + enum bch_opt_id opt_id; + u16 block_size; + int ret; + + ret = bch2_sb_compatible(sb, out); + if (ret) + return ret; + + if (sb->features[1] || + (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { + prt_printf(out, "Filesystem has incompatible features"); + return -BCH_ERR_invalid_sb_features; + } + + block_size = le16_to_cpu(sb->block_size); + + if (block_size > PAGE_SECTORS) { + prt_printf(out, "Block size too big (got %u, max %u)", + block_size, PAGE_SECTORS); + return -BCH_ERR_invalid_sb_block_size; + } + + if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { + prt_printf(out, "Bad user UUID (got zeroes)"); + return -BCH_ERR_invalid_sb_uuid; + } + + if (bch2_is_zero(sb->uuid.b, sizeof(sb->uuid))) { + prt_printf(out, "Bad internal UUID (got zeroes)"); + return -BCH_ERR_invalid_sb_uuid; + } + + if (!sb->nr_devices || + sb->nr_devices > BCH_SB_MEMBERS_MAX) { + prt_printf(out, "Bad number of member devices %u (max %u)", + sb->nr_devices, BCH_SB_MEMBERS_MAX); + return -BCH_ERR_invalid_sb_too_many_members; + } + + if (sb->dev_idx >= sb->nr_devices) { + prt_printf(out, "Bad dev_idx (got %u, nr_devices %u)", + sb->dev_idx, sb->nr_devices); + return -BCH_ERR_invalid_sb_dev_idx; + } + + if (!sb->time_precision || + le32_to_cpu(sb->time_precision) > NSEC_PER_SEC) { + prt_printf(out, "Invalid time precision: %u (min 1, max %lu)", + le32_to_cpu(sb->time_precision), NSEC_PER_SEC); + return -BCH_ERR_invalid_sb_time_precision; + } + + if (rw == READ) { + /* + * Been seeing a bug where these are getting inexplicably + * zeroed, so we're now validating them, but we have to be + * careful not to prevent people's filesystems from mounting: + */ + if (!BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); + if (!BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 1000); + } + + for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { + const struct bch_option *opt = bch2_opt_table + opt_id; + + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, opt_id); + + prt_printf(out, "Invalid option "); + ret = bch2_opt_validate(opt, v, out); + if (ret) + return ret; + + printbuf_reset(out); + } + } + + /* validate layout */ + ret = validate_sb_layout(&sb->layout, out); + if (ret) + return ret; + + vstruct_for_each(sb, f) { + if (!f->u64s) { + prt_printf(out, "Invalid superblock: optional field with size 0 (type %u)", + le32_to_cpu(f->type)); + return -BCH_ERR_invalid_sb_field_size; + } + + if (vstruct_next(f) > vstruct_last(sb)) { + prt_printf(out, "Invalid superblock: optional field extends past end of superblock (type %u)", + le32_to_cpu(f->type)); + return -BCH_ERR_invalid_sb_field_size; + } + } + + /* members must be validated first: */ + mi = bch2_sb_get_members(sb); + if (!mi) { + prt_printf(out, "Invalid superblock: member info area missing"); + return -BCH_ERR_invalid_sb_members_missing; + } + + ret = bch2_sb_field_validate(sb, &mi->field, out); + if (ret) + return ret; + + vstruct_for_each(sb, f) { + if (le32_to_cpu(f->type) == BCH_SB_FIELD_members) + continue; + + ret = bch2_sb_field_validate(sb, f, out); + if (ret) + return ret; + } + + return 0; +} + +/* device open: */ + +static void bch2_sb_update(struct bch_fs *c) +{ + struct bch_sb *src = c->disk_sb.sb; + struct bch_sb_field_members *mi = bch2_sb_get_members(src); + struct bch_dev *ca; + unsigned i; + + lockdep_assert_held(&c->sb_lock); + + c->sb.uuid = src->uuid; + c->sb.user_uuid = src->user_uuid; + c->sb.version = le16_to_cpu(src->version); +
c->sb.version_min = le16_to_cpu(src->version_min); + c->sb.version_upgrade_complete = BCH_SB_VERSION_UPGRADE_COMPLETE(src) ?: c->sb.version; + c->sb.nr_devices = src->nr_devices; + c->sb.clean = BCH_SB_CLEAN(src); + c->sb.encryption_type = BCH_SB_ENCRYPTION_TYPE(src); + + c->sb.nsec_per_time_unit = le32_to_cpu(src->time_precision); + c->sb.time_units_per_sec = NSEC_PER_SEC / c->sb.nsec_per_time_unit; + + /* XXX this is wrong, we need a 96 or 128 bit integer type */ + c->sb.time_base_lo = div_u64(le64_to_cpu(src->time_base_lo), + c->sb.nsec_per_time_unit); + c->sb.time_base_hi = le32_to_cpu(src->time_base_hi); + + c->sb.features = le64_to_cpu(src->features[0]); + c->sb.compat = le64_to_cpu(src->compat[0]); + + for_each_member_device(ca, c, i) + ca->mi = bch2_mi_to_cpu(mi->members + i); +} + +static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src) +{ + struct bch_sb_field *src_f, *dst_f; + struct bch_sb *dst = dst_handle->sb; + unsigned i; + + dst->version = src->version; + dst->version_min = src->version_min; + dst->seq = src->seq; + dst->uuid = src->uuid; + dst->user_uuid = src->user_uuid; + memcpy(dst->label, src->label, sizeof(dst->label)); + + dst->block_size = src->block_size; + dst->nr_devices = src->nr_devices; + + dst->time_base_lo = src->time_base_lo; + dst->time_base_hi = src->time_base_hi; + dst->time_precision = src->time_precision; + + memcpy(dst->flags, src->flags, sizeof(dst->flags)); + memcpy(dst->features, src->features, sizeof(dst->features)); + memcpy(dst->compat, src->compat, sizeof(dst->compat)); + + for (i = 0; i < BCH_SB_FIELD_NR; i++) { + int d; + + if ((1U << i) & BCH_SINGLE_DEVICE_SB_FIELDS) + continue; + + src_f = bch2_sb_field_get(src, i); + dst_f = bch2_sb_field_get(dst, i); + + d = (src_f ? le32_to_cpu(src_f->u64s) : 0) - + (dst_f ? le32_to_cpu(dst_f->u64s) : 0); + if (d > 0) { + int ret = bch2_sb_realloc(dst_handle, le32_to_cpu(dst_handle->sb->u64s) + d); + if (ret) + return ret; + + dst = dst_handle->sb; + dst_f = bch2_sb_field_get(dst, i); + } + + dst_f = __bch2_sb_field_resize(dst_handle, dst_f, + src_f ? 
le32_to_cpu(src_f->u64s) : 0); + + if (src_f) + memcpy(dst_f, src_f, vstruct_bytes(src_f)); + } + + return 0; +} + +int bch2_sb_to_fs(struct bch_fs *c, struct bch_sb *src) +{ + int ret; + + lockdep_assert_held(&c->sb_lock); + + ret = bch2_sb_realloc(&c->disk_sb, 0) ?: + __copy_super(&c->disk_sb, src) ?: + bch2_sb_replicas_to_cpu_replicas(c) ?: + bch2_sb_disk_groups_to_cpu(c); + if (ret) + return ret; + + bch2_sb_update(c); + return 0; +} + +int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca) +{ + return __copy_super(&ca->disk_sb, c->disk_sb.sb); +} + +/* read superblock: */ + +static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err) +{ + struct bch_csum csum; + size_t bytes; + int ret; +reread: + bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + sb->bio->bi_iter.bi_sector = offset; + bch2_bio_map(sb->bio, sb->sb, sb->buffer_size); + + ret = submit_bio_wait(sb->bio); + if (ret) { + prt_printf(err, "IO error: %i", ret); + return ret; + } + + if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) && + !uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) { + prt_printf(err, "Not a bcachefs superblock"); + return -BCH_ERR_invalid_sb_magic; + } + + ret = bch2_sb_compatible(sb->sb, err); + if (ret) + return ret; + + bytes = vstruct_bytes(sb->sb); + + if (bytes > 512 << sb->sb->layout.sb_max_size_bits) { + prt_printf(err, "Invalid superblock: too big (got %zu bytes, layout max %lu)", + bytes, 512UL << sb->sb->layout.sb_max_size_bits); + return -BCH_ERR_invalid_sb_too_big; + } + + if (bytes > sb->buffer_size) { + ret = bch2_sb_realloc(sb, le32_to_cpu(sb->sb->u64s)); + if (ret) + return ret; + goto reread; + } + + if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) { + prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb)); + return -BCH_ERR_invalid_sb_csum_type; + } + + /* XXX: verify MACs */ + csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb), + null_nonce(), sb->sb); + + if (bch2_crc_cmp(csum, sb->sb->csum)) { + prt_printf(err, "bad checksum"); + return -BCH_ERR_invalid_sb_csum; + } + + sb->seq = le64_to_cpu(sb->sb->seq); + + return 0; +} + +int bch2_read_super(const char *path, struct bch_opts *opts, + struct bch_sb_handle *sb) +{ + u64 offset = opt_get(*opts, sb); + struct bch_sb_layout layout; + struct printbuf err = PRINTBUF; + __le64 *i; + int ret; +#ifndef __KERNEL__ +retry: +#endif + memset(sb, 0, sizeof(*sb)); + sb->mode = FMODE_READ; + sb->have_bio = true; + +#ifndef __KERNEL__ + if (opt_get(*opts, direct_io) == false) + sb->mode |= FMODE_BUFFERED; +#endif + + if (!opt_get(*opts, noexcl)) + sb->mode |= FMODE_EXCL; + + if (!opt_get(*opts, nochanges)) + sb->mode |= FMODE_WRITE; + + sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + if (IS_ERR(sb->bdev) && + PTR_ERR(sb->bdev) == -EACCES && + opt_get(*opts, read_only)) { + sb->mode &= ~FMODE_WRITE; + + sb->bdev = blkdev_get_by_path(path, sb->mode, sb); + if (!IS_ERR(sb->bdev)) + opt_set(*opts, nochanges, true); + } + + if (IS_ERR(sb->bdev)) { + ret = PTR_ERR(sb->bdev); + goto out; + } + + ret = bch2_sb_realloc(sb, 0); + if (ret) { + prt_printf(&err, "error allocating memory for superblock"); + goto err; + } + + if (bch2_fs_init_fault("read_super")) { + prt_printf(&err, "dynamic fault"); + ret = -EFAULT; + goto err; + } + + ret = read_one_super(sb, offset, &err); + if (!ret) + goto got_super; + + if (opt_defined(*opts, sb)) + goto err; + + printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s", + path, err.buf); + printbuf_reset(&err); + + /* + * Error reading primary superblock - read 
location of backup + * superblocks: + */ + bio_reset(sb->bio, sb->bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + sb->bio->bi_iter.bi_sector = BCH_SB_LAYOUT_SECTOR; + /* + * use sb buffer to read layout, since sb buffer is page aligned but + * layout won't be: + */ + bch2_bio_map(sb->bio, sb->sb, sizeof(struct bch_sb_layout)); + + ret = submit_bio_wait(sb->bio); + if (ret) { + prt_printf(&err, "IO error: %i", ret); + goto err; + } + + memcpy(&layout, sb->sb, sizeof(layout)); + ret = validate_sb_layout(&layout, &err); + if (ret) + goto err; + + for (i = layout.sb_offset; + i < layout.sb_offset + layout.nr_superblocks; i++) { + offset = le64_to_cpu(*i); + + if (offset == opt_get(*opts, sb)) + continue; + + ret = read_one_super(sb, offset, &err); + if (!ret) + goto got_super; + } + + goto err; + +got_super: + if (le16_to_cpu(sb->sb->block_size) << 9 < + bdev_logical_block_size(sb->bdev) && + opt_get(*opts, direct_io)) { +#ifndef __KERNEL__ + opt_set(*opts, direct_io, false); + bch2_free_super(sb); + goto retry; +#endif + prt_printf(&err, "block size (%u) smaller than device block size (%u)", + le16_to_cpu(sb->sb->block_size) << 9, + bdev_logical_block_size(sb->bdev)); + ret = -BCH_ERR_block_size_too_small; + goto err; + } + + ret = 0; + sb->have_layout = true; + + ret = bch2_sb_validate(sb, &err, READ); + if (ret) { + printk(KERN_ERR "bcachefs (%s): error validating superblock: %s", + path, err.buf); + goto err_no_print; + } +out: + printbuf_exit(&err); + return ret; +err: + printk(KERN_ERR "bcachefs (%s): error reading superblock: %s", + path, err.buf); +err_no_print: + bch2_free_super(sb); + goto out; +} + +/* write superblock: */ + +static void write_super_endio(struct bio *bio) +{ + struct bch_dev *ca = bio->bi_private; + + /* XXX: return errors directly */ + + if (bch2_dev_io_err_on(bio->bi_status, ca, "superblock write error: %s", + bch2_blk_status_to_str(bio->bi_status))) + ca->sb_write_error = 1; + + closure_put(&ca->fs->sb_write); + percpu_ref_put(&ca->io_ref); +} + +static void read_back_super(struct bch_fs *c, struct bch_dev *ca) +{ + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_READ|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = le64_to_cpu(sb->layout.sb_offset[0]); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + bch2_bio_map(bio, ca->sb_read_scratch, PAGE_SIZE); + + this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], + bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); + closure_bio_submit(bio, &c->sb_write); +} + +static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) +{ + struct bch_sb *sb = ca->disk_sb.sb; + struct bio *bio = ca->disk_sb.bio; + + sb->offset = sb->layout.sb_offset[idx]; + + SET_BCH_SB_CSUM_TYPE(sb, bch2_csum_opt_to_type(c->opts.metadata_checksum, false)); + sb->csum = csum_vstruct(c, BCH_SB_CSUM_TYPE(sb), + null_nonce(), sb); + + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); + bio->bi_iter.bi_sector = le64_to_cpu(sb->offset); + bio->bi_end_io = write_super_endio; + bio->bi_private = ca; + bch2_bio_map(bio, sb, + roundup((size_t) vstruct_bytes(sb), + bdev_logical_block_size(ca->disk_sb.bdev))); + + this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], + bio_sectors(bio)); + + percpu_ref_get(&ca->io_ref); + closure_bio_submit(bio, &c->sb_write); +} + +int bch2_write_super(struct bch_fs *c) +{ + struct closure *cl = &c->sb_write; + struct bch_dev *ca; + struct printbuf err = PRINTBUF; + unsigned i, sb = 0, nr_wrote; + struct 
bch_devs_mask sb_written; + bool wrote, can_mount_without_written, can_mount_with_written; + unsigned degraded_flags = BCH_FORCE_IF_DEGRADED; + int ret = 0; + + trace_and_count(c, write_super, c, _RET_IP_); + + if (c->opts.very_degraded) + degraded_flags |= BCH_FORCE_IF_LOST; + + lockdep_assert_held(&c->sb_lock); + + closure_init_stack(cl); + memset(&sb_written, 0, sizeof(sb_written)); + + /* Make sure we're using the new magic numbers: */ + c->disk_sb.sb->magic = BCHFS_MAGIC; + c->disk_sb.sb->layout.magic = BCHFS_MAGIC; + + le64_add_cpu(&c->disk_sb.sb->seq, 1); + + if (test_bit(BCH_FS_ERROR, &c->flags)) + SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1); + if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags)) + SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1); + + SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN); + + bch2_sb_counters_from_cpu(c); + + for_each_online_member(ca, c, i) + bch2_sb_from_fs(c, ca); + + for_each_online_member(ca, c, i) { + printbuf_reset(&err); + + ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE); + if (ret) { + bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); + percpu_ref_put(&ca->io_ref); + goto out; + } + } + + if (c->opts.nochanges) + goto out; + + /* + * Defer writing the superblock until filesystem initialization is + * complete - don't write out a partly initialized superblock: + */ + if (!BCH_SB_INITIALIZED(c->disk_sb.sb)) + goto out; + + for_each_online_member(ca, c, i) { + __set_bit(ca->dev_idx, sb_written.d); + ca->sb_write_error = 0; + } + + for_each_online_member(ca, c, i) + read_back_super(c, ca); + closure_sync(cl); + + for_each_online_member(ca, c, i) { + if (ca->sb_write_error) + continue; + + if (le64_to_cpu(ca->sb_read_scratch->seq) < ca->disk_sb.seq) { + bch2_fs_fatal_error(c, + "Superblock write was silently dropped! 
(seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -BCH_ERR_erofs_sb_err; + goto out; + } + + if (le64_to_cpu(ca->sb_read_scratch->seq) > ca->disk_sb.seq) { + bch2_fs_fatal_error(c, + "Superblock modified by another process (seq %llu expected %llu)", + le64_to_cpu(ca->sb_read_scratch->seq), + ca->disk_sb.seq); + percpu_ref_put(&ca->io_ref); + ret = -BCH_ERR_erofs_sb_err; + goto out; + } + } + + do { + wrote = false; + for_each_online_member(ca, c, i) + if (!ca->sb_write_error && + sb < ca->disk_sb.sb->layout.nr_superblocks) { + write_one_super(c, ca, sb); + wrote = true; + } + closure_sync(cl); + sb++; + } while (wrote); + + for_each_online_member(ca, c, i) { + if (ca->sb_write_error) + __clear_bit(ca->dev_idx, sb_written.d); + else + ca->disk_sb.seq = le64_to_cpu(ca->disk_sb.sb->seq); + } + + nr_wrote = dev_mask_nr(&sb_written); + + can_mount_with_written = + bch2_have_enough_devs(c, sb_written, degraded_flags, false); + + for (i = 0; i < ARRAY_SIZE(sb_written.d); i++) + sb_written.d[i] = ~sb_written.d[i]; + + can_mount_without_written = + bch2_have_enough_devs(c, sb_written, degraded_flags, false); + + /* + * If we would be able to mount _without_ the devices we successfully + * wrote superblocks to, we weren't able to write to enough devices: + * + * Exception: if we can mount without the successes because we haven't + * written anything (new filesystem), we continue if we'd be able to + * mount with the devices we did successfully write to: + */ + if (bch2_fs_fatal_err_on(!nr_wrote || + !can_mount_with_written || + (can_mount_without_written && + !can_mount_with_written), c, + "Unable to write superblock to sufficient devices (from %ps)", + (void *) _RET_IP_)) + ret = -1; +out: + /* Make new options visible after they're persistent: */ + bch2_sb_update(c); + printbuf_exit(&err); + return ret; +} + +void __bch2_check_set_feature(struct bch_fs *c, unsigned feat) +{ + mutex_lock(&c->sb_lock); + if (!(c->sb.features & (1ULL << feat))) { + c->disk_sb.sb->features[0] |= cpu_to_le64(1ULL << feat); + + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); +} + +/* BCH_SB_FIELD_members: */ + +static int bch2_sb_members_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); + unsigned i; + + if ((void *) (mi->members + sb->nr_devices) > + vstruct_end(&mi->field)) { + prt_printf(err, "too many devices for section size"); + return -BCH_ERR_invalid_sb_members; + } + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + + if (!bch2_member_exists(m)) + continue; + + if (le64_to_cpu(m->nbuckets) > LONG_MAX) { + prt_printf(err, "device %u: too many buckets (got %llu, max %lu)", + i, le64_to_cpu(m->nbuckets), LONG_MAX); + return -BCH_ERR_invalid_sb_members; + } + + if (le64_to_cpu(m->nbuckets) - + le16_to_cpu(m->first_bucket) < BCH_MIN_NR_NBUCKETS) { + prt_printf(err, "device %u: not enough buckets (got %llu, min %u)", + i, le64_to_cpu(m->nbuckets), BCH_MIN_NR_NBUCKETS); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m->bucket_size) < + le16_to_cpu(sb->block_size)) { + prt_printf(err, "device %u: bucket size %u smaller than block size %u", + i, le16_to_cpu(m->bucket_size), le16_to_cpu(sb->block_size)); + return -BCH_ERR_invalid_sb_members; + } + + if (le16_to_cpu(m->bucket_size) < + BCH_SB_BTREE_NODE_SIZE(sb)) { + prt_printf(err, "device %u: bucket size %u smaller than btree node size %llu", + i,
le16_to_cpu(m->bucket_size), BCH_SB_BTREE_NODE_SIZE(sb)); + return -BCH_ERR_invalid_sb_members; + } + } + + return 0; +} + +static void bch2_sb_members_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_members *mi = field_to_type(f, members); + struct bch_sb_field_disk_groups *gi = bch2_sb_get_disk_groups(sb); + unsigned i; + + for (i = 0; i < sb->nr_devices; i++) { + struct bch_member *m = mi->members + i; + unsigned data_have = bch2_sb_dev_has_data(sb, i); + u64 bucket_size = le16_to_cpu(m->bucket_size); + u64 device_size = le64_to_cpu(m->nbuckets) * bucket_size; + + if (!bch2_member_exists(m)) + continue; + + prt_printf(out, "Device:"); + prt_tab(out); + prt_printf(out, "%u", i); + prt_newline(out); + + printbuf_indent_add(out, 2); + + prt_printf(out, "UUID:"); + prt_tab(out); + pr_uuid(out, m->uuid.b); + prt_newline(out); + + prt_printf(out, "Size:"); + prt_tab(out); + prt_units_u64(out, device_size << 9); + prt_newline(out); + + prt_printf(out, "Bucket size:"); + prt_tab(out); + prt_units_u64(out, bucket_size << 9); + prt_newline(out); + + prt_printf(out, "First bucket:"); + prt_tab(out); + prt_printf(out, "%u", le16_to_cpu(m->first_bucket)); + prt_newline(out); + + prt_printf(out, "Buckets:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(m->nbuckets)); + prt_newline(out); + + prt_printf(out, "Last mount:"); + prt_tab(out); + if (m->last_mount) + pr_time(out, le64_to_cpu(m->last_mount)); + else + prt_printf(out, "(never)"); + prt_newline(out); + + prt_printf(out, "State:"); + prt_tab(out); + prt_printf(out, "%s", + BCH_MEMBER_STATE(m) < BCH_MEMBER_STATE_NR + ? bch2_member_states[BCH_MEMBER_STATE(m)] + : "unknown"); + prt_newline(out); + + prt_printf(out, "Label:"); + prt_tab(out); + if (BCH_MEMBER_GROUP(m)) { + unsigned idx = BCH_MEMBER_GROUP(m) - 1; + + if (idx < disk_groups_nr(gi)) + prt_printf(out, "%s (%u)", + gi->entries[idx].label, idx); + else + prt_printf(out, "(bad disk labels section)"); + } else { + prt_printf(out, "(none)"); + } + prt_newline(out); + + prt_printf(out, "Data allowed:"); + prt_tab(out); + if (BCH_MEMBER_DATA_ALLOWED(m)) + prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(m)); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Has data:"); + prt_tab(out); + if (data_have) + prt_bitflags(out, bch2_data_types, data_have); + else + prt_printf(out, "(none)"); + prt_newline(out); + + prt_printf(out, "Discard:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_DISCARD(m)); + prt_newline(out); + + prt_printf(out, "Freespace initialized:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_MEMBER_FREESPACE_INITIALIZED(m)); + prt_newline(out); + + printbuf_indent_sub(out, 2); + } +} + +static const struct bch_sb_field_ops bch_sb_field_ops_members = { + .validate = bch2_sb_members_validate, + .to_text = bch2_sb_members_to_text, +}; + +/* BCH_SB_FIELD_crypt: */ + +static int bch2_sb_crypt_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + if (vstruct_bytes(&crypt->field) < sizeof(*crypt)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&crypt->field), sizeof(*crypt)); + return -BCH_ERR_invalid_sb_crypt; + } + + if (BCH_CRYPT_KDF_TYPE(crypt)) { + prt_printf(err, "bad kdf type %llu", BCH_CRYPT_KDF_TYPE(crypt)); + return -BCH_ERR_invalid_sb_crypt; + } + + return 0; +} + +static void bch2_sb_crypt_to_text(struct printbuf *out, struct bch_sb *sb, + struct 
bch_sb_field *f) +{ + struct bch_sb_field_crypt *crypt = field_to_type(f, crypt); + + prt_printf(out, "KDF: %llu", BCH_CRYPT_KDF_TYPE(crypt)); + prt_newline(out); + prt_printf(out, "scrypt n: %llu", BCH_KDF_SCRYPT_N(crypt)); + prt_newline(out); + prt_printf(out, "scrypt r: %llu", BCH_KDF_SCRYPT_R(crypt)); + prt_newline(out); + prt_printf(out, "scrypt p: %llu", BCH_KDF_SCRYPT_P(crypt)); + prt_newline(out); +} + +static const struct bch_sb_field_ops bch_sb_field_ops_crypt = { + .validate = bch2_sb_crypt_validate, + .to_text = bch2_sb_crypt_to_text, +}; + +/* BCH_SB_FIELD_clean: */ + +int bch2_sb_clean_validate_late(struct bch_fs *c, struct bch_sb_field_clean *clean, int write) +{ + struct jset_entry *entry; + int ret; + + for (entry = clean->start; + entry < (struct jset_entry *) vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + ret = bch2_journal_entry_validate(c, NULL, entry, + le16_to_cpu(c->disk_sb.sb->version), + BCH_SB_BIG_ENDIAN(c->disk_sb.sb), + write); + if (ret) + return ret; + } + + return 0; +} + +/* Downgrade if superblock is at a higher version than currently supported: */ +void bch2_sb_maybe_downgrade(struct bch_fs *c) +{ + lockdep_assert_held(&c->sb_lock); + + /* + * Downgrade, if superblock is at a higher version than currently + * supported: + */ + if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) + SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current); + if (c->sb.version > bcachefs_metadata_version_current) + c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current); + if (c->sb.version_min > bcachefs_metadata_version_current) + c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current); + c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1); +} + +void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version) +{ + lockdep_assert_held(&c->sb_lock); + + c->disk_sb.sb->version = cpu_to_le16(new_version); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); +} + +int bch2_fs_mark_dirty(struct bch_fs *c) +{ + int ret; + + /* + * Unconditionally write superblock, to verify it hasn't changed before + * we go rw: + */ + + mutex_lock(&c->sb_lock); + SET_BCH_SB_CLEAN(c->disk_sb.sb, false); + + bch2_sb_maybe_downgrade(c); + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALWAYS); + + ret = bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + return ret; +} + +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) +{ + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + + memset(entry, 0, u64s * sizeof(u64)); + /* + * The u64s field counts from the start of data, ignoring the shared + * fields.
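+ * (Hence the "- 1" below: the shared header fields occupy the first u64 and are not counted in entry->u64s.)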
+ */ + entry->u64s = cpu_to_le16(u64s - 1); + + *end = vstruct_next(*end); + return entry; +} + +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) +{ + struct bch_dev *ca; + unsigned i, dev; + + percpu_down_read(&c->mark_lock); + + if (!journal_seq) { + for (i = 0; i < ARRAY_SIZE(c->usage); i++) + bch2_fs_usage_acc_to_base(c, i); + } else { + bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_inodes; + u->v = cpu_to_le64(c->usage_base->nr_inodes); + } + + { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_key_version; + u->v = cpu_to_le64(atomic64_read(&c->key_version)); + } + + for (i = 0; i < BCH_REPLICAS_MAX; i++) { + struct jset_entry_usage *u = + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_usage; + u->entry.btree_id = BCH_FS_USAGE_reserved; + u->entry.level = i; + u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); + } + + for (i = 0; i < c->replicas.nr; i++) { + struct bch_replicas_entry *e = + cpu_replicas_entry(&c->replicas, i); + struct jset_entry_data_usage *u = + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_data_usage; + u->v = cpu_to_le64(c->usage_base->replicas[i]); + unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), + "embedded variable length struct"); + } + + for_each_member_device(ca, c, dev) { + unsigned b = sizeof(struct jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR; + struct jset_entry_dev_usage *u = + container_of(jset_entry_init(end, b), + struct jset_entry_dev_usage, entry); + + u->entry.type = BCH_JSET_ENTRY_dev_usage; + u->dev = cpu_to_le32(dev); + u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec); + + for (i = 0; i < BCH_DATA_NR; i++) { + u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets); + u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors); + u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented); + } + } + + percpu_up_read(&c->mark_lock); + + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = cpu_to_le64(atomic64_read(&c->io_clock[i].now)); + } +} + +void bch2_fs_mark_clean(struct bch_fs *c) +{ + struct bch_sb_field_clean *sb_clean; + struct jset_entry *entry; + unsigned u64s; + int ret; + + mutex_lock(&c->sb_lock); + if (BCH_SB_CLEAN(c->disk_sb.sb)) + goto out; + + SET_BCH_SB_CLEAN(c->disk_sb.sb, true); + + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info); + c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_metadata); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_extents_above_btree_updates)); + c->disk_sb.sb->features[0] &= cpu_to_le64(~(1ULL << BCH_FEATURE_btree_updates_journalled)); + + u64s = sizeof(*sb_clean) / sizeof(u64) + c->journal.entry_u64s_reserved; + + sb_clean = bch2_sb_resize_clean(&c->disk_sb, u64s); + if (!sb_clean) { + bch_err(c, "error resizing superblock while setting 
filesystem clean"); + goto out; + } + + sb_clean->flags = 0; + sb_clean->journal_seq = cpu_to_le64(atomic64_read(&c->journal.seq)); + + /* Trying to catch outstanding bug: */ + BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); + + entry = sb_clean->start; + bch2_journal_super_entries_add_common(c, &entry, 0); + entry = bch2_btree_roots_to_journal_entries(c, entry, entry); + BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); + + memset(entry, 0, + vstruct_end(&sb_clean->field) - (void *) entry); + + /* + * this should be in the write path, and we should be validating every + * superblock section: + */ + ret = bch2_sb_clean_validate_late(c, sb_clean, WRITE); + if (ret) { + bch_err(c, "error writing marking filesystem clean: validate error"); + goto out; + } + + bch2_write_super(c); +out: + mutex_unlock(&c->sb_lock); +} + +static int bch2_sb_clean_validate(struct bch_sb *sb, + struct bch_sb_field *f, + struct printbuf *err) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + + if (vstruct_bytes(&clean->field) < sizeof(*clean)) { + prt_printf(err, "wrong size (got %zu should be %zu)", + vstruct_bytes(&clean->field), sizeof(*clean)); + return -BCH_ERR_invalid_sb_clean; + } + + return 0; +} + +static void bch2_sb_clean_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_clean *clean = field_to_type(f, clean); + struct jset_entry *entry; + + prt_printf(out, "flags: %x", le32_to_cpu(clean->flags)); + prt_newline(out); + prt_printf(out, "journal_seq: %llu", le64_to_cpu(clean->journal_seq)); + prt_newline(out); + + for (entry = clean->start; + entry != vstruct_end(&clean->field); + entry = vstruct_next(entry)) { + if (entry->type == BCH_JSET_ENTRY_btree_keys && + !entry->u64s) + continue; + + bch2_journal_entry_to_text(out, NULL, entry); + prt_newline(out); + } +} + +static const struct bch_sb_field_ops bch_sb_field_ops_clean = { + .validate = bch2_sb_clean_validate, + .to_text = bch2_sb_clean_to_text, +}; + +static const struct bch_sb_field_ops *bch2_sb_field_ops[] = { +#define x(f, nr) \ + [BCH_SB_FIELD_##f] = &bch_sb_field_ops_##f, + BCH_SB_FIELDS() +#undef x +}; + +static const struct bch_sb_field_ops bch2_sb_field_null_ops; + +static const struct bch_sb_field_ops *bch2_sb_field_type_ops(unsigned type) +{ + return likely(type < ARRAY_SIZE(bch2_sb_field_ops)) + ? bch2_sb_field_ops[type] + : &bch2_sb_field_null_ops; +} + +static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f, + struct printbuf *err) +{ + unsigned type = le32_to_cpu(f->type); + struct printbuf field_err = PRINTBUF; + const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); + int ret; + + ret = ops->validate ? 
ops->validate(sb, f, &field_err) : 0; + if (ret) { + prt_printf(err, "Invalid superblock section %s: %s", + bch2_sb_fields[type], field_err.buf); + prt_newline(err); + bch2_sb_field_to_text(err, sb, f); + } + + printbuf_exit(&field_err); + return ret; +} + +void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb, + struct bch_sb_field *f) +{ + unsigned type = le32_to_cpu(f->type); + const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type); + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + if (type < BCH_SB_FIELD_NR) + prt_printf(out, "%s", bch2_sb_fields[type]); + else + prt_printf(out, "(unknown field %u)", type); + + prt_printf(out, " (size %zu):", vstruct_bytes(f)); + prt_newline(out); + + if (ops->to_text) { + printbuf_indent_add(out, 2); + ops->to_text(out, sb, f); + printbuf_indent_sub(out, 2); + } +} + +void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l) +{ + unsigned i; + + prt_printf(out, "Type: %u", l->layout_type); + prt_newline(out); + + prt_str(out, "Superblock max size: "); + prt_units_u64(out, 512 << l->sb_max_size_bits); + prt_newline(out); + + prt_printf(out, "Nr superblocks: %u", l->nr_superblocks); + prt_newline(out); + + prt_str(out, "Offsets: "); + for (i = 0; i < l->nr_superblocks; i++) { + if (i) + prt_str(out, ", "); + prt_printf(out, "%llu", le64_to_cpu(l->sb_offset[i])); + } + prt_newline(out); +} + +void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, + bool print_layout, unsigned fields) +{ + struct bch_sb_field_members *mi; + struct bch_sb_field *f; + u64 fields_have = 0; + unsigned nr_devices = 0; + + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 44); + + mi = bch2_sb_get_members(sb); + if (mi) { + struct bch_member *m; + + for (m = mi->members; + m < mi->members + sb->nr_devices; + m++) + nr_devices += bch2_member_exists(m); + } + + prt_printf(out, "External UUID:"); + prt_tab(out); + pr_uuid(out, sb->user_uuid.b); + prt_newline(out); + + prt_printf(out, "Internal UUID:"); + prt_tab(out); + pr_uuid(out, sb->uuid.b); + prt_newline(out); + + prt_str(out, "Device index:"); + prt_tab(out); + prt_printf(out, "%u", sb->dev_idx); + prt_newline(out); + + prt_str(out, "Label:"); + prt_tab(out); + prt_printf(out, "%.*s", (int) sizeof(sb->label), sb->label); + prt_newline(out); + + prt_str(out, "Version:"); + prt_tab(out); + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_newline(out); + + prt_str(out, "Version upgrade complete:"); + prt_tab(out); + bch2_version_to_text(out, BCH_SB_VERSION_UPGRADE_COMPLETE(sb)); + prt_newline(out); + + prt_printf(out, "Oldest version on disk:"); + prt_tab(out); + bch2_version_to_text(out, le16_to_cpu(sb->version_min)); + prt_newline(out); + + prt_printf(out, "Created:"); + prt_tab(out); + if (sb->time_base_lo) + pr_time(out, div_u64(le64_to_cpu(sb->time_base_lo), NSEC_PER_SEC)); + else + prt_printf(out, "(not set)"); + prt_newline(out); + + prt_printf(out, "Sequence number:"); + prt_tab(out); + prt_printf(out, "%llu", le64_to_cpu(sb->seq)); + prt_newline(out); + + prt_printf(out, "Superblock size:"); + prt_tab(out); + prt_printf(out, "%zu", vstruct_bytes(sb)); + prt_newline(out); + + prt_printf(out, "Clean:"); + prt_tab(out); + prt_printf(out, "%llu", BCH_SB_CLEAN(sb)); + prt_newline(out); + + prt_printf(out, "Devices:"); + prt_tab(out); + prt_printf(out, "%u", nr_devices); + prt_newline(out); + + prt_printf(out, "Sections:"); + vstruct_for_each(sb, f) + fields_have |= 1 << le32_to_cpu(f->type); + prt_tab(out); + prt_bitflags(out, bch2_sb_fields, 
fields_have); + prt_newline(out); + + prt_printf(out, "Features:"); + prt_tab(out); + prt_bitflags(out, bch2_sb_features, le64_to_cpu(sb->features[0])); + prt_newline(out); + + prt_printf(out, "Compat features:"); + prt_tab(out); + prt_bitflags(out, bch2_sb_compat, le64_to_cpu(sb->compat[0])); + prt_newline(out); + + prt_newline(out); + prt_printf(out, "Options:"); + prt_newline(out); + printbuf_indent_add(out, 2); + { + enum bch_opt_id id; + + for (id = 0; id < bch2_opts_nr; id++) { + const struct bch_option *opt = bch2_opt_table + id; + + if (opt->get_sb != BCH2_NO_SB_OPT) { + u64 v = bch2_opt_from_sb(sb, id); + + prt_printf(out, "%s:", opt->attr.name); + prt_tab(out); + bch2_opt_to_text(out, NULL, sb, opt, v, + OPT_HUMAN_READABLE|OPT_SHOW_FULL_LIST); + prt_newline(out); + } + } + } + + printbuf_indent_sub(out, 2); + + if (print_layout) { + prt_newline(out); + prt_printf(out, "layout:"); + prt_newline(out); + printbuf_indent_add(out, 2); + bch2_sb_layout_to_text(out, &sb->layout); + printbuf_indent_sub(out, 2); + } + + vstruct_for_each(sb, f) + if (fields & (1 << le32_to_cpu(f->type))) { + prt_newline(out); + bch2_sb_field_to_text(out, sb, f); + } +} diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h new file mode 100644 index 000000000..904adea6a --- /dev/null +++ b/fs/bcachefs/super-io.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_IO_H +#define _BCACHEFS_SUPER_IO_H + +#include "extents.h" +#include "eytzinger.h" +#include "super_types.h" +#include "super.h" + +#include + +static inline bool bch2_version_compatible(u16 version) +{ + return BCH_VERSION_MAJOR(version) <= BCH_VERSION_MAJOR(bcachefs_metadata_version_current) && + version >= bcachefs_metadata_version_min; +} + +void bch2_version_to_text(struct printbuf *, unsigned); +unsigned bch2_latest_compatible_version(unsigned); + +u64 bch2_upgrade_recovery_passes(struct bch_fs *c, + unsigned, + unsigned); + +struct bch_sb_field *bch2_sb_field_get(struct bch_sb *, enum bch_sb_field_type); +struct bch_sb_field *bch2_sb_field_resize(struct bch_sb_handle *, + enum bch_sb_field_type, unsigned); +void bch2_sb_field_delete(struct bch_sb_handle *, enum bch_sb_field_type); + +#define field_to_type(_f, _name) \ + container_of_or_null(_f, struct bch_sb_field_##_name, field) + +#define x(_name, _nr) \ +static inline struct bch_sb_field_##_name * \ +bch2_sb_get_##_name(struct bch_sb *sb) \ +{ \ + return field_to_type(bch2_sb_field_get(sb, \ + BCH_SB_FIELD_##_name), _name); \ +} \ + \ +static inline struct bch_sb_field_##_name * \ +bch2_sb_resize_##_name(struct bch_sb_handle *sb, unsigned u64s) \ +{ \ + return field_to_type(bch2_sb_field_resize(sb, \ + BCH_SB_FIELD_##_name, u64s), _name); \ +} + +BCH_SB_FIELDS() +#undef x + +extern const char * const bch2_sb_fields[]; + +struct bch_sb_field_ops { + int (*validate)(struct bch_sb *, struct bch_sb_field *, struct printbuf *); + void (*to_text)(struct printbuf *, struct bch_sb *, struct bch_sb_field *); +}; + +static inline __le64 bch2_sb_magic(struct bch_fs *c) +{ + __le64 ret; + memcpy(&ret, &c->sb.uuid, sizeof(ret)); + return ret; +} + +static inline __u64 jset_magic(struct bch_fs *c) +{ + return __le64_to_cpu(bch2_sb_magic(c) ^ JSET_MAGIC); +} + +static inline __u64 bset_magic(struct bch_fs *c) +{ + return __le64_to_cpu(bch2_sb_magic(c) ^ BSET_MAGIC); +} + +int bch2_sb_to_fs(struct bch_fs *, struct bch_sb *); +int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); + +void bch2_free_super(struct bch_sb_handle *); +int bch2_sb_realloc(struct 
bch_sb_handle *, unsigned); + +int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); +int bch2_write_super(struct bch_fs *); +void __bch2_check_set_feature(struct bch_fs *, unsigned); + +static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) +{ + if (!(c->sb.features & (1ULL << feat))) + __bch2_check_set_feature(c, feat); +} + +/* BCH_SB_FIELD_members: */ + +static inline bool bch2_member_exists(struct bch_member *m) +{ + return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); +} + +static inline bool bch2_dev_exists(struct bch_sb *sb, + struct bch_sb_field_members *mi, + unsigned dev) +{ + return dev < sb->nr_devices && + bch2_member_exists(&mi->members[dev]); +} + +static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) +{ + return (struct bch_member_cpu) { + .nbuckets = le64_to_cpu(mi->nbuckets), + .first_bucket = le16_to_cpu(mi->first_bucket), + .bucket_size = le16_to_cpu(mi->bucket_size), + .group = BCH_MEMBER_GROUP(mi), + .state = BCH_MEMBER_STATE(mi), + .discard = BCH_MEMBER_DISCARD(mi), + .data_allowed = BCH_MEMBER_DATA_ALLOWED(mi), + .durability = BCH_MEMBER_DURABILITY(mi) + ? BCH_MEMBER_DURABILITY(mi) - 1 + : 1, + .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .valid = bch2_member_exists(mi), + }; +} + +/* BCH_SB_FIELD_clean: */ + +void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); + +int bch2_sb_clean_validate_late(struct bch_fs *, struct bch_sb_field_clean *, int); + +void bch2_sb_maybe_downgrade(struct bch_fs *); +void bch2_sb_upgrade(struct bch_fs *, unsigned); + +int bch2_fs_mark_dirty(struct bch_fs *); +void bch2_fs_mark_clean(struct bch_fs *); + +void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, + struct bch_sb_field *); +void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *); +void bch2_sb_to_text(struct printbuf *, struct bch_sb *, bool, unsigned); + +#endif /* _BCACHEFS_SUPER_IO_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c new file mode 100644 index 000000000..9f1047a76 --- /dev/null +++ b/fs/bcachefs/super.c @@ -0,0 +1,2006 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcachefs setup/teardown code, and some metadata io - read a superblock and + * figure out what to do with it. + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. 
+ */ + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "bkey_sort.h" +#include "btree_cache.h" +#include "btree_gc.h" +#include "btree_key_cache.h" +#include "btree_update_interior.h" +#include "btree_io.h" +#include "btree_write_buffer.h" +#include "buckets_waiting_for_journal.h" +#include "chardev.h" +#include "checksum.h" +#include "clock.h" +#include "compress.h" +#include "counters.h" +#include "debug.h" +#include "disk_groups.h" +#include "ec.h" +#include "errcode.h" +#include "error.h" +#include "fs.h" +#include "fs-io.h" +#include "fsck.h" +#include "inode.h" +#include "io.h" +#include "journal.h" +#include "journal_reclaim.h" +#include "journal_seq_blacklist.h" +#include "move.h" +#include "migrate.h" +#include "movinggc.h" +#include "nocow_locking.h" +#include "quota.h" +#include "rebalance.h" +#include "recovery.h" +#include "replicas.h" +#include "subvolume.h" +#include "super.h" +#include "super-io.h" +#include "sysfs.h" +#include "trace.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kent Overstreet "); + +#define KTYPE(type) \ +static const struct attribute_group type ## _group = { \ + .attrs = type ## _files \ +}; \ + \ +static const struct attribute_group *type ## _groups[] = { \ + &type ## _group, \ + NULL \ +}; \ + \ +static const struct kobj_type type ## _ktype = { \ + .release = type ## _release, \ + .sysfs_ops = &type ## _sysfs_ops, \ + .default_groups = type ## _groups \ +} + +static void bch2_fs_release(struct kobject *); +static void bch2_dev_release(struct kobject *); +static void bch2_fs_counters_release(struct kobject *k) +{ +} + +static void bch2_fs_internal_release(struct kobject *k) +{ +} + +static void bch2_fs_opts_dir_release(struct kobject *k) +{ +} + +static void bch2_fs_time_stats_release(struct kobject *k) +{ +} + +KTYPE(bch2_fs); +KTYPE(bch2_fs_counters); +KTYPE(bch2_fs_internal); +KTYPE(bch2_fs_opts_dir); +KTYPE(bch2_fs_time_stats); +KTYPE(bch2_dev); + +static struct kset *bcachefs_kset; +static LIST_HEAD(bch_fs_list); +static DEFINE_MUTEX(bch_fs_list_lock); + +DECLARE_WAIT_QUEUE_HEAD(bch2_read_only_wait); + +static void bch2_dev_free(struct bch_dev *); +static int bch2_dev_alloc(struct bch_fs *, unsigned); +static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); +static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *); + +struct bch_fs *bch2_dev_to_fs(dev_t dev) +{ + struct bch_fs *c; + struct bch_dev *ca; + unsigned i; + + mutex_lock(&bch_fs_list_lock); + rcu_read_lock(); + + list_for_each_entry(c, &bch_fs_list, list) + for_each_member_device_rcu(ca, c, i, NULL) + if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { + closure_get(&c->cl); + goto found; + } + c = NULL; +found: + rcu_read_unlock(); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) +{ + struct bch_fs *c; + + lockdep_assert_held(&bch_fs_list_lock); + + list_for_each_entry(c, &bch_fs_list, list) + if (!memcmp(&c->disk_sb.sb->uuid, &uuid, sizeof(uuid))) + return c; + + return NULL; +} + +struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid) +{ + struct bch_fs *c; + + mutex_lock(&bch_fs_list_lock); + c = __bch2_uuid_to_fs(uuid); + if (c) + closure_get(&c->cl); + mutex_unlock(&bch_fs_list_lock); + + return c; +} + +static void bch2_dev_usage_journal_reserve(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i, nr = 0, u64s = + ((sizeof(struct 
jset_entry_dev_usage) + + sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) / + sizeof(u64); + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + nr++; + rcu_read_unlock(); + + bch2_journal_entry_res_resize(&c->journal, + &c->dev_usage_journal_res, u64s * nr); +} + +/* Filesystem RO/RW: */ + +/* + * For startup/shutdown of RW stuff, the dependencies are: + * + * - foreground writes depend on copygc and rebalance (to free up space) + * + * - copygc and rebalance depend on mark and sweep gc (they actually probably + * don't because they either reserve ahead of time or don't block if + * allocations fail, but allocations can require mark and sweep gc to run + * because of generation number wraparound) + * + * - all of the above depends on the allocator threads + * + * - allocator depends on the journal (when it rewrites prios and gens) + */ + +static void __bch2_fs_read_only(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i, clean_passes = 0; + u64 seq = 0; + + bch2_fs_ec_stop(c); + bch2_open_buckets_stop(c, NULL, true); + bch2_rebalance_stop(c); + bch2_copygc_stop(c); + bch2_gc_thread_stop(c); + bch2_fs_ec_flush(c); + + bch_verbose(c, "flushing journal and stopping allocators, journal seq %llu", + journal_cur_seq(&c->journal)); + + do { + clean_passes++; + + if (bch2_btree_interior_updates_flush(c) || + bch2_journal_flush_all_pins(&c->journal) || + bch2_btree_flush_all_writes(c) || + seq != atomic64_read(&c->journal.seq)) { + seq = atomic64_read(&c->journal.seq); + clean_passes = 0; + } + } while (clean_passes < 2); + + bch_verbose(c, "flushing journal and stopping allocators complete, journal seq %llu", + journal_cur_seq(&c->journal)); + + if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags)) + set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + bch2_fs_journal_stop(&c->journal); + + /* + * After stopping journal: + */ + for_each_member_device(ca, c, i) + bch2_dev_allocator_remove(c, ca); +} + +#ifndef BCH_WRITE_REF_DEBUG +static void bch2_writes_disabled(struct percpu_ref *writes) +{ + struct bch_fs *c = container_of(writes, struct bch_fs, writes); + + set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); + wake_up(&bch2_read_only_wait); +} +#endif + +void bch2_fs_read_only(struct bch_fs *c) +{ + if (!test_bit(BCH_FS_RW, &c->flags)) { + bch2_journal_reclaim_stop(&c->journal); + return; + } + + BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + /* + * Block new foreground-end write operations from starting - any new + * writes will return -EROFS: + */ + set_bit(BCH_FS_GOING_RO, &c->flags); +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_kill(&c->writes); +#else + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) + bch2_write_ref_put(c, i); +#endif + + /* + * If we're not doing an emergency shutdown, we want to wait on + * outstanding writes to complete so they don't see spurious errors due + * to shutting down the allocator: + * + * If we are doing an emergency shutdown outstanding writes may + * hang until we shutdown the allocator so we don't want to wait + * on outstanding writes before shutting everything down - but + * we do need to wait on them before returning and signalling + * that going RO is complete: + */ + wait_event(bch2_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) || + test_bit(BCH_FS_EMERGENCY_RO, &c->flags)); + + __bch2_fs_read_only(c); + + wait_event(bch2_read_only_wait, + test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags)); + + clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags); 
+ clear_bit(BCH_FS_GOING_RO, &c->flags); + + if (!bch2_journal_error(&c->journal) && + !test_bit(BCH_FS_ERROR, &c->flags) && + !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) && + test_bit(BCH_FS_STARTED, &c->flags) && + test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) && + !c->opts.norecovery) { + BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); + BUG_ON(atomic_read(&c->btree_cache.dirty)); + BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); + BUG_ON(c->btree_write_buffer.state.nr); + + bch_verbose(c, "marking filesystem clean"); + bch2_fs_mark_clean(c); + } + + clear_bit(BCH_FS_RW, &c->flags); +} + +static void bch2_fs_read_only_work(struct work_struct *work) +{ + struct bch_fs *c = + container_of(work, struct bch_fs, read_only_work); + + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); +} + +static void bch2_fs_read_only_async(struct bch_fs *c) +{ + queue_work(system_long_wq, &c->read_only_work); +} + +bool bch2_fs_emergency_read_only(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags); + + bch2_journal_halt(&c->journal); + bch2_fs_read_only_async(c); + + wake_up(&bch2_read_only_wait); + return ret; +} + +static int bch2_fs_read_write_late(struct bch_fs *c) +{ + int ret; + + /* + * Data move operations can't run until after check_snapshots has + * completed, and bch2_snapshot_is_ancestor() is available. + * + * Ideally we'd start copygc/rebalance earlier instead of waiting for + * all of recovery/fsck to complete: + */ + ret = bch2_copygc_start(c); + if (ret) { + bch_err(c, "error starting copygc thread"); + return ret; + } + + ret = bch2_rebalance_start(c); + if (ret) { + bch_err(c, "error starting rebalance thread"); + return ret; + } + + return 0; +} + +static int __bch2_fs_read_write(struct bch_fs *c, bool early) +{ + struct bch_dev *ca; + unsigned i; + int ret; + + if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) { + bch_err(c, "cannot go rw, unfixed btree errors"); + return -BCH_ERR_erofs_unfixed_errors; + } + + if (test_bit(BCH_FS_RW, &c->flags)) + return 0; + + if (c->opts.norecovery) + return -BCH_ERR_erofs_norecovery; + + /* + * nochanges is used for fsck -n mode - we have to allow going rw + * during recovery for that to work: + */ + if (c->opts.nochanges && (!early || c->opts.read_only)) + return -BCH_ERR_erofs_nochanges; + + bch_info(c, "going read-write"); + + ret = bch2_fs_mark_dirty(c); + if (ret) + goto err; + + clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags); + + /* + * First journal write must be a flush write: after a clean shutdown we + * don't read the journal, so the first journal write may end up + * overwriting whatever was there previously, and there must always be + * at least one non-flush write in the journal or recovery will fail: + */ + set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags); + + for_each_rw_member(ca, c, i) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + + ret = bch2_gc_thread_start(c); + if (ret) { + bch_err(c, "error starting gc thread"); + return ret; + } + + if (!early) { + ret = bch2_fs_read_write_late(c); + if (ret) + goto err; + } + +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_reinit(&c->writes); +#else + for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { + BUG_ON(atomic_long_read(&c->writes[i])); + atomic_long_inc(&c->writes[i]); + } +#endif + set_bit(BCH_FS_RW, &c->flags); + set_bit(BCH_FS_WAS_RW, &c->flags); + + bch2_do_discards(c); + bch2_do_invalidates(c); + bch2_do_stripe_deletes(c); + bch2_do_pending_node_rewrites(c); + return 0; +err: + 
__bch2_fs_read_only(c); + return ret; +} + +int bch2_fs_read_write(struct bch_fs *c) +{ + return __bch2_fs_read_write(c, false); +} + +int bch2_fs_read_write_early(struct bch_fs *c) +{ + lockdep_assert_held(&c->state_lock); + + return __bch2_fs_read_write(c, true); +} + +/* Filesystem startup/shutdown: */ + +static void __bch2_fs_free(struct bch_fs *c) +{ + unsigned i; + int cpu; + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_exit(&c->times[i]); + + bch2_free_pending_node_rewrites(c); + bch2_fs_counters_exit(c); + bch2_fs_snapshots_exit(c); + bch2_fs_quota_exit(c); + bch2_fs_fsio_exit(c); + bch2_fs_ec_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_io_exit(c); + bch2_fs_buckets_waiting_for_journal_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_iter_exit(c); + bch2_fs_btree_key_cache_exit(&c->btree_key_cache); + bch2_fs_btree_cache_exit(c); + bch2_fs_replicas_exit(c); + bch2_fs_journal_exit(&c->journal); + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); + bch2_fs_compress_exit(c); + bch2_journal_keys_free(&c->journal_keys); + bch2_journal_entries_free(c); + bch2_fs_btree_write_buffer_exit(c); + percpu_free_rwsem(&c->mark_lock); + free_percpu(c->online_reserved); + + if (c->btree_paths_bufs) + for_each_possible_cpu(cpu) + kfree(per_cpu_ptr(c->btree_paths_bufs, cpu)->path); + + darray_exit(&c->btree_roots_extra); + free_percpu(c->btree_paths_bufs); + free_percpu(c->pcpu); + mempool_exit(&c->large_bkey_pool); + mempool_exit(&c->btree_bounce_pool); + bioset_exit(&c->btree_bio); + mempool_exit(&c->fill_iter); +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_exit(&c->writes); +#endif + kfree(rcu_dereference_protected(c->disk_groups, 1)); + kfree(c->journal_seq_blacklist_table); + kfree(c->unused_inode_hints); + + if (c->write_ref_wq) + destroy_workqueue(c->write_ref_wq); + if (c->io_complete_wq) + destroy_workqueue(c->io_complete_wq); + if (c->copygc_wq) + destroy_workqueue(c->copygc_wq); + if (c->btree_io_complete_wq) + destroy_workqueue(c->btree_io_complete_wq); + if (c->btree_update_wq) + destroy_workqueue(c->btree_update_wq); + + bch2_free_super(&c->disk_sb); + kvpfree(c, sizeof(*c)); + module_put(THIS_MODULE); +} + +static void bch2_fs_release(struct kobject *kobj) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + __bch2_fs_free(c); +} + +void __bch2_fs_stop(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + + bch_verbose(c, "shutting down"); + + set_bit(BCH_FS_STOPPING, &c->flags); + + cancel_work_sync(&c->journal_seq_blacklist_gc_work); + + down_write(&c->state_lock); + bch2_fs_read_only(c); + up_write(&c->state_lock); + + for_each_member_device(ca, c, i) + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + + if (c->kobj.state_in_sysfs) + kobject_del(&c->kobj); + + bch2_fs_debug_exit(c); + bch2_fs_chardev_exit(c); + + kobject_put(&c->counters_kobj); + kobject_put(&c->time_stats); + kobject_put(&c->opts_dir); + kobject_put(&c->internal); + + /* btree prefetch might have kicked off reads in the background: */ + bch2_btree_flush_all_reads(c); + + for_each_member_device(ca, c, i) + cancel_work_sync(&ca->io_error_work); + + cancel_work_sync(&c->read_only_work); + + for (i = 0; i < c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + + if (ca) + bch2_free_super(&ca->disk_sb); + } +} + +void bch2_fs_free(struct bch_fs *c) +{ + unsigned i; + + mutex_lock(&bch_fs_list_lock); + list_del(&c->list); + 
mutex_unlock(&bch_fs_list_lock); + + closure_sync(&c->cl); + closure_debug_destroy(&c->cl); + + for (i = 0; i < c->sb.nr_devices; i++) + if (c->devs[i]) + bch2_dev_free(rcu_dereference_protected(c->devs[i], 1)); + + bch_verbose(c, "shutdown complete"); + + kobject_put(&c->kobj); +} + +void bch2_fs_stop(struct bch_fs *c) +{ + __bch2_fs_stop(c); + bch2_fs_free(c); +} + +static int bch2_fs_online(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i; + int ret = 0; + + lockdep_assert_held(&bch_fs_list_lock); + + if (__bch2_uuid_to_fs(c->sb.uuid)) { + bch_err(c, "filesystem UUID already open"); + return -EINVAL; + } + + ret = bch2_fs_chardev_init(c); + if (ret) { + bch_err(c, "error creating character device"); + return ret; + } + + bch2_fs_debug_init(c); + + ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + kobject_add(&c->internal, &c->kobj, "internal") ?: + kobject_add(&c->opts_dir, &c->kobj, "options") ?: + kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: + kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: + bch2_opts_create_sysfs_files(&c->opts_dir); + if (ret) { + bch_err(c, "error creating sysfs objects"); + return ret; + } + + down_write(&c->state_lock); + + for_each_member_device(ca, c, i) { + ret = bch2_dev_sysfs_online(c, ca); + if (ret) { + bch_err(c, "error creating sysfs objects"); + percpu_ref_put(&ca->ref); + goto err; + } + } + + BUG_ON(!list_empty(&c->list)); + list_add(&c->list, &bch_fs_list); +err: + up_write(&c->state_lock); + return ret; +} + +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +{ + struct bch_sb_field_members *mi; + struct bch_fs *c; + struct printbuf name = PRINTBUF; + unsigned i, iter_size; + int ret = 0; + + c = kvpmalloc(sizeof(struct bch_fs), GFP_KERNEL|__GFP_ZERO); + if (!c) { + c = ERR_PTR(-BCH_ERR_ENOMEM_fs_alloc); + goto out; + } + + __module_get(THIS_MODULE); + + closure_init(&c->cl, NULL); + + c->kobj.kset = bcachefs_kset; + kobject_init(&c->kobj, &bch2_fs_ktype); + kobject_init(&c->internal, &bch2_fs_internal_ktype); + kobject_init(&c->opts_dir, &bch2_fs_opts_dir_ktype); + kobject_init(&c->time_stats, &bch2_fs_time_stats_ktype); + kobject_init(&c->counters_kobj, &bch2_fs_counters_ktype); + + c->minor = -1; + c->disk_sb.fs_sb = true; + + init_rwsem(&c->state_lock); + mutex_init(&c->sb_lock); + mutex_init(&c->replicas_gc_lock); + mutex_init(&c->btree_root_lock); + INIT_WORK(&c->read_only_work, bch2_fs_read_only_work); + + init_rwsem(&c->gc_lock); + mutex_init(&c->gc_gens_lock); + + for (i = 0; i < BCH_TIME_STAT_NR; i++) + bch2_time_stats_init(&c->times[i]); + + bch2_fs_copygc_init(c); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_interior_update_init_early(c); + bch2_fs_allocator_background_init(c); + bch2_fs_allocator_foreground_init(c); + bch2_fs_rebalance_init(c); + bch2_fs_quota_init(c); + bch2_fs_ec_init_early(c); + bch2_fs_move_init(c); + + INIT_LIST_HEAD(&c->list); + + mutex_init(&c->usage_scratch_lock); + + mutex_init(&c->bio_bounce_pages_lock); + mutex_init(&c->snapshot_table_lock); + + spin_lock_init(&c->btree_write_error_lock); + + INIT_WORK(&c->journal_seq_blacklist_gc_work, + bch2_blacklist_entries_gc); + + INIT_LIST_HEAD(&c->journal_iters); + + INIT_LIST_HEAD(&c->fsck_errors); + mutex_init(&c->fsck_error_lock); + + seqcount_init(&c->gc_pos_lock); + + seqcount_init(&c->usage_lock); + + sema_init(&c->io_in_flight, 128); + + INIT_LIST_HEAD(&c->vfs_inodes_list); + mutex_init(&c->vfs_inodes_lock); + + c->copy_gc_enabled = 1; + c->rebalance.enabled = 1; + 
c->promote_whole_extents = true; + + c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write]; + c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; + c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal]; + c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; + + bch2_fs_btree_cache_init_early(&c->btree_cache); + + mutex_init(&c->sectors_available_lock); + + ret = percpu_init_rwsem(&c->mark_lock); + if (ret) + goto err; + + mutex_lock(&c->sb_lock); + ret = bch2_sb_to_fs(c, sb); + mutex_unlock(&c->sb_lock); + + if (ret) + goto err; + + pr_uuid(&name, c->sb.user_uuid.b); + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + + ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; + if (ret) + goto err; + + /* Compat: */ + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) + SET_BCH_SB_JOURNAL_FLUSH_DELAY(sb, 1000); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && + !BCH_SB_JOURNAL_RECLAIM_DELAY(sb)) + SET_BCH_SB_JOURNAL_RECLAIM_DELAY(sb, 100); + + c->opts = bch2_opts_default; + ret = bch2_opts_from_sb(&c->opts, sb); + if (ret) + goto err; + + bch2_opts_apply(&c->opts, opts); + + c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; + if (c->opts.inodes_use_key_cache) + c->btree_key_cache_btrees |= 1U << BTREE_ID_inodes; + + c->block_bits = ilog2(block_sectors(c)); + c->btree_foreground_merge_threshold = BTREE_FOREGROUND_MERGE_THRESHOLD(c); + + if (bch2_fs_init_fault("fs_alloc")) { + bch_err(c, "fs_alloc fault injected"); + ret = -EFAULT; + goto err; + } + + iter_size = sizeof(struct sort_iter) + + (btree_blocks(c) + 1) * 2 * + sizeof(struct sort_iter_set); + + c->inode_shard_bits = ilog2(roundup_pow_of_two(num_possible_cpus())); + + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512)) || + !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->io_complete_wq = alloc_workqueue("bcachefs_io", + WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0)) || +#ifndef BCH_WRITE_REF_DEBUG + percpu_ref_init(&c->writes, bch2_writes_disabled, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || +#endif + mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || + bioset_init(&c->btree_bio, 1, + max(offsetof(struct btree_read_bio, bio), + offsetof(struct btree_write_bio, wbio.bio)), + BIOSET_NEED_BVECS) || + !(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) || + !(c->online_reserved = alloc_percpu(u64)) || + !(c->btree_paths_bufs = alloc_percpu(struct btree_path_buf)) || + mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1, + btree_bytes(c)) || + mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) || + !(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits, + sizeof(u64), GFP_KERNEL))) { + ret = -BCH_ERR_ENOMEM_fs_other_alloc; + goto err; + } + + ret = bch2_fs_counters_init(c) ?: + bch2_io_clock_init(&c->io_clock[READ]) ?: + bch2_io_clock_init(&c->io_clock[WRITE]) ?: + bch2_fs_journal_init(&c->journal) ?: + bch2_fs_replicas_init(c) ?: + bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: + bch2_fs_btree_iter_init(c) ?: + bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_buckets_waiting_for_journal_init(c) ?: + 
bch2_fs_btree_write_buffer_init(c) ?: + bch2_fs_subvolumes_init(c) ?: + bch2_fs_io_init(c) ?: + bch2_fs_nocow_locking_init(c) ?: + bch2_fs_encryption_init(c) ?: + bch2_fs_compress_init(c) ?: + bch2_fs_ec_init(c) ?: + bch2_fs_fsio_init(c); + if (ret) + goto err; + + mi = bch2_sb_get_members(c->disk_sb.sb); + for (i = 0; i < c->sb.nr_devices; i++) + if (bch2_dev_exists(c->disk_sb.sb, mi, i) && + bch2_dev_alloc(c, i)) { + ret = -EEXIST; + goto err; + } + + bch2_journal_entry_res_resize(&c->journal, + &c->btree_root_journal_res, + BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX)); + bch2_dev_usage_journal_reserve(c); + bch2_journal_entry_res_resize(&c->journal, + &c->clock_journal_res, + (sizeof(struct jset_entry_clock) / sizeof(u64)) * 2); + + mutex_lock(&bch_fs_list_lock); + ret = bch2_fs_online(c); + mutex_unlock(&bch_fs_list_lock); + + if (ret) + goto err; +out: + return c; +err: + bch2_fs_free(c); + c = ERR_PTR(ret); + goto out; +} + +noinline_for_stack +static void print_mount_opts(struct bch_fs *c) +{ + enum bch_opt_id i; + struct printbuf p = PRINTBUF; + bool first = true; + + prt_str(&p, "mounted version "); + bch2_version_to_text(&p, c->sb.version); + + if (c->opts.read_only) { + prt_str(&p, " opts="); + first = false; + prt_printf(&p, "ro"); + } + + for (i = 0; i < bch2_opts_nr; i++) { + const struct bch_option *opt = &bch2_opt_table[i]; + u64 v = bch2_opt_get_by_id(&c->opts, i); + + if (!(opt->flags & OPT_MOUNT)) + continue; + + if (v == bch2_opt_get_by_id(&bch2_opts_default, i)) + continue; + + prt_str(&p, first ? " opts=" : ","); + first = false; + bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); + } + + bch_info(c, "%s", p.buf); + printbuf_exit(&p); +} + +int bch2_fs_start(struct bch_fs *c) +{ + struct bch_sb_field_members *mi; + struct bch_dev *ca; + time64_t now = ktime_get_real_seconds(); + unsigned i; + int ret; + + down_write(&c->state_lock); + + BUG_ON(test_bit(BCH_FS_STARTED, &c->flags)); + + mutex_lock(&c->sb_lock); + + for_each_online_member(ca, c, i) + bch2_sb_from_fs(c, ca); + + mi = bch2_sb_get_members(c->disk_sb.sb); + for_each_online_member(ca, c, i) + mi->members[ca->dev_idx].last_mount = cpu_to_le64(now); + + mutex_unlock(&c->sb_lock); + + for_each_rw_member(ca, c, i) + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); + + for (i = 0; i < BCH_TRANSACTIONS_NR; i++) { + mutex_lock(&c->btree_transaction_stats[i].lock); + bch2_time_stats_init(&c->btree_transaction_stats[i].lock_hold_times); + mutex_unlock(&c->btree_transaction_stats[i].lock); + } + + ret = BCH_SB_INITIALIZED(c->disk_sb.sb) + ? bch2_fs_recovery(c) + : bch2_fs_initialize(c); + if (ret) + goto err; + + ret = bch2_opts_check_may_set(c); + if (ret) + goto err; + + if (bch2_fs_init_fault("fs_start")) { + bch_err(c, "fs_start fault injected"); + ret = -EINVAL; + goto err; + } + + set_bit(BCH_FS_STARTED, &c->flags); + + if (c->opts.read_only || c->opts.nochanges) { + bch2_fs_read_only(c); + } else { + ret = !test_bit(BCH_FS_RW, &c->flags) + ? 
bch2_fs_read_write(c) + : bch2_fs_read_write_late(c); + if (ret) + goto err; + } + + print_mount_opts(c); + ret = 0; +out: + up_write(&c->state_lock); + return ret; +err: + bch_err(c, "error starting filesystem: %s", bch2_err_str(ret)); + goto out; +} + +static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) +{ + struct bch_sb_field_members *sb_mi; + + sb_mi = bch2_sb_get_members(sb); + if (!sb_mi) + return -BCH_ERR_member_info_missing; + + if (le16_to_cpu(sb->block_size) != block_sectors(c)) + return -BCH_ERR_mismatched_block_size; + + if (le16_to_cpu(sb_mi->members[sb->dev_idx].bucket_size) < + BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) + return -BCH_ERR_bucket_size_too_small; + + return 0; +} + +static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb) +{ + struct bch_sb *newest = + le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb; + struct bch_sb_field_members *mi = bch2_sb_get_members(newest); + + if (!uuid_equal(&fs->uuid, &sb->uuid)) + return -BCH_ERR_device_not_a_member_of_filesystem; + + if (!bch2_dev_exists(newest, mi, sb->dev_idx)) + return -BCH_ERR_device_has_been_removed; + + if (fs->block_size != sb->block_size) + return -BCH_ERR_mismatched_block_size; + + return 0; +} + +/* Device startup/shutdown: */ + +static void bch2_dev_release(struct kobject *kobj) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + + kfree(ca); +} + +static void bch2_dev_free(struct bch_dev *ca) +{ + cancel_work_sync(&ca->io_error_work); + + if (ca->kobj.state_in_sysfs && + ca->disk_sb.bdev) + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + + if (ca->kobj.state_in_sysfs) + kobject_del(&ca->kobj); + + bch2_free_super(&ca->disk_sb); + bch2_dev_journal_exit(ca); + + free_percpu(ca->io_done); + bioset_exit(&ca->replica_set); + bch2_dev_buckets_free(ca); + free_page((unsigned long) ca->sb_read_scratch); + + bch2_time_stats_exit(&ca->io_latency[WRITE]); + bch2_time_stats_exit(&ca->io_latency[READ]); + + percpu_ref_exit(&ca->io_ref); + percpu_ref_exit(&ca->ref); + kobject_put(&ca->kobj); +} + +static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) +{ + + lockdep_assert_held(&c->state_lock); + + if (percpu_ref_is_zero(&ca->io_ref)) + return; + + __bch2_dev_read_only(c, ca); + + reinit_completion(&ca->io_ref_completion); + percpu_ref_kill(&ca->io_ref); + wait_for_completion(&ca->io_ref_completion); + + if (ca->kobj.state_in_sysfs) { + sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs"); + sysfs_remove_link(&ca->kobj, "block"); + } + + bch2_free_super(&ca->disk_sb); + bch2_dev_journal_exit(ca); +} + +static void bch2_dev_ref_complete(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, ref); + + complete(&ca->ref_completion); +} + +static void bch2_dev_io_ref_complete(struct percpu_ref *ref) +{ + struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); + + complete(&ca->io_ref_completion); +} + +static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) +{ + int ret; + + if (!c->kobj.state_in_sysfs) + return 0; + + if (!ca->kobj.state_in_sysfs) { + ret = kobject_add(&ca->kobj, &c->kobj, + "dev-%u", ca->dev_idx); + if (ret) + return ret; + } + + if (ca->disk_sb.bdev) { + struct kobject *block = bdev_kobj(ca->disk_sb.bdev); + + ret = sysfs_create_link(block, &ca->kobj, "bcachefs"); + if (ret) + return ret; + + ret = sysfs_create_link(&ca->kobj, block, "block"); + if (ret) + return ret; + } + + return 0; +} + +static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, + struct bch_member *member) 
+{ + struct bch_dev *ca; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + return NULL; + + kobject_init(&ca->kobj, &bch2_dev_ktype); + init_completion(&ca->ref_completion); + init_completion(&ca->io_ref_completion); + + init_rwsem(&ca->bucket_lock); + + INIT_WORK(&ca->io_error_work, bch2_io_error_work); + + bch2_time_stats_init(&ca->io_latency[READ]); + bch2_time_stats_init(&ca->io_latency[WRITE]); + + ca->mi = bch2_mi_to_cpu(member); + ca->uuid = member->uuid; + + ca->nr_btree_reserve = DIV_ROUND_UP(BTREE_NODE_RESERVE, + ca->mi.bucket_size / btree_sectors(c)); + + if (percpu_ref_init(&ca->ref, bch2_dev_ref_complete, + 0, GFP_KERNEL) || + percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, + PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + !(ca->sb_read_scratch = (void *) __get_free_page(GFP_KERNEL)) || + bch2_dev_buckets_alloc(c, ca) || + bioset_init(&ca->replica_set, 4, + offsetof(struct bch_write_bio, bio), 0) || + !(ca->io_done = alloc_percpu(*ca->io_done))) + goto err; + + return ca; +err: + bch2_dev_free(ca); + return NULL; +} + +static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, + unsigned dev_idx) +{ + ca->dev_idx = dev_idx; + __set_bit(ca->dev_idx, ca->self.d); + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + + ca->fs = c; + rcu_assign_pointer(c->devs[ca->dev_idx], ca); + + if (bch2_dev_sysfs_online(c, ca)) + pr_warn("error creating sysfs objects"); +} + +static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) +{ + struct bch_member *member = + bch2_sb_get_members(c->disk_sb.sb)->members + dev_idx; + struct bch_dev *ca = NULL; + int ret = 0; + + if (bch2_fs_init_fault("dev_alloc")) + goto err; + + ca = __bch2_dev_alloc(c, member); + if (!ca) + goto err; + + ca->fs = c; + + bch2_dev_attach(c, ca, dev_idx); + return ret; +err: + if (ca) + bch2_dev_free(ca); + return -BCH_ERR_ENOMEM_dev_alloc; +} + +static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) +{ + unsigned ret; + + if (bch2_dev_is_online(ca)) { + bch_err(ca, "already have device online in slot %u", + sb->sb->dev_idx); + return -BCH_ERR_device_already_online; + } + + if (get_capacity(sb->bdev->bd_disk) < + ca->mi.bucket_size * ca->mi.nbuckets) { + bch_err(ca, "cannot online: device too small"); + return -BCH_ERR_device_size_too_small; + } + + BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); + + ret = bch2_dev_journal_init(ca, sb->sb); + if (ret) + return ret; + + /* Commit: */ + ca->disk_sb = *sb; + if (sb->mode & FMODE_EXCL) + ca->disk_sb.bdev->bd_holder = ca; + memset(sb, 0, sizeof(*sb)); + + ca->dev = ca->disk_sb.bdev->bd_dev; + + percpu_ref_reinit(&ca->io_ref); + + return 0; +} + +static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) +{ + struct bch_dev *ca; + int ret; + + lockdep_assert_held(&c->state_lock); + + if (le64_to_cpu(sb->sb->seq) > + le64_to_cpu(c->disk_sb.sb->seq)) + bch2_sb_to_fs(c, sb->sb); + + BUG_ON(sb->sb->dev_idx >= c->sb.nr_devices || + !c->devs[sb->sb->dev_idx]); + + ca = bch_dev_locked(c, sb->sb->dev_idx); + + ret = __bch2_dev_attach_bdev(ca, sb); + if (ret) + return ret; + + bch2_dev_sysfs_online(c, ca); + + if (c->sb.nr_devices == 1) + snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev); + snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev); + + rebalance_wakeup(c); + return 0; +} + +/* Device management: */ + +/* + * Note: this function is also used by the error paths - when a particular + * device sees an error, we call it to determine whether we can just set the + * device RO, or - if this function 
returns false - we'll set the whole + * filesystem RO: + * + * XXX: maybe we should be more explicit about whether we're changing state + * because we got an error or what have you? + */ +bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + struct bch_devs_mask new_online_devs; + struct bch_dev *ca2; + int i, nr_rw = 0, required; + + lockdep_assert_held(&c->state_lock); + + switch (new_state) { + case BCH_MEMBER_STATE_rw: + return true; + case BCH_MEMBER_STATE_ro: + if (ca->mi.state != BCH_MEMBER_STATE_rw) + return true; + + /* do we have enough devices to write to? */ + for_each_member_device(ca2, c, i) + if (ca2 != ca) + nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw; + + required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED) + ? c->opts.metadata_replicas + : c->opts.metadata_replicas_required, + !(flags & BCH_FORCE_IF_DATA_DEGRADED) + ? c->opts.data_replicas + : c->opts.data_replicas_required); + + return nr_rw >= required; + case BCH_MEMBER_STATE_failed: + case BCH_MEMBER_STATE_spare: + if (ca->mi.state != BCH_MEMBER_STATE_rw && + ca->mi.state != BCH_MEMBER_STATE_ro) + return true; + + /* do we have enough devices to read from? */ + new_online_devs = bch2_online_devs(c); + __clear_bit(ca->dev_idx, new_online_devs.d); + + return bch2_have_enough_devs(c, new_online_devs, flags, false); + default: + BUG(); + } +} + +static bool bch2_fs_may_start(struct bch_fs *c) +{ + struct bch_sb_field_members *mi; + struct bch_dev *ca; + unsigned i, flags = 0; + + if (c->opts.very_degraded) + flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; + + if (c->opts.degraded) + flags |= BCH_FORCE_IF_DEGRADED; + + if (!c->opts.degraded && + !c->opts.very_degraded) { + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + if (!bch2_dev_exists(c->disk_sb.sb, mi, i)) + continue; + + ca = bch_dev_locked(c, i); + + if (!bch2_dev_is_online(ca) && + (ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro)) { + mutex_unlock(&c->sb_lock); + return false; + } + } + mutex_unlock(&c->sb_lock); + } + + return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); +} + +static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) +{ + /* + * The allocator thread itself allocates btree nodes, so stop it first: + */ + bch2_dev_allocator_remove(c, ca); + bch2_dev_journal_stop(&c->journal, ca); +} + +static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) +{ + lockdep_assert_held(&c->state_lock); + + BUG_ON(ca->mi.state != BCH_MEMBER_STATE_rw); + + bch2_dev_allocator_add(c, ca); + bch2_recalc_capacity(c); +} + +int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int flags) +{ + struct bch_sb_field_members *mi; + int ret = 0; + + if (ca->mi.state == new_state) + return 0; + + if (!bch2_dev_state_allowed(c, ca, new_state, flags)) + return -BCH_ERR_device_state_not_allowed; + + if (new_state != BCH_MEMBER_STATE_rw) + __bch2_dev_read_only(c, ca); + + bch_notice(ca, "%s", bch2_member_states[new_state]); + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + SET_BCH_MEMBER_STATE(&mi->members[ca->dev_idx], new_state); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (new_state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); + + rebalance_wakeup(c); + + return ret; +} + +int bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, + enum bch_member_state new_state, int 
flags) +{ + int ret; + + down_write(&c->state_lock); + ret = __bch2_dev_set_state(c, ca, new_state, flags); + up_write(&c->state_lock); + + return ret; +} + +/* Device add/removal: */ + +static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) +{ + struct bpos start = POS(ca->dev_idx, 0); + struct bpos end = POS(ca->dev_idx, U64_MAX); + int ret; + + /* + * We clear the LRU and need_discard btrees first so that we don't race + * with bch2_do_invalidates() and bch2_do_discards() + */ + ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_freespace, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_backpointers, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_alloc, start, end, + BTREE_TRIGGER_NORUN, NULL) ?: + bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end, + BTREE_TRIGGER_NORUN, NULL); + if (ret) + bch_err(c, "error removing dev alloc info: %s", bch2_err_str(ret)); + + return ret; +} + +int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) +{ + struct bch_sb_field_members *mi; + unsigned dev_idx = ca->dev_idx, data; + int ret; + + down_write(&c->state_lock); + + /* + * We consume a reference to ca->ref, regardless of whether we succeed + * or fail: + */ + percpu_ref_put(&ca->ref); + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { + bch_err(ca, "Cannot remove without losing data"); + ret = -BCH_ERR_device_state_not_allowed; + goto err; + } + + __bch2_dev_read_only(c, ca); + + ret = bch2_dev_data_drop(c, ca->dev_idx, flags); + if (ret) { + bch_err(ca, "Remove failed: error dropping data: %s", bch2_err_str(ret)); + goto err; + } + + ret = bch2_dev_remove_alloc(c, ca); + if (ret) { + bch_err(ca, "Remove failed, error deleting alloc info"); + goto err; + } + + ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx); + if (ret) { + bch_err(ca, "Remove failed: error flushing journal: %s", bch2_err_str(ret)); + goto err; + } + + ret = bch2_journal_flush(&c->journal); + if (ret) { + bch_err(ca, "Remove failed, journal error"); + goto err; + } + + ret = bch2_replicas_gc2(c); + if (ret) { + bch_err(ca, "Remove failed: error from replicas gc: %s", bch2_err_str(ret)); + goto err; + } + + data = bch2_dev_has_data(c, ca); + if (data) { + struct printbuf data_has = PRINTBUF; + + prt_bitflags(&data_has, bch2_data_types, data); + bch_err(ca, "Remove failed, still has data (%s)", data_has.buf); + printbuf_exit(&data_has); + ret = -EBUSY; + goto err; + } + + __bch2_dev_offline(c, ca); + + mutex_lock(&c->sb_lock); + rcu_assign_pointer(c->devs[ca->dev_idx], NULL); + mutex_unlock(&c->sb_lock); + + percpu_ref_kill(&ca->ref); + wait_for_completion(&ca->ref_completion); + + bch2_dev_free(ca); + + /* + * At this point the device object has been removed in-core, but the + * on-disk journal might still refer to the device index via sb device + * usage entries. Recovery fails if it sees usage information for an + * invalid device. Flush journal pins to push the back of the journal + * past now invalid device index references before we update the + * superblock, but after the device object has been removed so any + * further journal writes elide usage info for the device. 
+ */ + bch2_journal_flush_all_pins(&c->journal); + + /* + * Free this device's slot in the bch_member array - all pointers to + * this device must be gone: + */ + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + memset(&mi->members[dev_idx].uuid, 0, sizeof(mi->members[dev_idx].uuid)); + + bch2_write_super(c); + + mutex_unlock(&c->sb_lock); + up_write(&c->state_lock); + + bch2_dev_usage_journal_reserve(c); + return 0; +err: + if (ca->mi.state == BCH_MEMBER_STATE_rw && + !percpu_ref_is_zero(&ca->io_ref)) + __bch2_dev_read_write(c, ca); + up_write(&c->state_lock); + return ret; +} + +/* Add new device to running filesystem: */ +int bch2_dev_add(struct bch_fs *c, const char *path) +{ + struct bch_opts opts = bch2_opts_empty(); + struct bch_sb_handle sb; + struct bch_dev *ca = NULL; + struct bch_sb_field_members *mi; + struct bch_member dev_mi; + unsigned dev_idx, nr_devices, u64s; + struct printbuf errbuf = PRINTBUF; + struct printbuf label = PRINTBUF; + int ret; + + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + bch_err(c, "device add error: error reading super: %s", bch2_err_str(ret)); + goto err; + } + + dev_mi = bch2_sb_get_members(sb.sb)->members[sb.sb->dev_idx]; + + if (BCH_MEMBER_GROUP(&dev_mi)) { + bch2_disk_path_to_text(&label, sb.sb, BCH_MEMBER_GROUP(&dev_mi) - 1); + if (label.allocation_failure) { + ret = -ENOMEM; + goto err; + } + } + + ret = bch2_dev_may_add(sb.sb, c); + if (ret) { + bch_err(c, "device add error: %s", bch2_err_str(ret)); + goto err; + } + + ca = __bch2_dev_alloc(c, &dev_mi); + if (!ca) { + bch2_free_super(&sb); + ret = -ENOMEM; + goto err; + } + + bch2_dev_usage_init(ca); + + ret = __bch2_dev_attach_bdev(ca, &sb); + if (ret) { + bch2_dev_free(ca); + goto err; + } + + ret = bch2_dev_journal_alloc(ca); + if (ret) { + bch_err(c, "device add error: journal alloc failed"); + goto err; + } + + down_write(&c->state_lock); + mutex_lock(&c->sb_lock); + + ret = bch2_sb_from_fs(c, ca); + if (ret) { + bch_err(c, "device add error: new device superblock too small"); + goto err_unlock; + } + + mi = bch2_sb_get_members(ca->disk_sb.sb); + + if (!bch2_sb_resize_members(&ca->disk_sb, + le32_to_cpu(mi->field.u64s) + + sizeof(dev_mi) / sizeof(u64))) { + bch_err(c, "device add error: new device superblock too small"); + ret = -BCH_ERR_ENOSPC_sb_members; + goto err_unlock; + } + + if (dynamic_fault("bcachefs:add:no_slot")) + goto no_slot; + + mi = bch2_sb_get_members(c->disk_sb.sb); + for (dev_idx = 0; dev_idx < BCH_SB_MEMBERS_MAX; dev_idx++) + if (!bch2_dev_exists(c->disk_sb.sb, mi, dev_idx)) + goto have_slot; +no_slot: + bch_err(c, "device add error: already have maximum number of devices"); + ret = -BCH_ERR_ENOSPC_sb_members; + goto err_unlock; + +have_slot: + nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); + u64s = (sizeof(struct bch_sb_field_members) + + sizeof(struct bch_member) * nr_devices) / sizeof(u64); + + mi = bch2_sb_resize_members(&c->disk_sb, u64s); + if (!mi) { + bch_err(c, "device add error: no room in superblock for member info"); + ret = -BCH_ERR_ENOSPC_sb_members; + goto err_unlock; + } + + /* success: */ + + mi->members[dev_idx] = dev_mi; + mi->members[dev_idx].last_mount = cpu_to_le64(ktime_get_real_seconds()); + c->disk_sb.sb->nr_devices = nr_devices; + + ca->disk_sb.sb->dev_idx = dev_idx; + bch2_dev_attach(c, ca, dev_idx); + + if (BCH_MEMBER_GROUP(&dev_mi)) { + ret = __bch2_dev_group_set(c, ca, label.buf); + if (ret) { + bch_err(c, "device add error: error setting label"); + goto err_unlock; + } + } + + 
bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + bch2_dev_usage_journal_reserve(c); + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + bch_err(c, "device add error: error marking new superblock: %s", bch2_err_str(ret)); + goto err_late; + } + + ret = bch2_fs_freespace_init(c); + if (ret) { + bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + goto err_late; + } + + ca->new_fs_bucket_idx = 0; + + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); + + up_write(&c->state_lock); + return 0; + +err_unlock: + mutex_unlock(&c->sb_lock); + up_write(&c->state_lock); +err: + if (ca) + bch2_dev_free(ca); + bch2_free_super(&sb); + printbuf_exit(&label); + printbuf_exit(&errbuf); + return ret; +err_late: + up_write(&c->state_lock); + ca = NULL; + goto err; +} + +/* Hot add existing device to running filesystem: */ +int bch2_dev_online(struct bch_fs *c, const char *path) +{ + struct bch_opts opts = bch2_opts_empty(); + struct bch_sb_handle sb = { NULL }; + struct bch_sb_field_members *mi; + struct bch_dev *ca; + unsigned dev_idx; + int ret; + + down_write(&c->state_lock); + + ret = bch2_read_super(path, &opts, &sb); + if (ret) { + up_write(&c->state_lock); + return ret; + } + + dev_idx = sb.sb->dev_idx; + + ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb); + if (ret) { + bch_err(c, "error bringing %s online: %s", path, bch2_err_str(ret)); + goto err; + } + + ret = bch2_dev_attach_bdev(c, &sb); + if (ret) + goto err; + + ca = bch_dev_locked(c, dev_idx); + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) { + bch_err(c, "error bringing %s online: error from bch2_trans_mark_dev_sb: %s", + path, bch2_err_str(ret)); + goto err; + } + + if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); + + mutex_lock(&c->sb_lock); + mi = bch2_sb_get_members(c->disk_sb.sb); + + mi->members[ca->dev_idx].last_mount = + cpu_to_le64(ktime_get_real_seconds()); + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + ret = bch2_fs_freespace_init(c); + if (ret) + bch_err(c, "device add error: error initializing free space: %s", bch2_err_str(ret)); + + up_write(&c->state_lock); + return 0; +err: + up_write(&c->state_lock); + bch2_free_super(&sb); + return ret; +} + +int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) +{ + down_write(&c->state_lock); + + if (!bch2_dev_is_online(ca)) { + bch_err(ca, "Already offline"); + up_write(&c->state_lock); + return 0; + } + + if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { + bch_err(ca, "Cannot offline required disk"); + up_write(&c->state_lock); + return -BCH_ERR_device_state_not_allowed; + } + + __bch2_dev_offline(c, ca); + + up_write(&c->state_lock); + return 0; +} + +int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) +{ + struct bch_member *mi; + int ret = 0; + + down_write(&c->state_lock); + + if (nbuckets < ca->mi.nbuckets) { + bch_err(ca, "Cannot shrink yet"); + ret = -EINVAL; + goto err; + } + + if (bch2_dev_is_online(ca) && + get_capacity(ca->disk_sb.bdev->bd_disk) < + ca->mi.bucket_size * nbuckets) { + bch_err(ca, "New size larger than device"); + ret = -BCH_ERR_device_size_too_small; + goto err; + } + + ret = bch2_dev_buckets_resize(c, ca, nbuckets); + if (ret) { + bch_err(ca, "Resize error: %s", bch2_err_str(ret)); + goto err; + } + + ret = bch2_trans_mark_dev_sb(c, ca); + if (ret) + goto err; + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + mi->nbuckets = cpu_to_le64(nbuckets); + + 
bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + bch2_recalc_capacity(c); +err: + up_write(&c->state_lock); + return ret; +} + +/* return with ref on ca->ref: */ +struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) +{ + struct bch_dev *ca; + unsigned i; + + rcu_read_lock(); + for_each_member_device_rcu(ca, c, i, NULL) + if (!strcmp(name, ca->name)) + goto found; + ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); +found: + rcu_read_unlock(); + + return ca; +} + +/* Filesystem open: */ + +struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, + struct bch_opts opts) +{ + struct bch_sb_handle *sb = NULL; + struct bch_fs *c = NULL; + struct bch_sb_field_members *mi; + unsigned i, best_sb = 0; + struct printbuf errbuf = PRINTBUF; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return ERR_PTR(-ENODEV); + + if (!nr_devices) { + ret = -EINVAL; + goto err; + } + + sb = kcalloc(nr_devices, sizeof(*sb), GFP_KERNEL); + if (!sb) { + ret = -ENOMEM; + goto err; + } + + for (i = 0; i < nr_devices; i++) { + ret = bch2_read_super(devices[i], &opts, &sb[i]); + if (ret) + goto err; + + } + + for (i = 1; i < nr_devices; i++) + if (le64_to_cpu(sb[i].sb->seq) > + le64_to_cpu(sb[best_sb].sb->seq)) + best_sb = i; + + mi = bch2_sb_get_members(sb[best_sb].sb); + + i = 0; + while (i < nr_devices) { + if (i != best_sb && + !bch2_dev_exists(sb[best_sb].sb, mi, sb[i].sb->dev_idx)) { + pr_info("%pg has been removed, skipping", sb[i].bdev); + bch2_free_super(&sb[i]); + array_remove_item(sb, nr_devices, i); + continue; + } + + ret = bch2_dev_in_fs(sb[best_sb].sb, sb[i].sb); + if (ret) + goto err_print; + i++; + } + + c = bch2_fs_alloc(sb[best_sb].sb, opts); + if (IS_ERR(c)) { + ret = PTR_ERR(c); + goto err; + } + + down_write(&c->state_lock); + for (i = 0; i < nr_devices; i++) { + ret = bch2_dev_attach_bdev(c, &sb[i]); + if (ret) { + up_write(&c->state_lock); + goto err; + } + } + up_write(&c->state_lock); + + if (!bch2_fs_may_start(c)) { + ret = -BCH_ERR_insufficient_devices_to_start; + goto err_print; + } + + if (!c->opts.nostart) { + ret = bch2_fs_start(c); + if (ret) + goto err; + } +out: + kfree(sb); + printbuf_exit(&errbuf); + module_put(THIS_MODULE); + return c; +err_print: + pr_err("bch_fs_open err opening %s: %s", + devices[0], bch2_err_str(ret)); +err: + if (!IS_ERR_OR_NULL(c)) + bch2_fs_stop(c); + if (sb) + for (i = 0; i < nr_devices; i++) + bch2_free_super(&sb[i]); + c = ERR_PTR(ret); + goto out; +} + +/* Global interfaces/init */ + +static void bcachefs_exit(void) +{ + bch2_debug_exit(); + bch2_vfs_exit(); + bch2_chardev_exit(); + bch2_btree_key_cache_exit(); + if (bcachefs_kset) + kset_unregister(bcachefs_kset); +} + +static int __init bcachefs_init(void) +{ + bch2_bkey_pack_test(); + + if (!(bcachefs_kset = kset_create_and_add("bcachefs", NULL, fs_kobj)) || + bch2_btree_key_cache_init() || + bch2_chardev_init() || + bch2_vfs_init() || + bch2_debug_init()) + goto err; + + return 0; +err: + bcachefs_exit(); + return -ENOMEM; +} + +#define BCH_DEBUG_PARAM(name, description) \ + bool bch2_##name; \ + module_param_named(name, bch2_##name, bool, 0644); \ + MODULE_PARM_DESC(name, description); +BCH_DEBUG_PARAMS() +#undef BCH_DEBUG_PARAM + +static unsigned bch2_metadata_version = bcachefs_metadata_version_current; +module_param_named(version, bch2_metadata_version, uint, 0400); + +module_exit(bcachefs_exit); +module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h new file mode 100644 index 000000000..36bcb9ec2 --- /dev/null +++ 
b/fs/bcachefs/super.h @@ -0,0 +1,266 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_H +#define _BCACHEFS_SUPER_H + +#include "extents.h" + +#include "bcachefs_ioctl.h" + +#include <linux/math64.h> + +static inline size_t sector_to_bucket(const struct bch_dev *ca, sector_t s) +{ + return div_u64(s, ca->mi.bucket_size); +} + +static inline sector_t bucket_to_sector(const struct bch_dev *ca, size_t b) +{ + return ((sector_t) b) * ca->mi.bucket_size; +} + +static inline sector_t bucket_remainder(const struct bch_dev *ca, sector_t s) +{ + u32 remainder; + + div_u64_rem(s, ca->mi.bucket_size, &remainder); + return remainder; +} + +static inline size_t sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t s, + u32 *offset) +{ + return div_u64_rem(s, ca->mi.bucket_size, offset); +} + +static inline bool bch2_dev_is_online(struct bch_dev *ca) +{ + return !percpu_ref_is_zero(&ca->io_ref); +} + +static inline bool bch2_dev_is_readable(struct bch_dev *ca) +{ + return bch2_dev_is_online(ca) && + ca->mi.state != BCH_MEMBER_STATE_failed; +} + +static inline bool bch2_dev_get_ioref(struct bch_dev *ca, int rw) +{ + if (!percpu_ref_tryget(&ca->io_ref)) + return false; + + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) + return true; + + percpu_ref_put(&ca->io_ref); + return false; +} + +static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs) +{ + return bitmap_weight(devs->d, BCH_SB_MEMBERS_MAX); +} + +static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs.nr; i++) + if (devs.devs[i] == dev) + return true; + + return false; +} + +static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs, + unsigned dev) +{ + unsigned i; + + for (i = 0; i < devs->nr; i++) + if (devs->devs[i] == dev) { + array_remove_item(devs->devs, devs->nr, i); + return; + } +} + +static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs, + unsigned dev) +{ + if (!bch2_dev_list_has_dev(*devs, dev)) { + BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs)); + devs->devs[devs->nr++] = dev; + } +} + +static inline struct bch_devs_list bch2_dev_list_single(unsigned dev) +{ + return (struct bch_devs_list) { .nr = 1, .devs[0] = dev }; +} + +static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter, + const struct bch_devs_mask *mask) +{ + struct bch_dev *ca = NULL; + + while ((*iter = mask + ?
find_next_bit(mask->d, c->sb.nr_devices, *iter) + : *iter) < c->sb.nr_devices && + !(ca = rcu_dereference_check(c->devs[*iter], + lockdep_is_held(&c->state_lock)))) + (*iter)++; + + return ca; +} + +#define for_each_member_device_rcu(ca, c, iter, mask) \ + for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++) + +static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter) +{ + struct bch_dev *ca; + + rcu_read_lock(); + if ((ca = __bch2_next_dev(c, iter, NULL))) + percpu_ref_get(&ca->ref); + rcu_read_unlock(); + + return ca; +} + +/* + * If you break early, you must drop your ref on the current device + */ +#define for_each_member_device(ca, c, iter) \ + for ((iter) = 0; \ + (ca = bch2_get_next_dev(c, &(iter))); \ + percpu_ref_put(&ca->ref), (iter)++) + +static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, + unsigned *iter, + int state_mask) +{ + struct bch_dev *ca; + + rcu_read_lock(); + while ((ca = __bch2_next_dev(c, iter, NULL)) && + (!((1 << ca->mi.state) & state_mask) || + !percpu_ref_tryget(&ca->io_ref))) + (*iter)++; + rcu_read_unlock(); + + return ca; +} + +#define __for_each_online_member(ca, c, iter, state_mask) \ + for ((iter) = 0; \ + (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \ + percpu_ref_put(&ca->io_ref), (iter)++) + +#define for_each_online_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, ~0) + +#define for_each_rw_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw) + +#define for_each_readable_member(ca, c, iter) \ + __for_each_online_member(ca, c, iter, \ + (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro)) + +/* + * If a key exists that references a device, the device won't be going away and + * we can omit rcu_read_lock(): + */ +static inline struct bch_dev *bch_dev_bkey_exists(const struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_check(c->devs[idx], 1); +} + +static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx) +{ + EBUG_ON(idx >= c->sb.nr_devices || !c->devs[idx]); + + return rcu_dereference_protected(c->devs[idx], + lockdep_is_held(&c->sb_lock) || + lockdep_is_held(&c->state_lock)); +} + +/* XXX kill, move to struct bch_fs */ +static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) +{ + struct bch_devs_mask devs; + struct bch_dev *ca; + unsigned i; + + memset(&devs, 0, sizeof(devs)); + for_each_online_member(ca, c, i) + __set_bit(ca->dev_idx, devs.d); + return devs; +} + +static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b) +{ + struct bch_sb_layout *layout = &ca->disk_sb.sb->layout; + u64 b_offset = bucket_to_sector(ca, b); + u64 b_end = bucket_to_sector(ca, b + 1); + unsigned i; + + if (!b) + return true; + + for (i = 0; i < layout->nr_superblocks; i++) { + u64 offset = le64_to_cpu(layout->sb_offset[i]); + u64 end = offset + (1 << layout->sb_max_size_bits); + + if (!(offset >= b_end || end <= b_offset)) + return true; + } + + return false; +} + +struct bch_fs *bch2_dev_to_fs(dev_t); +struct bch_fs *bch2_uuid_to_fs(__uuid_t); + +bool bch2_dev_state_allowed(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); +int __bch2_dev_set_state(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); +int bch2_dev_set_state(struct bch_fs *, struct bch_dev *, + enum bch_member_state, int); + +int bch2_dev_fail(struct bch_dev *, int); +int bch2_dev_remove(struct bch_fs *, struct bch_dev *, int); +int 
bch2_dev_add(struct bch_fs *, const char *); +int bch2_dev_online(struct bch_fs *, const char *); +int bch2_dev_offline(struct bch_fs *, struct bch_dev *, int); +int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); +struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); + +bool bch2_fs_emergency_read_only(struct bch_fs *); +void bch2_fs_read_only(struct bch_fs *); + +int bch2_fs_read_write(struct bch_fs *); +int bch2_fs_read_write_early(struct bch_fs *); + +/* + * Only for use in the recovery/fsck path: + */ +static inline void bch2_fs_lazy_rw(struct bch_fs *c) +{ + if (!test_bit(BCH_FS_RW, &c->flags) && + !test_bit(BCH_FS_WAS_RW, &c->flags)) + bch2_fs_read_write_early(c); +} + +void __bch2_fs_stop(struct bch_fs *); +void bch2_fs_free(struct bch_fs *); +void bch2_fs_stop(struct bch_fs *); + +int bch2_fs_start(struct bch_fs *); +struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); + +#endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h new file mode 100644 index 000000000..89419fc79 --- /dev/null +++ b/fs/bcachefs/super_types.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SUPER_TYPES_H +#define _BCACHEFS_SUPER_TYPES_H + +struct bch_sb_handle { + struct bch_sb *sb; + struct block_device *bdev; + struct bio *bio; + size_t buffer_size; + fmode_t mode; + unsigned have_layout:1; + unsigned have_bio:1; + unsigned fs_sb:1; + u64 seq; +}; + +struct bch_devs_mask { + unsigned long d[BITS_TO_LONGS(BCH_SB_MEMBERS_MAX)]; +}; + +struct bch_devs_list { + u8 nr; + u8 devs[BCH_BKEY_PTRS_MAX]; +}; + +struct bch_member_cpu { + u64 nbuckets; /* device size */ + u16 first_bucket; /* index of first bucket used */ + u16 bucket_size; /* sectors */ + u16 group; + u8 state; + u8 discard; + u8 data_allowed; + u8 durability; + u8 freespace_initialized; + u8 valid; +}; + +struct bch_disk_group_cpu { + bool deleted; + u16 parent; + struct bch_devs_mask devs; +}; + +struct bch_disk_groups_cpu { + struct rcu_head rcu; + unsigned nr; + struct bch_disk_group_cpu entries[]; +}; + +#endif /* _BCACHEFS_SUPER_TYPES_H */ diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c new file mode 100644 index 000000000..740305e67 --- /dev/null +++ b/fs/bcachefs/sysfs.c @@ -0,0 +1,1064 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bcache sysfs interfaces + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. 
+ */ + +#ifndef NO_BCACHEFS_SYSFS + +#include "bcachefs.h" +#include "alloc_background.h" +#include "alloc_foreground.h" +#include "sysfs.h" +#include "btree_cache.h" +#include "btree_io.h" +#include "btree_iter.h" +#include "btree_key_cache.h" +#include "btree_update.h" +#include "btree_update_interior.h" +#include "btree_gc.h" +#include "buckets.h" +#include "clock.h" +#include "disk_groups.h" +#include "ec.h" +#include "inode.h" +#include "journal.h" +#include "keylist.h" +#include "move.h" +#include "movinggc.h" +#include "nocow_locking.h" +#include "opts.h" +#include "rebalance.h" +#include "replicas.h" +#include "super-io.h" +#include "tests.h" + +#include +#include +#include + +#include "util.h" + +#define SYSFS_OPS(type) \ +const struct sysfs_ops type ## _sysfs_ops = { \ + .show = type ## _show, \ + .store = type ## _store \ +} + +#define SHOW(fn) \ +static ssize_t fn ## _to_text(struct printbuf *, \ + struct kobject *, struct attribute *); \ + \ +static ssize_t fn ## _show(struct kobject *kobj, struct attribute *attr,\ + char *buf) \ +{ \ + struct printbuf out = PRINTBUF; \ + ssize_t ret = fn ## _to_text(&out, kobj, attr); \ + \ + if (out.pos && out.buf[out.pos - 1] != '\n') \ + prt_newline(&out); \ + \ + if (!ret && out.allocation_failure) \ + ret = -ENOMEM; \ + \ + if (!ret) { \ + ret = min_t(size_t, out.pos, PAGE_SIZE - 1); \ + memcpy(buf, out.buf, ret); \ + } \ + printbuf_exit(&out); \ + return bch2_err_class(ret); \ +} \ + \ +static ssize_t fn ## _to_text(struct printbuf *out, struct kobject *kobj,\ + struct attribute *attr) + +#define STORE(fn) \ +static ssize_t fn ## _store_inner(struct kobject *, struct attribute *,\ + const char *, size_t); \ + \ +static ssize_t fn ## _store(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) \ +{ \ + return bch2_err_class(fn##_store_inner(kobj, attr, buf, size)); \ +} \ + \ +static ssize_t fn ## _store_inner(struct kobject *kobj, struct attribute *attr,\ + const char *buf, size_t size) + +#define __sysfs_attribute(_name, _mode) \ + static struct attribute sysfs_##_name = \ + { .name = #_name, .mode = _mode } + +#define write_attribute(n) __sysfs_attribute(n, 0200) +#define read_attribute(n) __sysfs_attribute(n, 0444) +#define rw_attribute(n) __sysfs_attribute(n, 0644) + +#define sysfs_printf(file, fmt, ...) 
\ +do { \ + if (attr == &sysfs_ ## file) \ + prt_printf(out, fmt "\n", __VA_ARGS__); \ +} while (0) + +#define sysfs_print(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + snprint(out, var); \ +} while (0) + +#define sysfs_hprint(file, val) \ +do { \ + if (attr == &sysfs_ ## file) \ + prt_human_readable_s64(out, val); \ +} while (0) + +#define var_printf(_var, fmt) sysfs_printf(_var, fmt, var(_var)) +#define var_print(_var) sysfs_print(_var, var(_var)) +#define var_hprint(_var) sysfs_hprint(_var, var(_var)) + +#define sysfs_strtoul(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe(buf, var) ?: (ssize_t) size; \ +} while (0) + +#define sysfs_strtoul_clamp(file, var, min, max) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoul_safe_clamp(buf, var, min, max) \ + ?: (ssize_t) size; \ +} while (0) + +#define strtoul_or_return(cp) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (_r) \ + return _r; \ + _v; \ +}) + +#define strtoul_restrict_or_return(cp, min, max) \ +({ \ + unsigned long __v = 0; \ + int _r = strtoul_safe_restrict(cp, __v, min, max); \ + if (_r) \ + return _r; \ + __v; \ +}) + +#define strtoi_h_or_return(cp) \ +({ \ + u64 _v; \ + int _r = strtoi_h(cp, &_v); \ + if (_r) \ + return _r; \ + _v; \ +}) + +#define sysfs_hatoi(file, var) \ +do { \ + if (attr == &sysfs_ ## file) \ + return strtoi_h(buf, &var) ?: (ssize_t) size; \ +} while (0) + +write_attribute(trigger_gc); +write_attribute(trigger_discards); +write_attribute(trigger_invalidates); +write_attribute(prune_cache); +write_attribute(btree_wakeup); +rw_attribute(btree_gc_periodic); +rw_attribute(gc_gens_pos); + +read_attribute(uuid); +read_attribute(minor); +read_attribute(bucket_size); +read_attribute(first_bucket); +read_attribute(nbuckets); +rw_attribute(durability); +read_attribute(iodone); + +read_attribute(io_latency_read); +read_attribute(io_latency_write); +read_attribute(io_latency_stats_read); +read_attribute(io_latency_stats_write); +read_attribute(congested); + +read_attribute(btree_write_stats); + +read_attribute(btree_cache_size); +read_attribute(compression_stats); +read_attribute(journal_debug); +read_attribute(btree_updates); +read_attribute(btree_cache); +read_attribute(btree_key_cache); +read_attribute(stripes_heap); +read_attribute(open_buckets); +read_attribute(open_buckets_partial); +read_attribute(write_points); +read_attribute(nocow_lock_table); + +#ifdef BCH_WRITE_REF_DEBUG +read_attribute(write_refs); + +static const char * const bch2_write_refs[] = { +#define x(n) #n, + BCH_WRITE_REFS() +#undef x + NULL +}; + +static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_tabstop_push(out, 24); + + for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) { + prt_str(out, bch2_write_refs[i]); + prt_tab(out); + prt_printf(out, "%li", atomic_long_read(&c->writes[i])); + prt_newline(out); + } +} +#endif + +read_attribute(internal_uuid); +read_attribute(disk_groups); + +read_attribute(has_data); +read_attribute(alloc_debug); + +#define x(t, n, ...) 
read_attribute(t); +BCH_PERSISTENT_COUNTERS() +#undef x + +rw_attribute(discard); +rw_attribute(label); + +rw_attribute(copy_gc_enabled); +read_attribute(copy_gc_wait); + +rw_attribute(rebalance_enabled); +sysfs_pd_controller_attribute(rebalance); +read_attribute(rebalance_work); +rw_attribute(promote_whole_extents); + +read_attribute(new_stripes); + +read_attribute(io_timers_read); +read_attribute(io_timers_write); + +read_attribute(data_jobs); +read_attribute(moving_ctxts); + +#ifdef CONFIG_BCACHEFS_TESTS +write_attribute(perf_test); +#endif /* CONFIG_BCACHEFS_TESTS */ + +#define x(_name) \ + static struct attribute sysfs_time_stat_##_name = \ + { .name = #_name, .mode = 0444 }; + BCH_TIME_STATS() +#undef x + +static struct attribute sysfs_state_rw = { + .name = "state", + .mode = 0444, +}; + +static size_t bch2_btree_cache_size(struct bch_fs *c) +{ + size_t ret = 0; + struct btree *b; + + mutex_lock(&c->btree_cache.lock); + list_for_each_entry(b, &c->btree_cache.live, list) + ret += btree_bytes(c); + + mutex_unlock(&c->btree_cache.lock); + return ret; +} + +static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + enum btree_id id; + u64 nr_uncompressed_extents = 0, + nr_compressed_extents = 0, + nr_incompressible_extents = 0, + uncompressed_sectors = 0, + incompressible_sectors = 0, + compressed_sectors_compressed = 0, + compressed_sectors_uncompressed = 0; + int ret; + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EPERM; + + bch2_trans_init(&trans, c, 0, 0); + + for (id = 0; id < BTREE_ID_NR; id++) { + if (!btree_type_has_ptrs(id)) + continue; + + for_each_btree_key(&trans, iter, id, POS_MIN, + BTREE_ITER_ALL_SNAPSHOTS, k, ret) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bool compressed = false, uncompressed = false, incompressible = false; + + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + switch (p.crc.compression_type) { + case BCH_COMPRESSION_TYPE_none: + uncompressed = true; + uncompressed_sectors += k.k->size; + break; + case BCH_COMPRESSION_TYPE_incompressible: + incompressible = true; + incompressible_sectors += k.k->size; + break; + default: + compressed_sectors_compressed += + p.crc.compressed_size; + compressed_sectors_uncompressed += + p.crc.uncompressed_size; + compressed = true; + break; + } + } + + if (incompressible) + nr_incompressible_extents++; + else if (uncompressed) + nr_uncompressed_extents++; + else if (compressed) + nr_compressed_extents++; + } + bch2_trans_iter_exit(&trans, &iter); + } + + bch2_trans_exit(&trans); + + if (ret) + return ret; + + prt_printf(out, "uncompressed:\n"); + prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents); + prt_printf(out, " size: "); + prt_human_readable_u64(out, uncompressed_sectors << 9); + prt_printf(out, "\n"); + + prt_printf(out, "compressed:\n"); + prt_printf(out, " nr extents: %llu\n", nr_compressed_extents); + prt_printf(out, " compressed size: "); + prt_human_readable_u64(out, compressed_sectors_compressed << 9); + prt_printf(out, "\n"); + prt_printf(out, " uncompressed size: "); + prt_human_readable_u64(out, compressed_sectors_uncompressed << 9); + prt_printf(out, "\n"); + + prt_printf(out, "incompressible:\n"); + prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents); + prt_printf(out, " size: "); + prt_human_readable_u64(out, incompressible_sectors << 9); + prt_printf(out, "\n"); + return 0; +} + +static 
void bch2_gc_gens_pos_to_text(struct printbuf *out, struct bch_fs *c) +{ + prt_printf(out, "%s: ", bch2_btree_ids[c->gc_gens_btree]); + bch2_bpos_to_text(out, c->gc_gens_pos); + prt_printf(out, "\n"); +} + +static void bch2_btree_wakeup_all(struct bch_fs *c) +{ + struct btree_trans *trans; + + seqmutex_lock(&c->btree_trans_lock); + list_for_each_entry(trans, &c->btree_trans_list, list) { + struct btree_bkey_cached_common *b = READ_ONCE(trans->locking); + + if (b) + six_lock_wakeup_all(&b->lock); + + } + seqmutex_unlock(&c->btree_trans_lock); +} + +SHOW(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + sysfs_print(minor, c->minor); + sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b); + + sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c)); + + if (attr == &sysfs_btree_write_stats) + bch2_btree_write_stats_to_text(out, c); + + sysfs_printf(btree_gc_periodic, "%u", (int) c->btree_gc_periodic); + + if (attr == &sysfs_gc_gens_pos) + bch2_gc_gens_pos_to_text(out, c); + + sysfs_printf(copy_gc_enabled, "%i", c->copy_gc_enabled); + + sysfs_printf(rebalance_enabled, "%i", c->rebalance.enabled); + sysfs_pd_controller_show(rebalance, &c->rebalance.pd); /* XXX */ + + if (attr == &sysfs_copy_gc_wait) + bch2_copygc_wait_to_text(out, c); + + if (attr == &sysfs_rebalance_work) + bch2_rebalance_work_to_text(out, c); + + sysfs_print(promote_whole_extents, c->promote_whole_extents); + + /* Debugging: */ + + if (attr == &sysfs_journal_debug) + bch2_journal_debug_to_text(out, &c->journal); + + if (attr == &sysfs_btree_updates) + bch2_btree_updates_to_text(out, c); + + if (attr == &sysfs_btree_cache) + bch2_btree_cache_to_text(out, &c->btree_cache); + + if (attr == &sysfs_btree_key_cache) + bch2_btree_key_cache_to_text(out, &c->btree_key_cache); + + if (attr == &sysfs_stripes_heap) + bch2_stripes_heap_to_text(out, c); + + if (attr == &sysfs_open_buckets) + bch2_open_buckets_to_text(out, c); + + if (attr == &sysfs_open_buckets_partial) + bch2_open_buckets_partial_to_text(out, c); + + if (attr == &sysfs_write_points) + bch2_write_points_to_text(out, c); + + if (attr == &sysfs_compression_stats) + bch2_compression_stats_to_text(out, c); + + if (attr == &sysfs_new_stripes) + bch2_new_stripes_to_text(out, c); + + if (attr == &sysfs_io_timers_read) + bch2_io_timers_to_text(out, &c->io_clock[READ]); + + if (attr == &sysfs_io_timers_write) + bch2_io_timers_to_text(out, &c->io_clock[WRITE]); + + if (attr == &sysfs_data_jobs) + bch2_data_jobs_to_text(out, c); + + if (attr == &sysfs_moving_ctxts) + bch2_fs_moving_ctxts_to_text(out, c); + +#ifdef BCH_WRITE_REF_DEBUG + if (attr == &sysfs_write_refs) + bch2_write_refs_to_text(out, c); +#endif + + if (attr == &sysfs_nocow_lock_table) + bch2_nocow_locks_to_text(out, &c->nocow_locks); + + if (attr == &sysfs_disk_groups) + bch2_disk_groups_to_text(out, c); + + return 0; +} + +STORE(bch2_fs) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, kobj); + + if (attr == &sysfs_btree_gc_periodic) { + ssize_t ret = strtoul_safe(buf, c->btree_gc_periodic) + ?: (ssize_t) size; + + wake_up_process(c->gc_thread); + return ret; + } + + if (attr == &sysfs_copy_gc_enabled) { + ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) + ?: (ssize_t) size; + + if (c->copygc_thread) + wake_up_process(c->copygc_thread); + return ret; + } + + if (attr == &sysfs_rebalance_enabled) { + ssize_t ret = strtoul_safe(buf, c->rebalance.enabled) + ?: (ssize_t) size; + + rebalance_wakeup(c); + return ret; + } + + sysfs_pd_controller_store(rebalance, &c->rebalance.pd); + + 
sysfs_strtoul(promote_whole_extents, c->promote_whole_extents); + + /* Debugging: */ + + if (!test_bit(BCH_FS_STARTED, &c->flags)) + return -EPERM; + + /* Debugging: */ + + if (!test_bit(BCH_FS_RW, &c->flags)) + return -EROFS; + + if (attr == &sysfs_prune_cache) { + struct shrink_control sc; + + sc.gfp_mask = GFP_KERNEL; + sc.nr_to_scan = strtoul_or_return(buf); + c->btree_cache.shrink.scan_objects(&c->btree_cache.shrink, &sc); + } + + if (attr == &sysfs_btree_wakeup) + bch2_btree_wakeup_all(c); + + if (attr == &sysfs_trigger_gc) { + /* + * Full gc is currently incompatible with btree key cache: + */ +#if 0 + down_read(&c->state_lock); + bch2_gc(c, false, false); + up_read(&c->state_lock); +#else + bch2_gc_gens(c); +#endif + } + + if (attr == &sysfs_trigger_discards) + bch2_do_discards(c); + + if (attr == &sysfs_trigger_invalidates) + bch2_do_invalidates(c); + +#ifdef CONFIG_BCACHEFS_TESTS + if (attr == &sysfs_perf_test) { + char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; + char *test = strsep(&p, " \t\n"); + char *nr_str = strsep(&p, " \t\n"); + char *threads_str = strsep(&p, " \t\n"); + unsigned threads; + u64 nr; + int ret = -EINVAL; + + if (threads_str && + !(ret = kstrtouint(threads_str, 10, &threads)) && + !(ret = bch2_strtoull_h(nr_str, &nr))) + ret = bch2_btree_perf_test(c, test, nr, threads); + kfree(tmp); + + if (ret) + size = ret; + } +#endif + return size; +} +SYSFS_OPS(bch2_fs); + +struct attribute *bch2_fs_files[] = { + &sysfs_minor, + &sysfs_btree_cache_size, + &sysfs_btree_write_stats, + + &sysfs_promote_whole_extents, + + &sysfs_compression_stats, + +#ifdef CONFIG_BCACHEFS_TESTS + &sysfs_perf_test, +#endif + NULL +}; + +/* counters dir */ + +SHOW(bch2_fs_counters) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, counters_kobj); + u64 counter = 0; + u64 counter_since_mount = 0; + + printbuf_tabstop_push(out, 32); + + #define x(t, ...) \ + if (attr == &sysfs_##t) { \ + counter = percpu_u64_get(&c->counters[BCH_COUNTER_##t]);\ + counter_since_mount = counter - c->counters_on_mount[BCH_COUNTER_##t];\ + prt_printf(out, "since mount:"); \ + prt_tab(out); \ + prt_human_readable_u64(out, counter_since_mount); \ + prt_newline(out); \ + \ + prt_printf(out, "since filesystem creation:"); \ + prt_tab(out); \ + prt_human_readable_u64(out, counter); \ + prt_newline(out); \ + } + BCH_PERSISTENT_COUNTERS() + #undef x + return 0; +} + +STORE(bch2_fs_counters) { + return 0; +} + +SYSFS_OPS(bch2_fs_counters); + +struct attribute *bch2_fs_counters_files[] = { +#define x(t, ...) 
\ + &sysfs_##t, + BCH_PERSISTENT_COUNTERS() +#undef x + NULL +}; +/* internal dir - just a wrapper */ + +SHOW(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + + return bch2_fs_to_text(out, &c->kobj, attr); +} + +STORE(bch2_fs_internal) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, internal); + + return bch2_fs_store(&c->kobj, attr, buf, size); +} +SYSFS_OPS(bch2_fs_internal); + +struct attribute *bch2_fs_internal_files[] = { + &sysfs_journal_debug, + &sysfs_btree_updates, + &sysfs_btree_cache, + &sysfs_btree_key_cache, + &sysfs_new_stripes, + &sysfs_stripes_heap, + &sysfs_open_buckets, + &sysfs_open_buckets_partial, + &sysfs_write_points, +#ifdef BCH_WRITE_REF_DEBUG + &sysfs_write_refs, +#endif + &sysfs_nocow_lock_table, + &sysfs_io_timers_read, + &sysfs_io_timers_write, + + &sysfs_trigger_gc, + &sysfs_trigger_discards, + &sysfs_trigger_invalidates, + &sysfs_prune_cache, + &sysfs_btree_wakeup, + + &sysfs_gc_gens_pos, + + &sysfs_copy_gc_enabled, + &sysfs_copy_gc_wait, + + &sysfs_rebalance_enabled, + &sysfs_rebalance_work, + sysfs_pd_controller_files(rebalance), + + &sysfs_data_jobs, + &sysfs_moving_ctxts, + + &sysfs_internal_uuid, + + &sysfs_disk_groups, + NULL +}; + +/* options */ + +SHOW(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); + int id = opt - bch2_opt_table; + u64 v = bch2_opt_get_by_id(&c->opts, id); + + bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); + prt_char(out, '\n'); + + return 0; +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + const struct bch_option *opt = container_of(attr, struct bch_option, attr); + int ret, id = opt - bch2_opt_table; + char *tmp; + u64 v; + + /* + * We don't need to take c->writes for correctness, but it eliminates an + * unsightly error message in the dmesg log when we're RO: + */ + if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) + return -EROFS; + + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) { + ret = -ENOMEM; + goto err; + } + + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); + kfree(tmp); + + if (ret < 0) + goto err; + + ret = bch2_opt_check_may_set(c, id, v); + if (ret < 0) + goto err; + + bch2_opt_set_sb(c, opt, v); + bch2_opt_set_by_id(&c->opts, id, v); + + if ((id == Opt_background_target || + id == Opt_background_compression) && v) { + bch2_rebalance_add_work(c, S64_MAX); + rebalance_wakeup(c); + } + + ret = size; +err: + bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + return ret; +} +SYSFS_OPS(bch2_fs_opts_dir); + +struct attribute *bch2_fs_opts_dir_files[] = { NULL }; + +int bch2_opts_create_sysfs_files(struct kobject *kobj) +{ + const struct bch_option *i; + int ret; + + for (i = bch2_opt_table; + i < bch2_opt_table + bch2_opts_nr; + i++) { + if (!(i->flags & OPT_FS)) + continue; + + ret = sysfs_create_file(kobj, &i->attr); + if (ret) + return ret; + } + + return 0; +} + +/* time stats */ + +SHOW(bch2_fs_time_stats) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, time_stats); + +#define x(name) \ + if (attr == &sysfs_time_stat_##name) \ + bch2_time_stats_to_text(out, &c->times[BCH_TIME_##name]); + BCH_TIME_STATS() +#undef x + + return 0; +} + +STORE(bch2_fs_time_stats) +{ + return size; +} +SYSFS_OPS(bch2_fs_time_stats); + +struct attribute *bch2_fs_time_stats_files[] = { +#define x(name) \ + &sysfs_time_stat_##name, + BCH_TIME_STATS() +#undef x + NULL +}; + +static void 
dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) +{ + struct bch_fs *c = ca->fs; + struct bch_dev_usage stats = bch2_dev_usage_read(ca); + unsigned i, nr[BCH_DATA_NR]; + + memset(nr, 0, sizeof(nr)); + + for (i = 0; i < ARRAY_SIZE(c->open_buckets); i++) + nr[c->open_buckets[i].data_type]++; + + printbuf_tabstop_push(out, 8); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + printbuf_tabstop_push(out, 16); + + prt_tab(out); + prt_str(out, "buckets"); + prt_tab_rjust(out); + prt_str(out, "sectors"); + prt_tab_rjust(out); + prt_str(out, "fragmented"); + prt_tab_rjust(out); + prt_newline(out); + + for (i = 0; i < BCH_DATA_NR; i++) { + prt_str(out, bch2_data_types[i]); + prt_tab(out); + prt_u64(out, stats.d[i].buckets); + prt_tab_rjust(out); + prt_u64(out, stats.d[i].sectors); + prt_tab_rjust(out); + prt_u64(out, stats.d[i].fragmented); + prt_tab_rjust(out); + prt_newline(out); + } + + prt_str(out, "ec"); + prt_tab(out); + prt_u64(out, stats.buckets_ec); + prt_tab_rjust(out); + prt_newline(out); + + prt_newline(out); + + prt_printf(out, "reserves:"); + prt_newline(out); + for (i = 0; i < BCH_WATERMARK_NR; i++) { + prt_str(out, bch2_watermarks[i]); + prt_tab(out); + prt_u64(out, bch2_dev_buckets_reserved(ca, i)); + prt_tab_rjust(out); + prt_newline(out); + } + + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, 24); + + prt_str(out, "freelist_wait"); + prt_tab(out); + prt_str(out, c->freelist_wait.list.first ? "waiting" : "empty"); + prt_newline(out); + + prt_str(out, "open buckets allocated"); + prt_tab(out); + prt_u64(out, OPEN_BUCKETS_COUNT - c->open_buckets_nr_free); + prt_newline(out); + + prt_str(out, "open buckets this dev"); + prt_tab(out); + prt_u64(out, ca->nr_open_buckets); + prt_newline(out); + + prt_str(out, "open buckets total"); + prt_tab(out); + prt_u64(out, OPEN_BUCKETS_COUNT); + prt_newline(out); + + prt_str(out, "open_buckets_wait"); + prt_tab(out); + prt_str(out, c->open_buckets_wait.list.first ? 
"waiting" : "empty"); + prt_newline(out); + + prt_str(out, "open_buckets_btree"); + prt_tab(out); + prt_u64(out, nr[BCH_DATA_btree]); + prt_newline(out); + + prt_str(out, "open_buckets_user"); + prt_tab(out); + prt_u64(out, nr[BCH_DATA_user]); + prt_newline(out); + + prt_str(out, "buckets_to_invalidate"); + prt_tab(out); + prt_u64(out, should_invalidate_buckets(ca, stats)); + prt_newline(out); + + prt_str(out, "btree reserve cache"); + prt_tab(out); + prt_u64(out, c->btree_reserve_cache_nr); + prt_newline(out); +} + +static const char * const bch2_rw[] = { + "read", + "write", + NULL +}; + +static void dev_iodone_to_text(struct printbuf *out, struct bch_dev *ca) +{ + int rw, i; + + for (rw = 0; rw < 2; rw++) { + prt_printf(out, "%s:\n", bch2_rw[rw]); + + for (i = 1; i < BCH_DATA_NR; i++) + prt_printf(out, "%-12s:%12llu\n", + bch2_data_types[i], + percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9); + } +} + +SHOW(bch2_dev) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; + + sysfs_printf(uuid, "%pU\n", ca->uuid.b); + + sysfs_print(bucket_size, bucket_bytes(ca)); + sysfs_print(first_bucket, ca->mi.first_bucket); + sysfs_print(nbuckets, ca->mi.nbuckets); + sysfs_print(durability, ca->mi.durability); + sysfs_print(discard, ca->mi.discard); + + if (attr == &sysfs_label) { + if (ca->mi.group) { + mutex_lock(&c->sb_lock); + bch2_disk_path_to_text(out, c->disk_sb.sb, + ca->mi.group - 1); + mutex_unlock(&c->sb_lock); + } + + prt_char(out, '\n'); + } + + if (attr == &sysfs_has_data) { + prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca)); + prt_char(out, '\n'); + } + + if (attr == &sysfs_state_rw) { + prt_string_option(out, bch2_member_states, ca->mi.state); + prt_char(out, '\n'); + } + + if (attr == &sysfs_iodone) + dev_iodone_to_text(out, ca); + + sysfs_print(io_latency_read, atomic64_read(&ca->cur_latency[READ])); + sysfs_print(io_latency_write, atomic64_read(&ca->cur_latency[WRITE])); + + if (attr == &sysfs_io_latency_stats_read) + bch2_time_stats_to_text(out, &ca->io_latency[READ]); + + if (attr == &sysfs_io_latency_stats_write) + bch2_time_stats_to_text(out, &ca->io_latency[WRITE]); + + sysfs_printf(congested, "%u%%", + clamp(atomic_read(&ca->congested), 0, CONGESTED_MAX) + * 100 / CONGESTED_MAX); + + if (attr == &sysfs_alloc_debug) + dev_alloc_debug_to_text(out, ca); + + return 0; +} + +STORE(bch2_dev) +{ + struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); + struct bch_fs *c = ca->fs; + struct bch_member *mi; + + if (attr == &sysfs_discard) { + bool v = strtoul_or_return(buf); + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + + if (v != BCH_MEMBER_DISCARD(mi)) { + SET_BCH_MEMBER_DISCARD(mi, v); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + if (attr == &sysfs_durability) { + u64 v = strtoul_or_return(buf); + + mutex_lock(&c->sb_lock); + mi = &bch2_sb_get_members(c->disk_sb.sb)->members[ca->dev_idx]; + + if (v != BCH_MEMBER_DURABILITY(mi)) { + SET_BCH_MEMBER_DURABILITY(mi, v + 1); + bch2_write_super(c); + } + mutex_unlock(&c->sb_lock); + } + + if (attr == &sysfs_label) { + char *tmp; + int ret; + + tmp = kstrdup(buf, GFP_KERNEL); + if (!tmp) + return -ENOMEM; + + ret = bch2_dev_group_set(c, ca, strim(tmp)); + kfree(tmp); + if (ret) + return ret; + } + + return size; +} +SYSFS_OPS(bch2_dev); + +struct attribute *bch2_dev_files[] = { + &sysfs_uuid, + &sysfs_bucket_size, + &sysfs_first_bucket, + &sysfs_nbuckets, + &sysfs_durability, + + /* settings: */ + 
&sysfs_discard, + &sysfs_state_rw, + &sysfs_label, + + &sysfs_has_data, + &sysfs_iodone, + + &sysfs_io_latency_read, + &sysfs_io_latency_write, + &sysfs_io_latency_stats_read, + &sysfs_io_latency_stats_write, + &sysfs_congested, + + /* debug: */ + &sysfs_alloc_debug, + NULL +}; + +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h new file mode 100644 index 000000000..222cd5062 --- /dev/null +++ b/fs/bcachefs/sysfs.h @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SYSFS_H_ +#define _BCACHEFS_SYSFS_H_ + +#include + +#ifndef NO_BCACHEFS_SYSFS + +struct attribute; +struct sysfs_ops; + +extern struct attribute *bch2_fs_files[]; +extern struct attribute *bch2_fs_counters_files[]; +extern struct attribute *bch2_fs_internal_files[]; +extern struct attribute *bch2_fs_opts_dir_files[]; +extern struct attribute *bch2_fs_time_stats_files[]; +extern struct attribute *bch2_dev_files[]; + +extern const struct sysfs_ops bch2_fs_sysfs_ops; +extern const struct sysfs_ops bch2_fs_counters_sysfs_ops; +extern const struct sysfs_ops bch2_fs_internal_sysfs_ops; +extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; +extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; +extern const struct sysfs_ops bch2_dev_sysfs_ops; + +int bch2_opts_create_sysfs_files(struct kobject *); + +#else + +static struct attribute *bch2_fs_files[] = {}; +static struct attribute *bch2_fs_counters_files[] = {}; +static struct attribute *bch2_fs_internal_files[] = {}; +static struct attribute *bch2_fs_opts_dir_files[] = {}; +static struct attribute *bch2_fs_time_stats_files[] = {}; +static struct attribute *bch2_dev_files[] = {}; + +static const struct sysfs_ops bch2_fs_sysfs_ops; +static const struct sysfs_ops bch2_fs_counters_sysfs_ops; +static const struct sysfs_ops bch2_fs_internal_sysfs_ops; +static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; +static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; +static const struct sysfs_ops bch2_dev_sysfs_ops; + +static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } + +#endif /* NO_BCACHEFS_SYSFS */ + +#endif /* _BCACHEFS_SYSFS_H_ */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c new file mode 100644 index 000000000..cef23d2cc --- /dev/null +++ b/fs/bcachefs/tests.c @@ -0,0 +1,939 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifdef CONFIG_BCACHEFS_TESTS + +#include "bcachefs.h" +#include "btree_update.h" +#include "journal_reclaim.h" +#include "subvolume.h" +#include "tests.h" + +#include "linux/kthread.h" +#include "linux/random.h" + +static void delete_test_keys(struct bch_fs *c) +{ + int ret; + + ret = bch2_btree_delete_range(c, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), + 0, NULL); + BUG_ON(ret); + + ret = bch2_btree_delete_range(c, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), + 0, NULL); + BUG_ON(ret); +} + +/* unit tests */ + +static int test_delete(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { + bch_err_msg(c, ret, "update error"); + goto err; + } + + pr_info("deleting once"); + ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: 
+ bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { + bch_err_msg(c, ret, "delete error (first)"); + goto err; + } + + pr_info("deleting twice"); + ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { + bch_err_msg(c, ret, "delete error (second)"); + goto err; + } +err: + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int test_delete_written(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + k.k.p.snapshot = U32_MAX; + + bch2_trans_init(&trans, c, 0, 0); + + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, k.k.p, + BTREE_ITER_INTENT); + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_trans_update(&trans, &iter, &k.k_i, 0)); + if (ret) { + bch_err_msg(c, ret, "update error"); + goto err; + } + + bch2_trans_unlock(&trans); + bch2_journal_flush_all_pins(&c->journal); + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_btree_iter_traverse(&iter) ?: + bch2_btree_delete_at(&trans, &iter, 0)); + if (ret) { + bch_err_msg(c, ret, "delete error"); + goto err; + } +err: + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int test_iterate(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i++) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i; + k.k.p.snapshot = U32_MAX; + + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { + bch_err_msg(c, ret, "insert error"); + goto err; + } + } + + pr_info("iterating forwards"); + + i = 0; + + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i++); + 0; + })); + if (ret) { + bch_err_msg(c, ret, "error iterating forwards"); + goto err; + } + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + + ret = for_each_btree_key_reverse(&trans, iter, BTREE_ID_xattrs, + SPOS(0, U64_MAX, U32_MAX), 0, k, + ({ + BUG_ON(k.k->p.offset != --i); + 0; + })); + if (ret) { + bch_err_msg(c, ret, "error iterating backwards"); + goto err; + } + + BUG_ON(i); +err: + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int test_iterate_extents(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + delete_test_keys(c); + + pr_info("inserting test extents"); + + for (i = 0; i < nr; i += 8) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i + 8; + k.k.p.snapshot = U32_MAX; + k.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { + bch_err_msg(c, ret, "insert error"); + goto err; + } + } + + pr_info("iterating forwards"); + + i = 0; + + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i); + i = k.k->p.offset; + 0; + })); + if (ret) { + bch_err_msg(c, ret, "error iterating forwards"); + goto err; + } + + BUG_ON(i != nr); + + pr_info("iterating backwards"); + + ret = for_each_btree_key_reverse(&trans, 
iter, BTREE_ID_extents, + SPOS(0, U64_MAX, U32_MAX), 0, k, + ({ + BUG_ON(k.k->p.offset != i); + i = bkey_start_offset(k.k); + 0; + })); + if (ret) { + bch_err_msg(c, ret, "error iterating backwards"); + goto err; + } + + BUG_ON(i); +err: + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int test_iterate_slots(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i++) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i * 2; + k.k.p.snapshot = U32_MAX; + + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &k.k_i, + NULL, NULL, 0); + if (ret) { + bch_err_msg(c, ret, "insert error"); + goto err; + } + } + + pr_info("iterating forwards"); + + i = 0; + + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(k.k->p.offset != i); + i += 2; + 0; + })); + if (ret) { + bch_err_msg(c, ret, "error iterating forwards"); + goto err; + } + + BUG_ON(i != nr * 2); + + pr_info("iterating forwards by slots"); + + i = 0; + + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i >= nr * 2) + break; + + BUG_ON(k.k->p.offset != i); + BUG_ON(bkey_deleted(k.k) != (i & 1)); + + i++; + 0; + })); + if (ret < 0) { + bch_err_msg(c, ret, "error iterating forwards by slots"); + goto err; + } + ret = 0; +err: + bch2_trans_exit(&trans); + return ret; +} + +static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter = { NULL }; + struct bkey_s_c k; + u64 i; + int ret = 0; + + bch2_trans_init(&trans, c, 0, 0); + + delete_test_keys(c); + + pr_info("inserting test keys"); + + for (i = 0; i < nr; i += 16) { + struct bkey_i_cookie k; + + bkey_cookie_init(&k.k_i); + k.k.p.offset = i + 16; + k.k.p.snapshot = U32_MAX; + k.k.size = 8; + + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) { + bch_err_msg(c, ret, "insert error"); + goto err; + } + } + + pr_info("iterating forwards"); + + i = 0; + + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, ({ + BUG_ON(bkey_start_offset(k.k) != i + 8); + BUG_ON(k.k->size != 8); + i += 16; + 0; + })); + if (ret) { + bch_err_msg(c, ret, "error iterating forwards"); + goto err; + } + + BUG_ON(i != nr); + + pr_info("iterating forwards by slots"); + + i = 0; + + ret = for_each_btree_key2_upto(&trans, iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + BTREE_ITER_SLOTS, k, ({ + if (i == nr) + break; + BUG_ON(bkey_deleted(k.k) != !(i % 16)); + + BUG_ON(bkey_start_offset(k.k) != i); + BUG_ON(k.k->size != 8); + i = k.k->p.offset; + 0; + })); + if (ret) { + bch_err_msg(c, ret, "error iterating forwards by slots"); + goto err; + } + ret = 0; +err: + bch2_trans_exit(&trans); + return 0; +} + +/* + * XXX: we really want to make sure we've got a btree with depth > 0 for these + * tests + */ +static int test_peek_end(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); + + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + 
BUG_ON(k.k); + + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return 0; +} + +static int test_peek_end_extents(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_extents, + SPOS(0, 0, U32_MAX), 0); + + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + BUG_ON(k.k); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return 0; +} + +/* extent unit tests */ + +static u64 test_version; + +static int insert_test_extent(struct bch_fs *c, + u64 start, u64 end) +{ + struct bkey_i_cookie k; + int ret; + + bkey_cookie_init(&k.k_i); + k.k_i.k.p.offset = end; + k.k_i.k.p.snapshot = U32_MAX; + k.k_i.k.size = end - start; + k.k_i.k.version.lo = test_version++; + + ret = bch2_btree_insert(c, BTREE_ID_extents, &k.k_i, + NULL, NULL, 0); + if (ret) + bch_err_fn(c, ret); + return ret; +} + +static int __test_extent_overwrite(struct bch_fs *c, + u64 e1_start, u64 e1_end, + u64 e2_start, u64 e2_end) +{ + int ret; + + ret = insert_test_extent(c, e1_start, e1_end) ?: + insert_test_extent(c, e2_start, e2_end); + + delete_test_keys(c); + return ret; +} + +static int test_extent_overwrite_front(struct bch_fs *c, u64 nr) +{ + return __test_extent_overwrite(c, 0, 64, 0, 32) ?: + __test_extent_overwrite(c, 8, 64, 0, 32); +} + +static int test_extent_overwrite_back(struct bch_fs *c, u64 nr) +{ + return __test_extent_overwrite(c, 0, 64, 32, 64) ?: + __test_extent_overwrite(c, 0, 64, 32, 72); +} + +static int test_extent_overwrite_middle(struct bch_fs *c, u64 nr) +{ + return __test_extent_overwrite(c, 0, 64, 32, 40); +} + +static int test_extent_overwrite_all(struct bch_fs *c, u64 nr) +{ + return __test_extent_overwrite(c, 32, 64, 0, 64) ?: + __test_extent_overwrite(c, 32, 64, 0, 128) ?: + __test_extent_overwrite(c, 32, 64, 32, 64) ?: + __test_extent_overwrite(c, 32, 64, 32, 128); +} + +/* snapshot unit tests */ + +/* Test skipping over keys in unrelated snapshots: */ +static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie cookie; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = snapid_hi; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, snapid_lo), 0); + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek_upto(&iter, POS(0, U64_MAX)))); + + BUG_ON(k.k->p.snapshot != U32_MAX); + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int test_snapshots(struct bch_fs *c, u64 nr) +{ + struct bkey_i_cookie cookie; + u32 snapids[2]; + u32 snapid_subvols[2] = { 1, 1 }; + int ret; + + bkey_cookie_init(&cookie.k_i); + cookie.k.p.snapshot = U32_MAX; + ret = bch2_btree_insert(c, BTREE_ID_xattrs, &cookie.k_i, + NULL, NULL, 0); + if (ret) + return ret; + + ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_snapshot_node_create(&trans, U32_MAX, + snapids, + snapid_subvols, + 2)); + if (ret) + return ret; + + if (snapids[0] > snapids[1]) + swap(snapids[0], snapids[1]); + + 
ret = test_snapshot_filter(c, snapids[0], snapids[1]); + if (ret) { + bch_err_msg(c, ret, "from test_snapshot_filter"); + return ret; + } + + return 0; +} + +/* perf tests */ + +static u64 test_rand(void) +{ + u64 v; + + get_random_bytes(&v, sizeof(v)); + return v; +} + +static int rand_insert(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct bkey_i_cookie k; + int ret = 0; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i++) { + bkey_cookie_init(&k.k_i); + k.k.p.offset = test_rand(); + k.k.p.snapshot = U32_MAX; + + ret = commit_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0)); + if (ret) + break; + } + + bch2_trans_exit(&trans); + return ret; +} + +static int rand_insert_multi(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct bkey_i_cookie k[8]; + int ret = 0; + unsigned j; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i += ARRAY_SIZE(k)) { + for (j = 0; j < ARRAY_SIZE(k); j++) { + bkey_cookie_init(&k[j].k_i); + k[j].k.p.offset = test_rand(); + k[j].k.p.snapshot = U32_MAX; + } + + ret = commit_do(&trans, NULL, NULL, 0, + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[0].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[1].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[2].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[3].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[4].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[5].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[6].k_i, 0) ?: + __bch2_btree_insert(&trans, BTREE_ID_xattrs, &k[7].k_i, 0)); + if (ret) + break; + } + + bch2_trans_exit(&trans); + return ret; +} + +static int rand_lookup(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { + bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + + lockrestart_do(&trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + ret = bkey_err(k); + if (ret) + break; + } + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int rand_mixed_trans(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_i_cookie *cookie, + u64 i, u64 pos) +{ + struct bkey_s_c k; + int ret; + + bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + + k = bch2_btree_iter_peek(iter); + ret = bkey_err(k); + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) + bch_err_msg(trans->c, ret, "lookup error"); + if (ret) + return ret; + + if (!(i & 3) && k.k) { + bkey_cookie_init(&cookie->k_i); + cookie->k.p = iter->pos; + ret = bch2_trans_update(trans, iter, &cookie->k_i, 0); + } + + return ret; +} + +static int rand_mixed(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + struct btree_iter iter; + struct bkey_i_cookie cookie; + int ret = 0; + u64 i, rand; + + bch2_trans_init(&trans, c, 0, 0); + bch2_trans_iter_init(&trans, &iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), 0); + + for (i = 0; i < nr; i++) { + rand = test_rand(); + ret = commit_do(&trans, NULL, NULL, 0, + rand_mixed_trans(&trans, &iter, &cookie, i, rand)); + if (ret) + break; + } + + bch2_trans_iter_exit(&trans, &iter); + bch2_trans_exit(&trans); + return ret; +} + +static int __do_delete(struct btree_trans *trans, struct bpos pos) +{ + struct btree_iter iter; + struct 
bkey_s_c k; + int ret = 0; + + bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, + BTREE_ITER_INTENT); + k = bch2_btree_iter_peek(&iter); + ret = bkey_err(k); + if (ret) + goto err; + + if (!k.k) + goto err; + + ret = bch2_btree_delete_at(trans, &iter, 0); +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int rand_delete(struct bch_fs *c, u64 nr) +{ + struct btree_trans trans; + int ret = 0; + u64 i; + + bch2_trans_init(&trans, c, 0, 0); + + for (i = 0; i < nr; i++) { + struct bpos pos = SPOS(0, test_rand(), U32_MAX); + + ret = commit_do(&trans, NULL, NULL, 0, + __do_delete(&trans, pos)); + if (ret) + break; + } + + bch2_trans_exit(&trans); + return ret; +} + +static int seq_insert(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct bkey_i_cookie insert; + + bkey_cookie_init(&insert.k_i); + + return bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_SLOTS|BTREE_ITER_INTENT, k, + NULL, NULL, 0, ({ + if (iter.pos.offset >= nr) + break; + insert.k.p = iter.pos; + bch2_trans_update(&trans, &iter, &insert.k_i, 0); + }))); +} + +static int seq_lookup(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + + return bch2_trans_run(c, + for_each_btree_key2_upto(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), POS(0, U64_MAX), + 0, k, + 0)); +} + +static int seq_overwrite(struct bch_fs *c, u64 nr) +{ + struct btree_iter iter; + struct bkey_s_c k; + + return bch2_trans_run(c, + for_each_btree_key_commit(&trans, iter, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + BTREE_ITER_INTENT, k, + NULL, NULL, 0, ({ + struct bkey_i_cookie u; + + bkey_reassemble(&u.k_i, k); + bch2_trans_update(&trans, &iter, &u.k_i, 0); + }))); +} + +static int seq_delete(struct bch_fs *c, u64 nr) +{ + return bch2_btree_delete_range(c, BTREE_ID_xattrs, + SPOS(0, 0, U32_MAX), + POS(0, U64_MAX), + 0, NULL); +} + +typedef int (*perf_test_fn)(struct bch_fs *, u64); + +struct test_job { + struct bch_fs *c; + u64 nr; + unsigned nr_threads; + perf_test_fn fn; + + atomic_t ready; + wait_queue_head_t ready_wait; + + atomic_t done; + struct completion done_completion; + + u64 start; + u64 finish; + int ret; +}; + +static int btree_perf_test_thread(void *data) +{ + struct test_job *j = data; + int ret; + + if (atomic_dec_and_test(&j->ready)) { + wake_up(&j->ready_wait); + j->start = sched_clock(); + } else { + wait_event(j->ready_wait, !atomic_read(&j->ready)); + } + + ret = j->fn(j->c, div64_u64(j->nr, j->nr_threads)); + if (ret) { + bch_err(j->c, "%ps: error %s", j->fn, bch2_err_str(ret)); + j->ret = ret; + } + + if (atomic_dec_and_test(&j->done)) { + j->finish = sched_clock(); + complete(&j->done_completion); + } + + return 0; +} + +int bch2_btree_perf_test(struct bch_fs *c, const char *testname, + u64 nr, unsigned nr_threads) +{ + struct test_job j = { .c = c, .nr = nr, .nr_threads = nr_threads }; + char name_buf[20]; + struct printbuf nr_buf = PRINTBUF; + struct printbuf per_sec_buf = PRINTBUF; + unsigned i; + u64 time; + + atomic_set(&j.ready, nr_threads); + init_waitqueue_head(&j.ready_wait); + + atomic_set(&j.done, nr_threads); + init_completion(&j.done_completion); + +#define perf_test(_test) \ + if (!strcmp(testname, #_test)) j.fn = _test + + perf_test(rand_insert); + perf_test(rand_insert_multi); + perf_test(rand_lookup); + perf_test(rand_mixed); + perf_test(rand_delete); + + perf_test(seq_insert); + perf_test(seq_lookup); + perf_test(seq_overwrite); + perf_test(seq_delete); + + /* a 
unit test, not a perf test: */ + perf_test(test_delete); + perf_test(test_delete_written); + perf_test(test_iterate); + perf_test(test_iterate_extents); + perf_test(test_iterate_slots); + perf_test(test_iterate_slots_extents); + perf_test(test_peek_end); + perf_test(test_peek_end_extents); + + perf_test(test_extent_overwrite_front); + perf_test(test_extent_overwrite_back); + perf_test(test_extent_overwrite_middle); + perf_test(test_extent_overwrite_all); + + perf_test(test_snapshots); + + if (!j.fn) { + pr_err("unknown test %s", testname); + return -EINVAL; + } + + //pr_info("running test %s:", testname); + + if (nr_threads == 1) + btree_perf_test_thread(&j); + else + for (i = 0; i < nr_threads; i++) + kthread_run(btree_perf_test_thread, &j, + "bcachefs perf test[%u]", i); + + while (wait_for_completion_interruptible(&j.done_completion)) + ; + + time = j.finish - j.start; + + scnprintf(name_buf, sizeof(name_buf), "%s:", testname); + prt_human_readable_u64(&nr_buf, nr); + prt_human_readable_u64(&per_sec_buf, div64_u64(nr * NSEC_PER_SEC, time)); + printk(KERN_INFO "%-12s %s with %u threads in %5llu sec, %5llu nsec per iter, %5s per sec\n", + name_buf, nr_buf.buf, nr_threads, + div_u64(time, NSEC_PER_SEC), + div_u64(time * nr_threads, nr), + per_sec_buf.buf); + printbuf_exit(&per_sec_buf); + printbuf_exit(&nr_buf); + return j.ret; +} + +#endif /* CONFIG_BCACHEFS_TESTS */ diff --git a/fs/bcachefs/tests.h b/fs/bcachefs/tests.h new file mode 100644 index 000000000..c73b18aea --- /dev/null +++ b/fs/bcachefs/tests.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_TEST_H +#define _BCACHEFS_TEST_H + +struct bch_fs; + +#ifdef CONFIG_BCACHEFS_TESTS + +int bch2_btree_perf_test(struct bch_fs *, const char *, u64, unsigned); + +#else + +#endif /* CONFIG_BCACHEFS_TESTS */ + +#endif /* _BCACHEFS_TEST_H */ diff --git a/fs/bcachefs/trace.c b/fs/bcachefs/trace.c new file mode 100644 index 000000000..d294b3d71 --- /dev/null +++ b/fs/bcachefs/trace.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "alloc_types.h" +#include "buckets.h" +#include "btree_cache.h" +#include "btree_iter.h" +#include "btree_locking.h" +#include "btree_update_interior.h" +#include "keylist.h" +#include "opts.h" + +#include +#include + +#define CREATE_TRACE_POINTS +#include "trace.h" diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h new file mode 100644 index 000000000..a743ab477 --- /dev/null +++ b/fs/bcachefs/trace.h @@ -0,0 +1,1247 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM bcachefs + +#if !defined(_TRACE_BCACHEFS_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_BCACHEFS_H + +#include + +#define TRACE_BPOS_entries(name) \ + __field(u64, name##_inode ) \ + __field(u64, name##_offset ) \ + __field(u32, name##_snapshot ) + +#define TRACE_BPOS_assign(dst, src) \ + __entry->dst##_inode = (src).inode; \ + __entry->dst##_offset = (src).offset; \ + __entry->dst##_snapshot = (src).snapshot + +DECLARE_EVENT_CLASS(bpos, + TP_PROTO(const struct bpos *p), + TP_ARGS(p), + + TP_STRUCT__entry( + TRACE_BPOS_entries(p) + ), + + TP_fast_assign( + TRACE_BPOS_assign(p, *p); + ), + + TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot) +); + +DECLARE_EVENT_CLASS(bkey, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k), + + TP_STRUCT__entry( + __string(k, k ) + ), + + TP_fast_assign( + __assign_str(k, k); + ), + + TP_printk("%s", __get_str(k)) +); + +DECLARE_EVENT_CLASS(btree_node, + 
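/*
 * DECLARE_EVENT_CLASS() describes the entry layout, assignment and output
 * format once; each DEFINE_EVENT() that names this class then creates a
 * real tracepoint sharing that definition, e.g. later in this file:
 *
 *	DEFINE_EVENT(btree_node, btree_node_read,
 *		TP_PROTO(struct bch_fs *c, struct btree *b),
 *		TP_ARGS(c, b)
 *	);
 *
 * which is emitted from C code as trace_btree_node_read(c, b).
 */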
TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u8, level ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->level = b->c.level; + __entry->btree_id = b->c.btree_id; + TRACE_BPOS_assign(pos, b->key.k.p); + ), + + TP_printk("%d,%d %u %s %llu:%llu:%u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->level, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot) +); + +DECLARE_EVENT_CLASS(bch_fs, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c), + + TP_STRUCT__entry( + __field(dev_t, dev ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + ), + + TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) +); + +DECLARE_EVENT_CLASS(bio, + TP_PROTO(struct bio *bio), + TP_ARGS(bio), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(sector_t, sector ) + __field(unsigned int, nr_sector ) + __array(char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = bio->bi_bdev ? bio_dev(bio) : 0; + __entry->sector = bio->bi_iter.bi_sector; + __entry->nr_sector = bio->bi_iter.bi_size >> 9; + blk_fill_rwbs(__entry->rwbs, bio->bi_opf); + ), + + TP_printk("%d,%d %s %llu + %u", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, __entry->nr_sector) +); + +/* super-io.c: */ +TRACE_EVENT(write_super, + TP_PROTO(struct bch_fs *c, unsigned long ip), + TP_ARGS(c, ip), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(unsigned long, ip ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->ip = ip; + ), + + TP_printk("%d,%d for %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + (void *) __entry->ip) +); + +/* io.c: */ + +DEFINE_EVENT(bio, read_promote, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_bounce, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_split, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_retry, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +DEFINE_EVENT(bio, read_reuse_race, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +/* Journal */ + +DEFINE_EVENT(bch_fs, journal_full, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, journal_entry_full, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bio, journal_write, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +TRACE_EVENT(journal_reclaim_start, + TP_PROTO(struct bch_fs *c, bool direct, bool kicked, + u64 min_nr, u64 min_key_cache, + u64 prereserved, u64 prereserved_total, + u64 btree_cache_dirty, u64 btree_cache_total, + u64 btree_key_cache_dirty, u64 btree_key_cache_total), + TP_ARGS(c, direct, kicked, min_nr, min_key_cache, prereserved, prereserved_total, + btree_cache_dirty, btree_cache_total, + btree_key_cache_dirty, btree_key_cache_total), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(bool, direct ) + __field(bool, kicked ) + __field(u64, min_nr ) + __field(u64, min_key_cache ) + __field(u64, prereserved ) + __field(u64, prereserved_total ) + __field(u64, btree_cache_dirty ) + __field(u64, btree_cache_total ) + __field(u64, btree_key_cache_dirty ) + __field(u64, btree_key_cache_total ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->direct = direct; + __entry->kicked = kicked; + __entry->min_nr = min_nr; + __entry->min_key_cache = min_key_cache; + __entry->prereserved = prereserved; + __entry->prereserved_total = prereserved_total; + __entry->btree_cache_dirty 
= btree_cache_dirty; + __entry->btree_cache_total = btree_cache_total; + __entry->btree_key_cache_dirty = btree_key_cache_dirty; + __entry->btree_key_cache_total = btree_key_cache_total; + ), + + TP_printk("%d,%d direct %u kicked %u min %llu key cache %llu prereserved %llu/%llu btree cache %llu/%llu key cache %llu/%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->direct, + __entry->kicked, + __entry->min_nr, + __entry->min_key_cache, + __entry->prereserved, + __entry->prereserved_total, + __entry->btree_cache_dirty, + __entry->btree_cache_total, + __entry->btree_key_cache_dirty, + __entry->btree_key_cache_total) +); + +TRACE_EVENT(journal_reclaim_finish, + TP_PROTO(struct bch_fs *c, u64 nr_flushed), + TP_ARGS(c, nr_flushed), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, nr_flushed ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->nr_flushed = nr_flushed; + ), + + TP_printk("%d,%d flushed %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->nr_flushed) +); + +/* bset.c: */ + +DEFINE_EVENT(bpos, bkey_pack_pos_fail, + TP_PROTO(const struct bpos *p), + TP_ARGS(p) +); + +/* Btree cache: */ + +TRACE_EVENT(btree_cache_scan, + TP_PROTO(long nr_to_scan, long can_free, long ret), + TP_ARGS(nr_to_scan, can_free, ret), + + TP_STRUCT__entry( + __field(long, nr_to_scan ) + __field(long, can_free ) + __field(long, ret ) + ), + + TP_fast_assign( + __entry->nr_to_scan = nr_to_scan; + __entry->can_free = can_free; + __entry->ret = ret; + ), + + TP_printk("scanned for %li nodes, can free %li, ret %li", + __entry->nr_to_scan, __entry->can_free, __entry->ret) +); + +DEFINE_EVENT(btree_node, btree_cache_reap, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +/* Btree */ + +DEFINE_EVENT(btree_node, btree_node_read, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_node_write, + TP_PROTO(struct btree *b, unsigned bytes, unsigned sectors), + TP_ARGS(b, bytes, sectors), + + TP_STRUCT__entry( + __field(enum btree_node_type, type) + __field(unsigned, bytes ) + __field(unsigned, sectors ) + ), + + TP_fast_assign( + __entry->type = btree_node_type(b); + __entry->bytes = bytes; + __entry->sectors = sectors; + ), + + TP_printk("bkey type %u bytes %u sectors %u", + __entry->type , __entry->bytes, __entry->sectors) +); + +DEFINE_EVENT(btree_node, btree_node_alloc, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_free, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_reserve_get_fail, + TP_PROTO(const char *trans_fn, + unsigned long caller_ip, + size_t required, + int ret), + TP_ARGS(trans_fn, caller_ip, required, ret), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(size_t, required ) + __array(char, ret, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans_fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->required = required; + strscpy(__entry->ret, bch2_err_str(ret), sizeof(__entry->ret)); + ), + + TP_printk("%s %pS required %zu ret 
%s", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->required, + __entry->ret) +); + +DEFINE_EVENT(btree_node, btree_node_compact, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_merge, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_split, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_rewrite, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +DEFINE_EVENT(btree_node, btree_node_set_root, + TP_PROTO(struct bch_fs *c, struct btree *b), + TP_ARGS(c, b) +); + +TRACE_EVENT(btree_path_relock_fail, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned level), + TP_ARGS(trans, caller_ip, path, level), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, level ) + TRACE_BPOS_entries(pos) + __array(char, node, 24 ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) + ), + + TP_fast_assign( + struct btree *b = btree_path_node(path, level); + + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = path->level; + TRACE_BPOS_assign(pos, path->pos); + if (IS_ERR(b)) + strscpy(__entry->node, bch2_err_str(PTR_ERR(b)), sizeof(__entry->node)); + else + scnprintf(__entry->node, sizeof(__entry->node), "%px", b); + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) + ? six_lock_seq(&path->l[level].b->c.lock) + : 0; + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u node %s iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->level, + __entry->node, + __entry->iter_lock_seq, + __entry->node_lock_seq) +); + +TRACE_EVENT(btree_path_upgrade_fail, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned level), + TP_ARGS(trans, caller_ip, path, level), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, level ) + TRACE_BPOS_entries(pos) + __field(u8, locked ) + __field(u8, self_read_count ) + __field(u8, self_intent_count) + __field(u8, read_count ) + __field(u8, intent_count ) + __field(u32, iter_lock_seq ) + __field(u32, node_lock_seq ) + ), + + TP_fast_assign( + struct six_lock_count c; + + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->level = level; + TRACE_BPOS_assign(pos, path->pos); + __entry->locked = btree_node_locked(path, level); + + c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level), + __entry->self_read_count = c.n[SIX_LOCK_read]; + __entry->self_intent_count = c.n[SIX_LOCK_intent]; + c = six_lock_counts(&path->l[level].b->c.lock); + __entry->read_count = c.n[SIX_LOCK_read]; + __entry->intent_count = c.n[SIX_LOCK_read]; + __entry->iter_lock_seq = path->l[level].lock_seq; + __entry->node_lock_seq = is_btree_node(path, level) + ? 
six_lock_seq(&path->l[level].b->c.lock) + : 0; + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u level %u locked %u held %u:%u lock count %u:%u iter seq %u lock seq %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->level, + __entry->locked, + __entry->self_read_count, + __entry->self_intent_count, + __entry->read_count, + __entry->intent_count, + __entry->iter_lock_seq, + __entry->node_lock_seq) +); + +/* Garbage collection */ + +DEFINE_EVENT(bch_fs, gc_gens_start, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +DEFINE_EVENT(bch_fs, gc_gens_end, + TP_PROTO(struct bch_fs *c), + TP_ARGS(c) +); + +/* Allocator */ + +DECLARE_EVENT_CLASS(bucket_alloc, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err), + + TP_STRUCT__entry( + __field(u8, dev ) + __array(char, reserve, 16 ) + __field(u64, bucket ) + __field(u64, free ) + __field(u64, avail ) + __field(u64, copygc_wait_amount ) + __field(s64, copygc_waiting_for ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, nouse ) + __field(bool, nonblocking ) + __field(u64, nocow ) + __array(char, err, 32 ) + ), + + TP_fast_assign( + __entry->dev = ca->dev_idx; + strscpy(__entry->reserve, alloc_reserve, sizeof(__entry->reserve)); + __entry->bucket = bucket; + __entry->free = free; + __entry->avail = avail; + __entry->copygc_wait_amount = copygc_wait_amount; + __entry->copygc_waiting_for = copygc_waiting_for; + __entry->seen = s->buckets_seen; + __entry->open = s->skipped_open; + __entry->need_journal_commit = s->skipped_need_journal_commit; + __entry->nouse = s->skipped_nouse; + __entry->nonblocking = nonblocking; + __entry->nocow = s->skipped_nocow; + strscpy(__entry->err, err, sizeof(__entry->err)); + ), + + TP_printk("reserve %s bucket %u:%llu free %llu avail %llu copygc_wait %llu/%lli seen %llu open %llu need_journal_commit %llu nouse %llu nocow %llu nonblocking %u err %s", + __entry->reserve, + __entry->dev, + __entry->bucket, + __entry->free, + __entry->avail, + __entry->copygc_wait_amount, + __entry->copygc_waiting_for, + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->nouse, + __entry->nocow, + __entry->nonblocking, + __entry->err) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err) +); + +DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, const char *alloc_reserve, + u64 bucket, + u64 free, + u64 avail, + u64 copygc_wait_amount, + s64 copygc_waiting_for, + struct bucket_alloc_state *s, + bool nonblocking, + const char *err), + TP_ARGS(ca, alloc_reserve, bucket, free, avail, + copygc_wait_amount, copygc_waiting_for, + s, nonblocking, err) +); + +TRACE_EVENT(discard_buckets, + TP_PROTO(struct bch_fs *c, u64 seen, u64 open, + u64 need_journal_commit, u64 discarded, const char *err), + TP_ARGS(c, seen, open, 
need_journal_commit, discarded, err), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, seen ) + __field(u64, open ) + __field(u64, need_journal_commit ) + __field(u64, discarded ) + __array(char, err, 16 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->seen = seen; + __entry->open = open; + __entry->need_journal_commit = need_journal_commit; + __entry->discarded = discarded; + strscpy(__entry->err, err, sizeof(__entry->err)); + ), + + TP_printk("%d%d seen %llu open %llu need_journal_commit %llu discarded %llu err %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->seen, + __entry->open, + __entry->need_journal_commit, + __entry->discarded, + __entry->err) +); + +TRACE_EVENT(bucket_invalidate, + TP_PROTO(struct bch_fs *c, unsigned dev, u64 bucket, u32 sectors), + TP_ARGS(c, dev, bucket, sectors), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, dev_idx ) + __field(u32, sectors ) + __field(u64, bucket ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->dev_idx = dev; + __entry->sectors = sectors; + __entry->bucket = bucket; + ), + + TP_printk("%d:%d invalidated %u:%llu cached sectors %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dev_idx, __entry->bucket, + __entry->sectors) +); + +/* Moving IO */ + +TRACE_EVENT(bucket_evacuate, + TP_PROTO(struct bch_fs *c, struct bpos *bucket), + TP_ARGS(c, bucket), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u32, dev_idx ) + __field(u64, bucket ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->dev_idx = bucket->inode; + __entry->bucket = bucket->offset; + ), + + TP_printk("%d:%d %u:%llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->dev_idx, __entry->bucket) +); + +DEFINE_EVENT(bkey, move_extent, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + +DEFINE_EVENT(bkey, move_extent_read, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + +DEFINE_EVENT(bkey, move_extent_write, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + +DEFINE_EVENT(bkey, move_extent_finish, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + +TRACE_EVENT(move_extent_fail, + TP_PROTO(struct bch_fs *c, const char *msg), + TP_ARGS(c, msg), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __string(msg, msg ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __assign_str(msg, msg); + ), + + TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg)) +); + +DEFINE_EVENT(bkey, move_extent_alloc_mem_fail, + TP_PROTO(struct bch_fs *c, const char *k), + TP_ARGS(c, k) +); + +TRACE_EVENT(move_data, + TP_PROTO(struct bch_fs *c, u64 sectors_moved, + u64 keys_moved), + TP_ARGS(c, sectors_moved, keys_moved), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, sectors_moved ) + __field(u64, keys_moved ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->sectors_moved = sectors_moved; + __entry->keys_moved = keys_moved; + ), + + TP_printk("%d,%d sectors_moved %llu keys_moved %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->sectors_moved, __entry->keys_moved) +); + +TRACE_EVENT(evacuate_bucket, + TP_PROTO(struct bch_fs *c, struct bpos *bucket, + unsigned sectors, unsigned bucket_size, + u64 fragmentation, int ret), + TP_ARGS(c, bucket, sectors, bucket_size, fragmentation, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, member ) + __field(u64, bucket ) + __field(u32, sectors ) + __field(u32, bucket_size ) + __field(u64, fragmentation ) + __field(int, ret ) + ), + + 
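/*
 * As with every event in this header, this is a regular tracepoint under
 * the "bcachefs" trace system, so it can be enabled at runtime through
 * tracefs (paths assuming the default tracefs mount point):
 *
 *	echo 1 > /sys/kernel/tracing/events/bcachefs/evacuate_bucket/enable
 *	cat /sys/kernel/tracing/trace_pipe
 */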
TP_fast_assign( + __entry->dev = c->dev; + __entry->member = bucket->inode; + __entry->bucket = bucket->offset; + __entry->sectors = sectors; + __entry->bucket_size = bucket_size; + __entry->fragmentation = fragmentation; + __entry->ret = ret; + ), + + TP_printk("%d,%d %llu:%llu sectors %u/%u fragmentation %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->member, __entry->bucket, + __entry->sectors, __entry->bucket_size, + __entry->fragmentation, __entry->ret) +); + +TRACE_EVENT(copygc, + TP_PROTO(struct bch_fs *c, + u64 sectors_moved, u64 sectors_not_moved, + u64 buckets_moved, u64 buckets_not_moved), + TP_ARGS(c, + sectors_moved, sectors_not_moved, + buckets_moved, buckets_not_moved), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, sectors_moved ) + __field(u64, sectors_not_moved ) + __field(u64, buckets_moved ) + __field(u64, buckets_not_moved ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->sectors_moved = sectors_moved; + __entry->sectors_not_moved = sectors_not_moved; + __entry->buckets_moved = buckets_moved; + __entry->buckets_not_moved = buckets_moved; + ), + + TP_printk("%d,%d sectors moved %llu remain %llu buckets moved %llu remain %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->sectors_moved, __entry->sectors_not_moved, + __entry->buckets_moved, __entry->buckets_not_moved) +); + +TRACE_EVENT(copygc_wait, + TP_PROTO(struct bch_fs *c, + u64 wait_amount, u64 until), + TP_ARGS(c, wait_amount, until), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(u64, wait_amount ) + __field(u64, until ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->wait_amount = wait_amount; + __entry->until = until; + ), + + TP_printk("%d,%u waiting for %llu sectors until %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->wait_amount, __entry->until) +); + +/* btree transactions: */ + +DECLARE_EVENT_CLASS(transaction_event, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + ), + + TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip) +); + +DEFINE_EVENT(transaction_event, transaction_commit, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_injected, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(trans_restart_split_race, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree *b), + TP_ARGS(trans, caller_ip, b), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, level ) + __field(u16, written ) + __field(u16, blocks ) + __field(u16, u64s_remaining ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->level = b->c.level; + __entry->written = b->written; + __entry->blocks = btree_blocks(trans->c); + __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b); + ), + + TP_printk("%s %pS l=%u written %u/%u u64s remaining %u", + __entry->trans_fn, (void *) __entry->caller_ip, + __entry->level, + __entry->written, __entry->blocks, + __entry->u64s_remaining) +); + +DEFINE_EVENT(transaction_event, trans_blocked_journal_reclaim, + 
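/*
 * The trans_restart_* events in this file fire when a btree transaction
 * hits a condition that forces it to drop its locks and retry from the
 * top.  Callers run transactions inside a retry loop; the unit tests in
 * tests.c earlier in this patch do so with helpers such as commit_do()
 * and lockrestart_do(), e.g.:
 *
 *	ret = commit_do(&trans, NULL, NULL, 0,
 *			__bch2_btree_insert(&trans, BTREE_ID_xattrs, &k.k_i, 0));
 *
 * Giving each restart reason its own event makes it possible to see from
 * a trace why a workload is restarting excessively.
 */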
TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(trans_restart_journal_preres_get, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + unsigned flags), + TP_ARGS(trans, caller_ip, flags), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(unsigned, flags ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->flags = flags; + ), + + TP_printk("%s %pS %x", __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->flags) +); + +DEFINE_EVENT(transaction_event, trans_restart_fault_inject, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_traverse_all, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_too_many_iters, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DECLARE_EVENT_CLASS(transaction_restart_iter, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos) + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_reused, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +TRACE_EVENT(trans_restart_upgrade, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned old_locks_want, + unsigned new_locks_want), + TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(u8, btree_id ) + __field(u8, old_locks_want ) + __field(u8, new_locks_want ) + TRACE_BPOS_entries(pos) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->btree_id = path->btree_id; + __entry->old_locks_want = old_locks_want; + __entry->new_locks_want = new_locks_want; + TRACE_BPOS_assign(pos, path->pos) + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->old_locks_want, + __entry->new_locks_want) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock, + TP_PROTO(struct 
btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_parent_for_fill, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_after_fill, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_event, trans_restart_key_cache_upgrade, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_key_cache_fill, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_path_intent, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_traverse, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path), + TP_ARGS(trans, caller_ip, path) +); + +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(trans_restart_would_deadlock_write, + TP_PROTO(struct btree_trans *trans), + TP_ARGS(trans), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + ), + + TP_printk("%s", __entry->trans_fn) +); + +TRACE_EVENT(trans_restart_mem_realloced, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + unsigned long bytes), + TP_ARGS(trans, caller_ip, bytes), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(unsigned long, bytes ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + __entry->bytes = bytes; + ), + + TP_printk("%s %pS bytes %lu", + __entry->trans_fn, + (void *) __entry->caller_ip, + __entry->bytes) +); + +TRACE_EVENT(trans_restart_key_cache_key_realloced, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip, + struct btree_path *path, + unsigned old_u64s, + unsigned new_u64s), + TP_ARGS(trans, caller_ip, path, old_u64s, new_u64s), + + TP_STRUCT__entry( + __array(char, trans_fn, 32 ) + __field(unsigned long, caller_ip ) + __field(enum btree_id, btree_id ) + TRACE_BPOS_entries(pos) + __field(u32, old_u64s ) + 
__field(u32, new_u64s ) + ), + + TP_fast_assign( + strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); + __entry->caller_ip = caller_ip; + + __entry->btree_id = path->btree_id; + TRACE_BPOS_assign(pos, path->pos); + __entry->old_u64s = old_u64s; + __entry->new_u64s = new_u64s; + ), + + TP_printk("%s %pS btree %s pos %llu:%llu:%u old_u64s %u new_u64s %u", + __entry->trans_fn, + (void *) __entry->caller_ip, + bch2_btree_ids[__entry->btree_id], + __entry->pos_inode, + __entry->pos_offset, + __entry->pos_snapshot, + __entry->old_u64s, + __entry->new_u64s) +); + +DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush, + TP_PROTO(struct btree_trans *trans, + unsigned long caller_ip), + TP_ARGS(trans, caller_ip) +); + +TRACE_EVENT(write_buffer_flush, + TP_PROTO(struct btree_trans *trans, size_t nr, size_t skipped, size_t fast, size_t size), + TP_ARGS(trans, nr, skipped, fast, size), + + TP_STRUCT__entry( + __field(size_t, nr ) + __field(size_t, skipped ) + __field(size_t, fast ) + __field(size_t, size ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->skipped = skipped; + __entry->fast = fast; + __entry->size = size; + ), + + TP_printk("%zu/%zu skipped %zu fast %zu", + __entry->nr, __entry->size, __entry->skipped, __entry->fast) +); + +TRACE_EVENT(write_buffer_flush_slowpath, + TP_PROTO(struct btree_trans *trans, size_t nr, size_t size), + TP_ARGS(trans, nr, size), + + TP_STRUCT__entry( + __field(size_t, nr ) + __field(size_t, size ) + ), + + TP_fast_assign( + __entry->nr = nr; + __entry->size = size; + ), + + TP_printk("%zu/%zu", __entry->nr, __entry->size) +); + +#endif /* _TRACE_BCACHEFS_H */ + +/* This part must be outside protection */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH ../../fs/bcachefs + +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE trace + +#include diff --git a/fs/bcachefs/two_state_shared_lock.c b/fs/bcachefs/two_state_shared_lock.c new file mode 100644 index 000000000..9764c2e6a --- /dev/null +++ b/fs/bcachefs/two_state_shared_lock.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "two_state_shared_lock.h" + +void __bch2_two_state_lock(two_state_lock_t *lock, int s) +{ + __wait_event(lock->wait, bch2_two_state_trylock(lock, s)); +} diff --git a/fs/bcachefs/two_state_shared_lock.h b/fs/bcachefs/two_state_shared_lock.h new file mode 100644 index 000000000..905801772 --- /dev/null +++ b/fs/bcachefs/two_state_shared_lock.h @@ -0,0 +1,59 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_TWO_STATE_LOCK_H +#define _BCACHEFS_TWO_STATE_LOCK_H + +#include +#include +#include + +#include "util.h" + +/* + * Two-state lock - can be taken for add or block - both states are shared, + * like read side of rwsem, but conflict with other state: + */ +typedef struct { + atomic_long_t v; + wait_queue_head_t wait; +} two_state_lock_t; + +static inline void two_state_lock_init(two_state_lock_t *lock) +{ + atomic_long_set(&lock->v, 0); + init_waitqueue_head(&lock->wait); +} + +static inline void bch2_two_state_unlock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + + EBUG_ON(atomic_long_read(&lock->v) == 0); + + if (atomic_long_sub_return_release(i, &lock->v) == 0) + wake_up_all(&lock->wait); +} + +static inline bool bch2_two_state_trylock(two_state_lock_t *lock, int s) +{ + long i = s ? 1 : -1; + long v = atomic_long_read(&lock->v), old; + + do { + old = v; + + if (i > 0 ? 
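/*
 * Invariant for the counter: v > 0 means the lock is held in state 1 by v
 * holders, v < 0 means it is held in state 0 by -v holders, v == 0 means
 * unlocked.  Both states are shared, so acquiring only has to fail while
 * holders of the *other* state exist, which is exactly the check being
 * made here.  Typical usage is a plain lock/unlock pair in one state
 * (sketch):
 *
 *	bch2_two_state_lock(&lock, 0);
 *	...
 *	bch2_two_state_unlock(&lock, 0);
 */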
v < 0 : v > 0) + return false; + } while ((v = atomic_long_cmpxchg_acquire(&lock->v, + old, old + i)) != old); + return true; +} + +void __bch2_two_state_lock(two_state_lock_t *, int); + +static inline void bch2_two_state_lock(two_state_lock_t *lock, int s) +{ + if (!bch2_two_state_trylock(lock, s)) + __bch2_two_state_lock(lock, s); +} + +#endif /* _BCACHEFS_TWO_STATE_LOCK_H */ diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c new file mode 100644 index 000000000..ae4f6de3c --- /dev/null +++ b/fs/bcachefs/util.c @@ -0,0 +1,1137 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * random utiility code, for bcache but in theory not specific to bcache + * + * Copyright 2010, 2011 Kent Overstreet + * Copyright 2012 Google, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "eytzinger.h" +#include "util.h" + +static const char si_units[] = "?kMGTPEZY"; + +/* string_get_size units: */ +static const char *const units_2[] = { + "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" +}; +static const char *const units_10[] = { + "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" +}; + +static int parse_u64(const char *cp, u64 *res) +{ + const char *start = cp; + u64 v = 0; + + if (!isdigit(*cp)) + return -EINVAL; + + do { + if (v > U64_MAX / 10) + return -ERANGE; + v *= 10; + if (v > U64_MAX - (*cp - '0')) + return -ERANGE; + v += *cp - '0'; + cp++; + } while (isdigit(*cp)); + + *res = v; + return cp - start; +} + +static int bch2_pow(u64 n, u64 p, u64 *res) +{ + *res = 1; + + while (p--) { + if (*res > div_u64(U64_MAX, n)) + return -ERANGE; + *res *= n; + } + return 0; +} + +static int parse_unit_suffix(const char *cp, u64 *res) +{ + const char *start = cp; + u64 base = 1024; + unsigned u; + int ret; + + if (*cp == ' ') + cp++; + + for (u = 1; u < strlen(si_units); u++) + if (*cp == si_units[u]) { + cp++; + goto got_unit; + } + + for (u = 0; u < ARRAY_SIZE(units_2); u++) + if (!strncmp(cp, units_2[u], strlen(units_2[u]))) { + cp += strlen(units_2[u]); + goto got_unit; + } + + for (u = 0; u < ARRAY_SIZE(units_10); u++) + if (!strncmp(cp, units_10[u], strlen(units_10[u]))) { + cp += strlen(units_10[u]); + base = 1000; + goto got_unit; + } + + *res = 1; + return 0; +got_unit: + ret = bch2_pow(base, u, res); + if (ret) + return ret; + + return cp - start; +} + +#define parse_or_ret(cp, _f) \ +do { \ + int ret = _f; \ + if (ret < 0) \ + return ret; \ + cp += ret; \ +} while (0) + +static int __bch2_strtou64_h(const char *cp, u64 *res) +{ + const char *start = cp; + u64 v = 0, b, f_n = 0, f_d = 1; + int ret; + + parse_or_ret(cp, parse_u64(cp, &v)); + + if (*cp == '.') { + cp++; + ret = parse_u64(cp, &f_n); + if (ret < 0) + return ret; + cp += ret; + + ret = bch2_pow(10, ret, &f_d); + if (ret) + return ret; + } + + parse_or_ret(cp, parse_unit_suffix(cp, &b)); + + if (v > div_u64(U64_MAX, b)) + return -ERANGE; + v *= b; + + if (f_n > div_u64(U64_MAX, b)) + return -ERANGE; + + f_n = div_u64(f_n * b, f_d); + if (v + f_n < v) + return -ERANGE; + v += f_n; + + *res = v; + return cp - start; +} + +static int __bch2_strtoh(const char *cp, u64 *res, + u64 t_max, bool t_signed) +{ + bool positive = *cp != '-'; + u64 v = 0; + + if (*cp == '+' || *cp == '-') + cp++; + + parse_or_ret(cp, __bch2_strtou64_h(cp, &v)); + + if (*cp == '\n') + cp++; + if (*cp) + return -EINVAL; + + if (positive) { + if (v > t_max) + return -ERANGE; + } else { + if (v && !t_signed) + return 
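/*
 * Worked example for the helpers above: parsing "1.5M" with
 * bch2_strtou64_h() (one of the STRTO_H() wrappers defined below) gives
 *
 *	integer part "1"   ->  v   = 1
 *	fraction     ".5"  ->  f_n = 5, f_d = 10
 *	suffix       "M"   ->  b   = 1024 * 1024
 *	result: 1 * b + (5 * b) / 10 = 1048576 + 524288 = 1572864
 */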
-ERANGE; + + if (v > t_max + 1) + return -ERANGE; + v = -v; + } + + *res = v; + return 0; +} + +#define STRTO_H(name, type) \ +int bch2_ ## name ## _h(const char *cp, type *res) \ +{ \ + u64 v = 0; \ + int ret = __bch2_strtoh(cp, &v, ANYSINT_MAX(type), \ + ANYSINT_MAX(type) != ((type) ~0ULL)); \ + *res = v; \ + return ret; \ +} + +STRTO_H(strtoint, int) +STRTO_H(strtouint, unsigned int) +STRTO_H(strtoll, long long) +STRTO_H(strtoull, unsigned long long) +STRTO_H(strtou64, u64) + +u64 bch2_read_flag_list(char *opt, const char * const list[]) +{ + u64 ret = 0; + char *p, *s, *d = kstrdup(opt, GFP_KERNEL); + + if (!d) + return -ENOMEM; + + s = strim(d); + + while ((p = strsep(&s, ","))) { + int flag = match_string(list, -1, p); + if (flag < 0) { + ret = -1; + break; + } + + ret |= 1 << flag; + } + + kfree(d); + + return ret; +} + +bool bch2_is_zero(const void *_p, size_t n) +{ + const char *p = _p; + size_t i; + + for (i = 0; i < n; i++) + if (p[i]) + return false; + return true; +} + +void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits) +{ + while (nr_bits) + prt_char(out, '0' + ((v >> --nr_bits) & 1)); +} + +void bch2_print_string_as_lines(const char *prefix, const char *lines) +{ + const char *p; + + if (!lines) { + printk("%s (null)\n", prefix); + return; + } + + console_lock(); + while (1) { + p = strchrnul(lines, '\n'); + printk("%s%.*s\n", prefix, (int) (p - lines), lines); + if (!*p) + break; + lines = p + 1; + } + console_unlock(); +} + +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task) +{ + unsigned nr_entries = 0; + int ret = 0; + + stack->nr = 0; + ret = darray_make_room(stack, 32); + if (ret) + return ret; + + if (!down_read_trylock(&task->signal->exec_update_lock)) + return -1; + + do { + nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0); + } while (nr_entries == stack->size && + !(ret = darray_make_room(stack, stack->size * 2))); + + stack->nr = nr_entries; + up_read(&task->signal->exec_update_lock); + + return ret; +} + +void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack) +{ + unsigned long *i; + + darray_for_each(*stack, i) { + prt_printf(out, "[<0>] %pB", (void *) *i); + prt_newline(out); + } +} + +int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task) +{ + bch_stacktrace stack = { 0 }; + int ret = bch2_save_backtrace(&stack, task); + + bch2_prt_backtrace(out, &stack); + darray_exit(&stack); + return ret; +} + +/* time stats: */ + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +static void bch2_quantiles_update(struct bch2_quantiles *q, u64 v) +{ + unsigned i = 0; + + while (i < ARRAY_SIZE(q->entries)) { + struct bch2_quantile_entry *e = q->entries + i; + + if (unlikely(!e->step)) { + e->m = v; + e->step = max_t(unsigned, v / 2, 1024); + } else if (e->m > v) { + e->m = e->m >= e->step + ? e->m - e->step + : 0; + } else if (e->m < v) { + e->m = e->m + e->step > e->m + ? e->m + e->step + : U32_MAX; + } + + if ((e->m > v ? 
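/*
 * The quantity compared here is |e->m - v| against e->step: each entry
 * keeps a running estimate e->m that is nudged towards incoming samples
 * in steps of e->step, and once a sample lands within one step of the
 * estimate the step is halved (never below 1), so the estimates settle
 * without the quantile code ever having to store past samples.
 */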
e->m - v : v - e->m) < e->step) + e->step = max_t(unsigned, e->step / 2, 1); + + if (v >= e->m) + break; + + i = eytzinger0_child(i, v > e->m); + } +} + +static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats, + u64 start, u64 end) +{ + u64 duration, freq; + + if (time_after64(end, start)) { + duration = end - start; + mean_and_variance_update(&stats->duration_stats, duration); + mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration); + stats->max_duration = max(stats->max_duration, duration); + stats->min_duration = min(stats->min_duration, duration); + bch2_quantiles_update(&stats->quantiles, duration); + } + + if (time_after64(end, stats->last_event)) { + freq = end - stats->last_event; + mean_and_variance_update(&stats->freq_stats, freq); + mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq); + stats->max_freq = max(stats->max_freq, freq); + stats->min_freq = min(stats->min_freq, freq); + stats->last_event = end; + } +} + +static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats, + struct bch2_time_stat_buffer *b) +{ + struct bch2_time_stat_buffer_entry *i; + unsigned long flags; + + spin_lock_irqsave(&stats->lock, flags); + for (i = b->entries; + i < b->entries + ARRAY_SIZE(b->entries); + i++) + bch2_time_stats_update_one(stats, i->start, i->end); + spin_unlock_irqrestore(&stats->lock, flags); + + b->nr = 0; +} + +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) +{ + unsigned long flags; + + WARN_RATELIMIT(!stats->min_duration || !stats->min_freq, + "time_stats: min_duration = %llu, min_freq = %llu", + stats->min_duration, stats->min_freq); + + if (!stats->buffer) { + spin_lock_irqsave(&stats->lock, flags); + bch2_time_stats_update_one(stats, start, end); + + if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted) < 32 && + stats->duration_stats.n > 1024) + stats->buffer = + alloc_percpu_gfp(struct bch2_time_stat_buffer, + GFP_ATOMIC); + spin_unlock_irqrestore(&stats->lock, flags); + } else { + struct bch2_time_stat_buffer *b; + + preempt_disable(); + b = this_cpu_ptr(stats->buffer); + + BUG_ON(b->nr >= ARRAY_SIZE(b->entries)); + b->entries[b->nr++] = (struct bch2_time_stat_buffer_entry) { + .start = start, + .end = end + }; + + if (unlikely(b->nr == ARRAY_SIZE(b->entries))) + bch2_time_stats_clear_buffer(stats, b); + preempt_enable(); + } +} +#endif + +static const struct time_unit { + const char *name; + u64 nsecs; +} time_units[] = { + { "ns", 1 }, + { "us", NSEC_PER_USEC }, + { "ms", NSEC_PER_MSEC }, + { "s", NSEC_PER_SEC }, + { "m", (u64) NSEC_PER_SEC * 60}, + { "h", (u64) NSEC_PER_SEC * 3600}, + { "eon", U64_MAX }, +}; + +static const struct time_unit *pick_time_units(u64 ns) +{ + const struct time_unit *u; + + for (u = time_units; + u + 1 < time_units + ARRAY_SIZE(time_units) && + ns >= u[1].nsecs << 1; + u++) + ; + + return u; +} + +void bch2_pr_time_units(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name); +} + +static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns) +{ + const struct time_unit *u = pick_time_units(ns); + + prt_printf(out, "%llu ", div64_u64(ns, u->nsecs)); + prt_tab_rjust(out); + prt_printf(out, "%s", u->name); +} + +#define TABSTOP_SIZE 12 + +static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns) +{ + prt_str(out, name); + prt_tab(out); + bch2_pr_time_units_aligned(out, ns); + prt_newline(out); 
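/*
 * pick_time_units() only moves to a coarser unit once the value is at
 * least twice that unit, so the integer division in the helpers above
 * never prints a bare "0" or "1" when a finer unit would read better.
 * For example, with nanoseconds as input:
 *
 *	bch2_pr_time_units(out,    1500)  ->  "1500 ns"
 *	bch2_pr_time_units(out, 1500000)  ->  "1500 us"
 *	bch2_pr_time_units(out, 2000000)  ->  "2 ms"
 */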
+} + +void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) +{ + const struct time_unit *u; + s64 f_mean = 0, d_mean = 0; + u64 q, last_q = 0, f_stddev = 0, d_stddev = 0; + int i; + /* + * avoid divide by zero + */ + if (stats->freq_stats.n) { + f_mean = mean_and_variance_get_mean(stats->freq_stats); + f_stddev = mean_and_variance_get_stddev(stats->freq_stats); + d_mean = mean_and_variance_get_mean(stats->duration_stats); + d_stddev = mean_and_variance_get_stddev(stats->duration_stats); + } + + printbuf_tabstop_push(out, out->indent + TABSTOP_SIZE); + prt_printf(out, "count:"); + prt_tab(out); + prt_printf(out, "%llu ", + stats->duration_stats.n); + printbuf_tabstop_pop(out); + prt_newline(out); + + printbuf_tabstops_reset(out); + + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + printbuf_tabstop_push(out, 0); + printbuf_tabstop_push(out, TABSTOP_SIZE + 2); + + prt_tab(out); + prt_printf(out, "since mount"); + prt_tab_rjust(out); + prt_tab(out); + prt_printf(out, "recent"); + prt_tab_rjust(out); + prt_newline(out); + + printbuf_tabstops_reset(out); + printbuf_tabstop_push(out, out->indent + 20); + printbuf_tabstop_push(out, TABSTOP_SIZE); + printbuf_tabstop_push(out, 2); + printbuf_tabstop_push(out, TABSTOP_SIZE); + + prt_printf(out, "duration of events"); + prt_newline(out); + printbuf_indent_add(out, 2); + + pr_name_and_units(out, "min:", stats->min_duration); + pr_name_and_units(out, "max:", stats->max_duration); + + prt_printf(out, "mean:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, d_mean); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->duration_stats_weighted)); + prt_newline(out); + + prt_printf(out, "stddev:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, d_stddev); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->duration_stats_weighted)); + + printbuf_indent_sub(out, 2); + prt_newline(out); + + prt_printf(out, "time between events"); + prt_newline(out); + printbuf_indent_add(out, 2); + + pr_name_and_units(out, "min:", stats->min_freq); + pr_name_and_units(out, "max:", stats->max_freq); + + prt_printf(out, "mean:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, f_mean); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_mean(stats->freq_stats_weighted)); + prt_newline(out); + + prt_printf(out, "stddev:"); + prt_tab(out); + bch2_pr_time_units_aligned(out, f_stddev); + prt_tab(out); + bch2_pr_time_units_aligned(out, mean_and_variance_weighted_get_stddev(stats->freq_stats_weighted)); + + printbuf_indent_sub(out, 2); + prt_newline(out); + + printbuf_tabstops_reset(out); + + i = eytzinger0_first(NR_QUANTILES); + u = pick_time_units(stats->quantiles.entries[i].m); + + prt_printf(out, "quantiles (%s):\t", u->name); + eytzinger0_for_each(i, NR_QUANTILES) { + bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + + q = max(stats->quantiles.entries[i].m, last_q); + prt_printf(out, "%llu ", + div_u64(q, u->nsecs)); + if (is_last) + prt_newline(out); + last_q = q; + } +} + +void bch2_time_stats_exit(struct bch2_time_stats *stats) +{ + free_percpu(stats->buffer); +} + +void bch2_time_stats_init(struct bch2_time_stats *stats) +{ + memset(stats, 0, sizeof(*stats)); + stats->duration_stats_weighted.weight = 8; + stats->freq_stats_weighted.weight = 8; + stats->min_duration = U64_MAX; + stats->min_freq = U64_MAX; + spin_lock_init(&stats->lock); +} + +/* ratelimit: */ + +/** + * 
bch2_ratelimit_delay() - return how long to delay until the next time to do + * some work + * + * @d - the struct bch_ratelimit to update + * + * Returns the amount of time to delay by, in jiffies + */ +u64 bch2_ratelimit_delay(struct bch_ratelimit *d) +{ + u64 now = local_clock(); + + return time_after64(d->next, now) + ? nsecs_to_jiffies(d->next - now) + : 0; +} + +/** + * bch2_ratelimit_increment() - increment @d by the amount of work done + * + * @d - the struct bch_ratelimit to update + * @done - the amount of work done, in arbitrary units + */ +void bch2_ratelimit_increment(struct bch_ratelimit *d, u64 done) +{ + u64 now = local_clock(); + + d->next += div_u64(done * NSEC_PER_SEC, d->rate); + + if (time_before64(now + NSEC_PER_SEC, d->next)) + d->next = now + NSEC_PER_SEC; + + if (time_after64(now - NSEC_PER_SEC * 2, d->next)) + d->next = now - NSEC_PER_SEC * 2; +} + +/* pd controller: */ + +/* + * Updates pd_controller. Attempts to scale inputed values to units per second. + * @target: desired value + * @actual: current value + * + * @sign: 1 or -1; 1 if increasing the rate makes actual go up, -1 if increasing + * it makes actual go down. + */ +void bch2_pd_controller_update(struct bch_pd_controller *pd, + s64 target, s64 actual, int sign) +{ + s64 proportional, derivative, change; + + unsigned long seconds_since_update = (jiffies - pd->last_update) / HZ; + + if (seconds_since_update == 0) + return; + + pd->last_update = jiffies; + + proportional = actual - target; + proportional *= seconds_since_update; + proportional = div_s64(proportional, pd->p_term_inverse); + + derivative = actual - pd->last_actual; + derivative = div_s64(derivative, seconds_since_update); + derivative = ewma_add(pd->smoothed_derivative, derivative, + (pd->d_term / seconds_since_update) ?: 1); + derivative = derivative * pd->d_term; + derivative = div_s64(derivative, pd->p_term_inverse); + + change = proportional + derivative; + + /* Don't increase rate if not keeping up */ + if (change > 0 && + pd->backpressure && + time_after64(local_clock(), + pd->rate.next + NSEC_PER_MSEC)) + change = 0; + + change *= (sign * -1); + + pd->rate.rate = clamp_t(s64, (s64) pd->rate.rate + change, + 1, UINT_MAX); + + pd->last_actual = actual; + pd->last_derivative = derivative; + pd->last_proportional = proportional; + pd->last_change = change; + pd->last_target = target; +} + +void bch2_pd_controller_init(struct bch_pd_controller *pd) +{ + pd->rate.rate = 1024; + pd->last_update = jiffies; + pd->p_term_inverse = 6000; + pd->d_term = 30; + pd->d_smooth = pd->d_term; + pd->backpressure = 1; +} + +void bch2_pd_controller_debug_to_text(struct printbuf *out, struct bch_pd_controller *pd) +{ + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); + + prt_printf(out, "rate:"); + prt_tab(out); + prt_human_readable_s64(out, pd->rate.rate); + prt_newline(out); + + prt_printf(out, "target:"); + prt_tab(out); + prt_human_readable_u64(out, pd->last_target); + prt_newline(out); + + prt_printf(out, "actual:"); + prt_tab(out); + prt_human_readable_u64(out, pd->last_actual); + prt_newline(out); + + prt_printf(out, "proportional:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_proportional); + prt_newline(out); + + prt_printf(out, "derivative:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_derivative); + prt_newline(out); + + prt_printf(out, "change:"); + prt_tab(out); + prt_human_readable_s64(out, pd->last_change); + prt_newline(out); + + prt_printf(out, "next io:"); + prt_tab(out); + prt_printf(out, "%llims", 
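/*
 * Rough sense of scale for the numbers printed here, using the defaults
 * from bch2_pd_controller_init() above (p_term_inverse = 6000) and
 * ignoring the smoothed derivative term: if "actual" sits 6000 units
 * above "target" for one second, proportional = 6000 * 1 / 6000 = 1, and
 * with sign == -1 the rate is nudged up by 1 unit/sec.  The controller is
 * deliberately slow to react, and the backpressure check against
 * rate.next keeps it from raising the rate while the consumer is already
 * failing to keep up.
 */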
div64_s64(pd->rate.next - local_clock(), NSEC_PER_MSEC)); + prt_newline(out); +} + +/* misc: */ + +void bch2_bio_map(struct bio *bio, void *base, size_t size) +{ + while (size) { + struct page *page = is_vmalloc_addr(base) + ? vmalloc_to_page(base) + : virt_to_page(base); + unsigned offset = offset_in_page(base); + unsigned len = min_t(size_t, PAGE_SIZE - offset, size); + + BUG_ON(!bio_add_page(bio, page, len, offset)); + size -= len; + base += len; + } +} + +int bch2_bio_alloc_pages_noprof(struct bio *bio, size_t size, gfp_t gfp_mask) +{ + while (size) { + struct page *page = alloc_pages_noprof(gfp_mask, 0); + unsigned len = min_t(size_t, PAGE_SIZE, size); + + if (!page) + return -ENOMEM; + + if (unlikely(!bio_add_page(bio, page, len, 0))) { + __free_page(page); + break; + } + + size -= len; + } + + return 0; +} + +size_t bch2_rand_range(size_t max) +{ + size_t rand; + + if (!max) + return 0; + + do { + rand = get_random_long(); + rand &= roundup_pow_of_two(max) - 1; + } while (rand >= max); + + return rand; +} + +void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) +{ + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, dst, iter, dst_iter) { + void *dstp = kmap_atomic(bv.bv_page); + memcpy(dstp + bv.bv_offset, src, bv.bv_len); + kunmap_atomic(dstp); + + src += bv.bv_len; + } +} + +void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) +{ + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, src, iter, src_iter) { + void *srcp = kmap_atomic(bv.bv_page); + memcpy(dst, srcp + bv.bv_offset, bv.bv_len); + kunmap_atomic(srcp); + + dst += bv.bv_len; + } +} + +static int alignment_ok(const void *base, size_t align) +{ + return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) || + ((unsigned long)base & (align - 1)) == 0; +} + +static void u32_swap(void *a, void *b, size_t size) +{ + u32 t = *(u32 *)a; + *(u32 *)a = *(u32 *)b; + *(u32 *)b = t; +} + +static void u64_swap(void *a, void *b, size_t size) +{ + u64 t = *(u64 *)a; + *(u64 *)a = *(u64 *)b; + *(u64 *)b = t; +} + +static void generic_swap(void *a, void *b, size_t size) +{ + char t; + + do { + t = *(char *)a; + *(char *)a++ = *(char *)b; + *(char *)b++ = t; + } while (--size > 0); +} + +static inline int do_cmp(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + size_t l, size_t r) +{ + return cmp_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +static inline void do_swap(void *base, size_t n, size_t size, + void (*swap_func)(void *, void *, size_t), + size_t l, size_t r) +{ + swap_func(base + inorder_to_eytzinger0(l, n) * size, + base + inorder_to_eytzinger0(r, n) * size, + size); +} + +void eytzinger0_sort(void *base, size_t n, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)) +{ + int i, c, r; + + if (!swap_func) { + if (size == 4 && alignment_ok(base, 4)) + swap_func = u32_swap; + else if (size == 8 && alignment_ok(base, 8)) + swap_func = u64_swap; + else + swap_func = generic_swap; + } + + /* heapify */ + for (i = n / 2 - 1; i >= 0; --i) { + for (r = i; r * 2 + 1 < n; r = c) { + c = r * 2 + 1; + + if (c + 1 < n && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + c++; + + if (do_cmp(base, n, size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } + + /* sort */ + for (i = n - 1; i > 0; --i) { + do_swap(base, n, size, swap_func, 0, i); + + for (r 
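/*
 * eytzinger0_sort() is heapsort over an array stored in 0-based eytzinger
 * (breadth-first) layout: cmp and swap are applied through
 * inorder_to_eytzinger0(), so after sorting an in-order traversal of the
 * layout visits the elements in ascending order and the array can be
 * walked with the eytzinger0_*() helpers (eytzinger0_for_each() etc.).
 * Usage sketch (cmp_u64 is a comparator made up for this example):
 *
 *	static int cmp_u64(const void *_l, const void *_r, size_t size)
 *	{
 *		const u64 *l = _l, *r = _r;
 *
 *		return *l < *r ? -1 : *l > *r;
 *	}
 *
 *	eytzinger0_sort(data, nr, sizeof(u64), cmp_u64, NULL);
 */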
= 0; r * 2 + 1 < i; r = c) { + c = r * 2 + 1; + + if (c + 1 < i && + do_cmp(base, n, size, cmp_func, c, c + 1) < 0) + c++; + + if (do_cmp(base, n, size, cmp_func, r, c) >= 0) + break; + + do_swap(base, n, size, swap_func, r, c); + } + } +} + +void sort_cmp_size(void *base, size_t num, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t size)) +{ + /* pre-scale counters for performance */ + int i = (num/2 - 1) * size, n = num * size, c, r; + + if (!swap_func) { + if (size == 4 && alignment_ok(base, 4)) + swap_func = u32_swap; + else if (size == 8 && alignment_ok(base, 8)) + swap_func = u64_swap; + else + swap_func = generic_swap; + } + + /* heapify */ + for ( ; i >= 0; i -= size) { + for (r = i; r * 2 + size < n; r = c) { + c = r * 2 + size; + if (c < n - size && + cmp_func(base + c, base + c + size, size) < 0) + c += size; + if (cmp_func(base + r, base + c, size) >= 0) + break; + swap_func(base + r, base + c, size); + } + } + + /* sort */ + for (i = n - size; i > 0; i -= size) { + swap_func(base, base + i, size); + for (r = 0; r * 2 + size < i; r = c) { + c = r * 2 + size; + if (c < i - size && + cmp_func(base + c, base + c + size, size) < 0) + c += size; + if (cmp_func(base + r, base + c, size) >= 0) + break; + swap_func(base + r, base + c, size); + } + } +} + +static void mempool_free_vp(void *element, void *pool_data) +{ + size_t size = (size_t) pool_data; + + vpfree(element, size); +} + +static void *mempool_alloc_vp(gfp_t gfp_mask, void *pool_data) +{ + size_t size = (size_t) pool_data; + + return vpmalloc(size, gfp_mask); +} + +int mempool_init_kvpmalloc_pool(mempool_t *pool, int min_nr, size_t size) +{ + return size < PAGE_SIZE + ? mempool_init_kmalloc_pool(pool, min_nr, size) + : mempool_init(pool, min_nr, mempool_alloc_vp, + mempool_free_vp, (void *) size); +} + +#if 0 +void eytzinger1_test(void) +{ + unsigned inorder, eytz, size; + + pr_info("1 based eytzinger test:"); + + for (size = 2; + size < 65536; + size++) { + unsigned extra = eytzinger1_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); + BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); + + BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); + BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); + + inorder = 1; + eytzinger1_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger1(inorder, size, extra) != eytz); + BUG_ON(__eytzinger1_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger1_last(size) && + eytzinger1_prev(eytzinger1_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +void eytzinger0_test(void) +{ + + unsigned inorder, eytz, size; + + pr_info("0 based eytzinger test:"); + + for (size = 1; + size < 65536; + size++) { + unsigned extra = eytzinger0_extra(size); + + if (!(size % 4096)) + pr_info("tree size %u", size); + + BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); + BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); + + BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); + BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); + + inorder = 0; + eytzinger0_for_each(eytz, size) { + BUG_ON(__inorder_to_eytzinger0(inorder, size, extra) != eytz); + BUG_ON(__eytzinger0_to_inorder(eytz, size, extra) != inorder); + BUG_ON(eytz != eytzinger0_last(size) && + eytzinger0_prev(eytzinger0_next(eytz, size), size) != eytz); + + inorder++; + } + } +} + +static inline int cmp_u16(const void *_l, const 
void *_r, size_t size) +{ + const u16 *l = _l, *r = _r; + + return (*l > *r) - (*r - *l); +} + +static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +{ + int i, c1 = -1, c2 = -1; + ssize_t r; + + r = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) + c1 = test_array[r]; + + for (i = 0; i < nr; i++) + if (test_array[i] <= search && test_array[i] > c2) + c2 = test_array[i]; + + if (c1 != c2) { + eytzinger0_for_each(i, nr) + pr_info("[%3u] = %12u", i, test_array[i]); + pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", + i, r, c1, c2); + } +} + +void eytzinger0_find_test(void) +{ + unsigned i, nr, allocated = 1 << 12; + u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); + + for (nr = 1; nr < allocated; nr++) { + pr_info("testing %u elems", nr); + + get_random_bytes(test_array, nr * sizeof(test_array[0])); + eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); + + /* verify array is sorted correctly: */ + eytzinger0_for_each(i, nr) + BUG_ON(i != eytzinger0_last(nr) && + test_array[i] > test_array[eytzinger0_next(i, nr)]); + + for (i = 0; i < U16_MAX; i += 1 << 12) + eytzinger0_find_test_val(test_array, nr, i); + + for (i = 0; i < nr; i++) { + eytzinger0_find_test_val(test_array, nr, test_array[i] - 1); + eytzinger0_find_test_val(test_array, nr, test_array[i]); + eytzinger0_find_test_val(test_array, nr, test_array[i] + 1); + } + } + + kfree(test_array); +} +#endif + +/* + * Accumulate percpu counters onto one cpu's copy - only valid when access + * against any percpu counter is guarded against + */ +u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) +{ + u64 *ret; + int cpu; + + /* access to pcpu vars has to be blocked by other locking */ + preempt_disable(); + ret = this_cpu_ptr(p); + preempt_enable(); + + for_each_possible_cpu(cpu) { + u64 *i = per_cpu_ptr(p, cpu); + + if (i != ret) { + acc_u64s(ret, i, nr); + memset(i, 0, nr * sizeof(u64)); + } + } + + return ret; +} diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h new file mode 100644 index 000000000..5fa29dab3 --- /dev/null +++ b/fs/bcachefs/util.h @@ -0,0 +1,846 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_UTIL_H +#define _BCACHEFS_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "darray.h" + +struct closure; + +#ifdef CONFIG_BCACHEFS_DEBUG +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) +#endif + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define CPU_BIG_ENDIAN 0 +#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ +#define CPU_BIG_ENDIAN 1 +#endif + +/* type hackery */ + +#define type_is_exact(_val, _type) \ + __builtin_types_compatible_p(typeof(_val), _type) + +#define type_is(_val, _type) \ + (__builtin_types_compatible_p(typeof(_val), _type) || \ + __builtin_types_compatible_p(typeof(_val), const _type)) + +/* Userspace doesn't align allocations as nicely as the kernel allocators: */ +static inline size_t buf_pages(void *p, size_t len) +{ + return DIV_ROUND_UP(len + + ((unsigned long) p & (PAGE_SIZE - 1)), + PAGE_SIZE); +} + +static inline void vpfree(void *p, size_t size) +{ + if (is_vmalloc_addr(p)) + vfree(p); + else + free_pages((unsigned long) p, get_order(size)); +} + +static inline void *vpmalloc_noprof(size_t size, gfp_t gfp_mask) +{ + return (void *) get_free_pages_noprof(gfp_mask|__GFP_NOWARN, + get_order(size)) ?: + 
__vmalloc_noprof(size, gfp_mask); +} +#define vpmalloc(_size, _gfp) alloc_hooks(vpmalloc_noprof(_size, _gfp)) + +static inline void kvpfree(void *p, size_t size) +{ + if (size < PAGE_SIZE) + kfree(p); + else + vpfree(p, size); +} + +static inline void *kvpmalloc_noprof(size_t size, gfp_t gfp_mask) +{ + return size < PAGE_SIZE + ? kmalloc_noprof(size, gfp_mask) + : vpmalloc_noprof(size, gfp_mask); +} +#define kvpmalloc(_size, _gfp) alloc_hooks(kvpmalloc_noprof(_size, _gfp)) + +int mempool_init_kvpmalloc_pool(mempool_t *, int, size_t); + +#define HEAP(type) \ +struct { \ + size_t size, used; \ + type *data; \ +} + +#define DECLARE_HEAP(type, name) HEAP(type) name + +#define init_heap(heap, _size, gfp) \ +({ \ + (heap)->used = 0; \ + (heap)->size = (_size); \ + (heap)->data = kvpmalloc((heap)->size * sizeof((heap)->data[0]),\ + (gfp)); \ +}) + +#define free_heap(heap) \ +do { \ + kvpfree((heap)->data, (heap)->size * sizeof((heap)->data[0])); \ + (heap)->data = NULL; \ +} while (0) + +#define heap_set_backpointer(h, i, _fn) \ +do { \ + void (*fn)(typeof(h), size_t) = _fn; \ + if (fn) \ + fn(h, i); \ +} while (0) + +#define heap_swap(h, i, j, set_backpointer) \ +do { \ + swap((h)->data[i], (h)->data[j]); \ + heap_set_backpointer(h, i, set_backpointer); \ + heap_set_backpointer(h, j, set_backpointer); \ +} while (0) + +#define heap_peek(h) \ +({ \ + EBUG_ON(!(h)->used); \ + (h)->data[0]; \ +}) + +#define heap_full(h) ((h)->used == (h)->size) + +#define heap_sift_down(h, i, cmp, set_backpointer) \ +do { \ + size_t _c, _j = i; \ + \ + for (; _j * 2 + 1 < (h)->used; _j = _c) { \ + _c = _j * 2 + 1; \ + if (_c + 1 < (h)->used && \ + cmp(h, (h)->data[_c], (h)->data[_c + 1]) >= 0) \ + _c++; \ + \ + if (cmp(h, (h)->data[_c], (h)->data[_j]) >= 0) \ + break; \ + heap_swap(h, _c, _j, set_backpointer); \ + } \ +} while (0) + +#define heap_sift_up(h, i, cmp, set_backpointer) \ +do { \ + while (i) { \ + size_t p = (i - 1) / 2; \ + if (cmp(h, (h)->data[i], (h)->data[p]) >= 0) \ + break; \ + heap_swap(h, i, p, set_backpointer); \ + i = p; \ + } \ +} while (0) + +#define __heap_add(h, d, cmp, set_backpointer) \ +({ \ + size_t _i = (h)->used++; \ + (h)->data[_i] = d; \ + heap_set_backpointer(h, _i, set_backpointer); \ + \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + _i; \ +}) + +#define heap_add(h, d, cmp, set_backpointer) \ +({ \ + bool _r = !heap_full(h); \ + if (_r) \ + __heap_add(h, d, cmp, set_backpointer); \ + _r; \ +}) + +#define heap_add_or_replace(h, new, cmp, set_backpointer) \ +do { \ + if (!heap_add(h, new, cmp, set_backpointer) && \ + cmp(h, new, heap_peek(h)) >= 0) { \ + (h)->data[0] = new; \ + heap_set_backpointer(h, 0, set_backpointer); \ + heap_sift_down(h, 0, cmp, set_backpointer); \ + } \ +} while (0) + +#define heap_del(h, i, cmp, set_backpointer) \ +do { \ + size_t _i = (i); \ + \ + BUG_ON(_i >= (h)->used); \ + (h)->used--; \ + if ((_i) < (h)->used) { \ + heap_swap(h, _i, (h)->used, set_backpointer); \ + heap_sift_up(h, _i, cmp, set_backpointer); \ + heap_sift_down(h, _i, cmp, set_backpointer); \ + } \ +} while (0) + +#define heap_pop(h, d, cmp, set_backpointer) \ +({ \ + bool _r = (h)->used; \ + if (_r) { \ + (d) = (h)->data[0]; \ + heap_del(h, 0, cmp, set_backpointer); \ + } \ + _r; \ +}) + +#define heap_resort(heap, cmp, set_backpointer) \ +do { \ + ssize_t _i; \ + for (_i = (ssize_t) (heap)->used / 2 - 1; _i >= 0; --_i) \ + heap_sift_down(heap, _i, cmp, set_backpointer); \ +} while (0) + +#define ANYSINT_MAX(t) \ + ((((t) 1 << (sizeof(t) * 8 - 2)) - (t) 1) * (t) 2 + (t) 1) + +#include 
"printbuf.h" + +#define prt_vprintf(_out, ...) bch2_prt_vprintf(_out, __VA_ARGS__) +#define prt_printf(_out, ...) bch2_prt_printf(_out, __VA_ARGS__) +#define printbuf_str(_buf) bch2_printbuf_str(_buf) +#define printbuf_exit(_buf) bch2_printbuf_exit(_buf) + +#define printbuf_tabstops_reset(_buf) bch2_printbuf_tabstops_reset(_buf) +#define printbuf_tabstop_pop(_buf) bch2_printbuf_tabstop_pop(_buf) +#define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) + +#define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) +#define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) + +#define prt_newline(_out) bch2_prt_newline(_out) +#define prt_tab(_out) bch2_prt_tab(_out) +#define prt_tab_rjust(_out) bch2_prt_tab_rjust(_out) + +#define prt_bytes_indented(...) bch2_prt_bytes_indented(__VA_ARGS__) +#define prt_u64(_out, _v) prt_printf(_out, "%llu", (u64) (_v)) +#define prt_human_readable_u64(...) bch2_prt_human_readable_u64(__VA_ARGS__) +#define prt_human_readable_s64(...) bch2_prt_human_readable_s64(__VA_ARGS__) +#define prt_units_u64(...) bch2_prt_units_u64(__VA_ARGS__) +#define prt_units_s64(...) bch2_prt_units_s64(__VA_ARGS__) +#define prt_string_option(...) bch2_prt_string_option(__VA_ARGS__) +#define prt_bitflags(...) bch2_prt_bitflags(__VA_ARGS__) + +void bch2_pr_time_units(struct printbuf *, u64); + +#ifdef __KERNEL__ +static inline void pr_time(struct printbuf *out, u64 time) +{ + prt_printf(out, "%llu", time); +} +#else +#include +static inline void pr_time(struct printbuf *out, u64 _time) +{ + char time_str[64]; + time_t time = _time; + struct tm *tm = localtime(&time); + size_t err = strftime(time_str, sizeof(time_str), "%c", tm); + if (!err) + prt_printf(out, "(formatting error)"); + else + prt_printf(out, "%s", time_str); +} +#endif + +#ifdef __KERNEL__ +static inline void uuid_unparse_lower(u8 *uuid, char *out) +{ + sprintf(out, "%pUb", uuid); +} +#else +#include +#endif + +static inline void pr_uuid(struct printbuf *out, u8 *uuid) +{ + char uuid_str[40]; + + uuid_unparse_lower(uuid, uuid_str); + prt_printf(out, "%s", uuid_str); +} + +int bch2_strtoint_h(const char *, int *); +int bch2_strtouint_h(const char *, unsigned int *); +int bch2_strtoll_h(const char *, long long *); +int bch2_strtoull_h(const char *, unsigned long long *); +int bch2_strtou64_h(const char *, u64 *); + +static inline int bch2_strtol_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch2_strtoint_h(cp, (int *) res); +#else + return bch2_strtoll_h(cp, (long long *) res); +#endif +} + +static inline int bch2_strtoul_h(const char *cp, long *res) +{ +#if BITS_PER_LONG == 32 + return bch2_strtouint_h(cp, (unsigned int *) res); +#else + return bch2_strtoull_h(cp, (unsigned long long *) res); +#endif +} + +#define strtoi_h(cp, res) \ + ( type_is(*res, int) ? bch2_strtoint_h(cp, (void *) res)\ + : type_is(*res, long) ? bch2_strtol_h(cp, (void *) res)\ + : type_is(*res, long long) ? bch2_strtoll_h(cp, (void *) res)\ + : type_is(*res, unsigned) ? bch2_strtouint_h(cp, (void *) res)\ + : type_is(*res, unsigned long) ? bch2_strtoul_h(cp, (void *) res)\ + : type_is(*res, unsigned long long) ? 
bch2_strtoull_h(cp, (void *) res)\ + : -EINVAL) + +#define strtoul_safe(cp, var) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = _v; \ + _r; \ +}) + +#define strtoul_safe_clamp(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r) \ + var = clamp_t(typeof(var), _v, min, max); \ + _r; \ +}) + +#define strtoul_safe_restrict(cp, var, min, max) \ +({ \ + unsigned long _v; \ + int _r = kstrtoul(cp, 10, &_v); \ + if (!_r && _v >= min && _v <= max) \ + var = _v; \ + else \ + _r = -EINVAL; \ + _r; \ +}) + +#define snprint(out, var) \ + prt_printf(out, \ + type_is(var, int) ? "%i\n" \ + : type_is(var, unsigned) ? "%u\n" \ + : type_is(var, long) ? "%li\n" \ + : type_is(var, unsigned long) ? "%lu\n" \ + : type_is(var, s64) ? "%lli\n" \ + : type_is(var, u64) ? "%llu\n" \ + : type_is(var, char *) ? "%s\n" \ + : "%i\n", var) + +bool bch2_is_zero(const void *, size_t); + +u64 bch2_read_flag_list(char *, const char * const[]); + +void bch2_prt_u64_binary(struct printbuf *, u64, unsigned); + +void bch2_print_string_as_lines(const char *prefix, const char *lines); + +typedef DARRAY(unsigned long) bch_stacktrace; +int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *); +void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *); +int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *); + +#define NR_QUANTILES 15 +#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES) +#define QUANTILE_FIRST eytzinger0_first(NR_QUANTILES) +#define QUANTILE_LAST eytzinger0_last(NR_QUANTILES) + +struct bch2_quantiles { + struct bch2_quantile_entry { + u64 m; + u64 step; + } entries[NR_QUANTILES]; +}; + +struct bch2_time_stat_buffer { + unsigned nr; + struct bch2_time_stat_buffer_entry { + u64 start; + u64 end; + } entries[32]; +}; + +struct bch2_time_stats { + spinlock_t lock; + /* all fields are in nanoseconds */ + u64 max_duration; + u64 min_duration; + u64 max_freq; + u64 min_freq; + u64 last_event; + struct bch2_quantiles quantiles; + + struct mean_and_variance duration_stats; + struct mean_and_variance_weighted duration_stats_weighted; + struct mean_and_variance freq_stats; + struct mean_and_variance_weighted freq_stats_weighted; + struct bch2_time_stat_buffer __percpu *buffer; +}; + +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64); +#else +static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {} +#endif + +static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) +{ + __bch2_time_stats_update(stats, start, local_clock()); +} + +void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *); + +void bch2_time_stats_exit(struct bch2_time_stats *); +void bch2_time_stats_init(struct bch2_time_stats *); + +#define ewma_add(ewma, val, weight) \ +({ \ + typeof(ewma) _ewma = (ewma); \ + typeof(weight) _weight = (weight); \ + \ + (((_ewma << _weight) - _ewma) + (val)) >> _weight; \ +}) + +struct bch_ratelimit { + /* Next time we want to do some work, in nanoseconds */ + u64 next; + + /* + * Rate at which we want to do work, in units per nanosecond + * The units here correspond to the units passed to + * bch2_ratelimit_increment() + */ + unsigned rate; +}; + +static inline void bch2_ratelimit_reset(struct bch_ratelimit *d) +{ + d->next = local_clock(); +} + +u64 bch2_ratelimit_delay(struct bch_ratelimit *); +void bch2_ratelimit_increment(struct bch_ratelimit *, u64); + +struct 
bch_pd_controller { + struct bch_ratelimit rate; + unsigned long last_update; + + s64 last_actual; + s64 smoothed_derivative; + + unsigned p_term_inverse; + unsigned d_smooth; + unsigned d_term; + + /* for exporting to sysfs (no effect on behavior) */ + s64 last_derivative; + s64 last_proportional; + s64 last_change; + s64 last_target; + + /* If true, the rate will not increase if bch2_ratelimit_delay() + * is not being called often enough. */ + bool backpressure; +}; + +void bch2_pd_controller_update(struct bch_pd_controller *, s64, s64, int); +void bch2_pd_controller_init(struct bch_pd_controller *); +void bch2_pd_controller_debug_to_text(struct printbuf *, struct bch_pd_controller *); + +#define sysfs_pd_controller_attribute(name) \ + rw_attribute(name##_rate); \ + rw_attribute(name##_rate_bytes); \ + rw_attribute(name##_rate_d_term); \ + rw_attribute(name##_rate_p_term_inverse); \ + read_attribute(name##_rate_debug) + +#define sysfs_pd_controller_files(name) \ + &sysfs_##name##_rate, \ + &sysfs_##name##_rate_bytes, \ + &sysfs_##name##_rate_d_term, \ + &sysfs_##name##_rate_p_term_inverse, \ + &sysfs_##name##_rate_debug + +#define sysfs_pd_controller_show(name, var) \ +do { \ + sysfs_hprint(name##_rate, (var)->rate.rate); \ + sysfs_print(name##_rate_bytes, (var)->rate.rate); \ + sysfs_print(name##_rate_d_term, (var)->d_term); \ + sysfs_print(name##_rate_p_term_inverse, (var)->p_term_inverse); \ + \ + if (attr == &sysfs_##name##_rate_debug) \ + bch2_pd_controller_debug_to_text(out, var); \ +} while (0) + +#define sysfs_pd_controller_store(name, var) \ +do { \ + sysfs_strtoul_clamp(name##_rate, \ + (var)->rate.rate, 1, UINT_MAX); \ + sysfs_strtoul_clamp(name##_rate_bytes, \ + (var)->rate.rate, 1, UINT_MAX); \ + sysfs_strtoul(name##_rate_d_term, (var)->d_term); \ + sysfs_strtoul_clamp(name##_rate_p_term_inverse, \ + (var)->p_term_inverse, 1, INT_MAX); \ +} while (0) + +#define container_of_or_null(ptr, type, member) \ +({ \ + typeof(ptr) _ptr = ptr; \ + _ptr ? 
container_of(_ptr, type, member) : NULL; \ +}) + +/* Does linear interpolation between powers of two */ +static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits) +{ + unsigned fract = x & ~(~0 << fract_bits); + + x >>= fract_bits; + x = 1 << x; + x += (x * fract) >> fract_bits; + + return x; +} + +void bch2_bio_map(struct bio *bio, void *base, size_t); +int bch2_bio_alloc_pages_noprof(struct bio *, size_t, gfp_t); +#define bch2_bio_alloc_pages(_bio, _size, _gfp) \ + alloc_hooks(bch2_bio_alloc_pages_noprof(_bio, _size, _gfp)) + +static inline sector_t bdev_sectors(struct block_device *bdev) +{ + return bdev->bd_inode->i_size >> 9; +} + +#define closure_bio_submit(bio, cl) \ +do { \ + closure_get(cl); \ + submit_bio(bio); \ +} while (0) + +#define kthread_wait(cond) \ +({ \ + int _ret = 0; \ + \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (kthread_should_stop()) { \ + _ret = -1; \ + break; \ + } \ + \ + if (cond) \ + break; \ + \ + schedule(); \ + } \ + set_current_state(TASK_RUNNING); \ + _ret; \ +}) + +#define kthread_wait_freezable(cond) \ +({ \ + int _ret = 0; \ + while (1) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (kthread_should_stop()) { \ + _ret = -1; \ + break; \ + } \ + \ + if (cond) \ + break; \ + \ + schedule(); \ + try_to_freeze(); \ + } \ + set_current_state(TASK_RUNNING); \ + _ret; \ +}) + +size_t bch2_rand_range(size_t); + +void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); +void memcpy_from_bio(void *, struct bio *, struct bvec_iter); + +static inline void memcpy_u64s_small(void *dst, const void *src, + unsigned u64s) +{ + u64 *d = dst; + const u64 *s = src; + + while (u64s--) + *d++ = *s++; +} + +static inline void __memcpy_u64s(void *dst, const void *src, + unsigned u64s) +{ +#ifdef CONFIG_X86_64 + long d0, d1, d2; + asm volatile("rep ; movsq" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) + : "memory"); +#else + u64 *d = dst; + const u64 *s = src; + + while (u64s--) + *d++ = *s++; +#endif +} + +static inline void memcpy_u64s(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(!(dst >= src + u64s * sizeof(u64) || + dst + u64s * sizeof(u64) <= src)); + + __memcpy_u64s(dst, src, u64s); +} + +static inline void __memmove_u64s_down(void *dst, const void *src, + unsigned u64s) +{ + __memcpy_u64s(dst, src, u64s); +} + +static inline void memmove_u64s_down(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst > src); + + __memmove_u64s_down(dst, src, u64s); +} + +static inline void __memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + memcpy_u64s_small(dst, src, u64s); +} + +static inline void memmove_u64s_down_small(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst > src); + + __memmove_u64s_down_small(dst, src, u64s); +} + +static inline void __memmove_u64s_up_small(void *_dst, const void *_src, + unsigned u64s) +{ + u64 *dst = (u64 *) _dst + u64s; + u64 *src = (u64 *) _src + u64s; + + while (u64s--) + *--dst = *--src; +} + +static inline void memmove_u64s_up_small(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst < src); + + __memmove_u64s_up_small(dst, src, u64s); +} + +static inline void __memmove_u64s_up(void *_dst, const void *_src, + unsigned u64s) +{ + u64 *dst = (u64 *) _dst + u64s - 1; + u64 *src = (u64 *) _src + u64s - 1; + +#ifdef CONFIG_X86_64 + long d0, d1, d2; + asm volatile("std ;\n" + "rep ; movsq\n" + "cld ;\n" + : "=&c" (d0), "=&D" (d1), "=&S" (d2) + : "0" (u64s), "1" (dst), "2" (src) + : "memory"); +#else + while 
(u64s--) + *dst-- = *src--; +#endif +} + +static inline void memmove_u64s_up(void *dst, const void *src, + unsigned u64s) +{ + EBUG_ON(dst < src); + + __memmove_u64s_up(dst, src, u64s); +} + +static inline void memmove_u64s(void *dst, const void *src, + unsigned u64s) +{ + if (dst < src) + __memmove_u64s_down(dst, src, u64s); + else + __memmove_u64s_up(dst, src, u64s); +} + +/* Set the last few bytes up to a u64 boundary given an offset into a buffer. */ +static inline void memset_u64s_tail(void *s, int c, unsigned bytes) +{ + unsigned rem = round_up(bytes, sizeof(u64)) - bytes; + + memset(s + bytes, c, rem); +} + +void sort_cmp_size(void *base, size_t num, size_t size, + int (*cmp_func)(const void *, const void *, size_t), + void (*swap_func)(void *, void *, size_t)); + +/* just the memmove, doesn't update @_nr */ +#define __array_insert_item(_array, _nr, _pos) \ + memmove(&(_array)[(_pos) + 1], \ + &(_array)[(_pos)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))) + +#define array_insert_item(_array, _nr, _pos, _new_item) \ +do { \ + __array_insert_item(_array, _nr, _pos); \ + (_nr)++; \ + (_array)[(_pos)] = (_new_item); \ +} while (0) + +#define array_remove_items(_array, _nr, _pos, _nr_to_remove) \ +do { \ + (_nr) -= (_nr_to_remove); \ + memmove(&(_array)[(_pos)], \ + &(_array)[(_pos) + (_nr_to_remove)], \ + sizeof((_array)[0]) * ((_nr) - (_pos))); \ +} while (0) + +#define array_remove_item(_array, _nr, _pos) \ + array_remove_items(_array, _nr, _pos, 1) + +static inline void __move_gap(void *array, size_t element_size, + size_t nr, size_t size, + size_t old_gap, size_t new_gap) +{ + size_t gap_end = old_gap + size - nr; + + if (new_gap < old_gap) { + size_t move = old_gap - new_gap; + + memmove(array + element_size * (gap_end - move), + array + element_size * (old_gap - move), + element_size * move); + } else if (new_gap > old_gap) { + size_t move = new_gap - old_gap; + + memmove(array + element_size * old_gap, + array + element_size * gap_end, + element_size * move); + } +} + +/* Move the gap in a gap buffer: */ +#define move_gap(_array, _nr, _size, _old_gap, _new_gap) \ + __move_gap(_array, sizeof(_array[0]), _nr, _size, _old_gap, _new_gap) + +#define bubble_sort(_base, _nr, _cmp) \ +do { \ + ssize_t _i, _end; \ + bool _swapped = true; \ + \ + for (_end = (ssize_t) (_nr) - 1; _end > 0 && _swapped; --_end) {\ + _swapped = false; \ + for (_i = 0; _i < _end; _i++) \ + if (_cmp((_base)[_i], (_base)[_i + 1]) > 0) { \ + swap((_base)[_i], (_base)[_i + 1]); \ + _swapped = true; \ + } \ + } \ +} while (0) + +static inline u64 percpu_u64_get(u64 __percpu *src) +{ + u64 ret = 0; + int cpu; + + for_each_possible_cpu(cpu) + ret += *per_cpu_ptr(src, cpu); + return ret; +} + +static inline void percpu_u64_set(u64 __percpu *dst, u64 src) +{ + int cpu; + + for_each_possible_cpu(cpu) + *per_cpu_ptr(dst, cpu) = 0; + this_cpu_write(*dst, src); +} + +static inline void acc_u64s(u64 *acc, const u64 *src, unsigned nr) +{ + unsigned i; + + for (i = 0; i < nr; i++) + acc[i] += src[i]; +} + +static inline void acc_u64s_percpu(u64 *acc, const u64 __percpu *src, + unsigned nr) +{ + int cpu; + + for_each_possible_cpu(cpu) + acc_u64s(acc, per_cpu_ptr(src, cpu), nr); +} + +static inline void percpu_memset(void __percpu *p, int c, size_t bytes) +{ + int cpu; + + for_each_possible_cpu(cpu) + memset(per_cpu_ptr(p, cpu), c, bytes); +} + +u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); + +#define cmp_int(l, r) ((l > r) - (l < r)) + +static inline int u8_cmp(u8 l, u8 r) +{ + return cmp_int(l, r); +} + +#include + 
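/*
 * Editor's note: a minimal usage sketch for the HEAP() helpers above, not
 * part of the patch; u64_heap, u64_heap_cmp and u64_heap_example are made-up
 * names for illustration only. The comparator receives the heap plus two
 * elements and returns <0/==0/>0, so cmp_int() yields a min-heap; pass NULL
 * when no backpointer callback is needed.
 */
#if 0
typedef HEAP(u64) u64_heap;

#define u64_heap_cmp(h, l, r)	cmp_int(l, r)

static int u64_heap_example(void)
{
	u64_heap h;
	u64 v;

	if (!init_heap(&h, 128, GFP_KERNEL))
		return -ENOMEM;

	heap_add(&h, 42, u64_heap_cmp, NULL);
	heap_add(&h, 7,  u64_heap_cmp, NULL);

	while (heap_pop(&h, v, u64_heap_cmp, NULL))
		pr_info("popped %llu\n", v);	/* 7, then 42 */

	free_heap(&h);
	return 0;
}
#endif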
+#endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/varint.c b/fs/bcachefs/varint.c new file mode 100644 index 000000000..ef030fc02 --- /dev/null +++ b/fs/bcachefs/varint.c @@ -0,0 +1,122 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include + +#ifdef CONFIG_VALGRIND +#include +#endif + +#include "varint.h" + +/** + * bch2_varint_encode - encode a variable length integer + * @out - destination to encode to + * @v - unsigned integer to encode + * + * Returns the size in bytes of the encoded integer - at most 9 bytes + */ +int bch2_varint_encode(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + __le64 v_le; + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + v_le = cpu_to_le64(v); + memcpy(out, &v_le, bytes); + } else { + *out++ = 255; + bytes = 9; + put_unaligned_le64(v, out); + } + + return bytes; +} + +/** + * bch2_varint_decode - encode a variable length integer + * @in - varint to decode + * @end - end of buffer to decode from + * @out - on success, decoded integer + * + * Returns the size in bytes of the decoded integer - or -1 on failure (would + * have read past the end of the buffer) + */ +int bch2_varint_decode(const u8 *in, const u8 *end, u64 *out) +{ + unsigned bytes = likely(in < end) + ? ffz(*in & 255) + 1 + : 1; + u64 v; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + __le64 v_le = 0; + memcpy(&v_le, in, bytes); + v = le64_to_cpu(v_le); + v >>= bytes; + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} + +/** + * bch2_varint_encode_fast - fast version of bch2_varint_encode + * + * This version assumes it's always safe to write 8 bytes to @out, even if the + * encoded integer would be smaller. + */ +int bch2_varint_encode_fast(u8 *out, u64 v) +{ + unsigned bits = fls64(v|1); + unsigned bytes = DIV_ROUND_UP(bits, 7); + + if (likely(bytes < 9)) { + v <<= bytes; + v |= ~(~0 << (bytes - 1)); + } else { + *out++ = 255; + bytes = 9; + } + + put_unaligned_le64(v, out); + return bytes; +} + +/** + * bch2_varint_decode_fast - fast version of bch2_varint_decode + * + * This version assumes that it is safe to read at most 8 bytes past the end of + * @end (we still return an error if the varint extends past @end). 
+ */ +int bch2_varint_decode_fast(const u8 *in, const u8 *end, u64 *out) +{ +#ifdef CONFIG_VALGRIND + VALGRIND_MAKE_MEM_DEFINED(in, 8); +#endif + u64 v = get_unaligned_le64(in); + unsigned bytes = ffz(*in) + 1; + + if (unlikely(in + bytes > end)) + return -1; + + if (likely(bytes < 9)) { + v >>= bytes; + v &= ~(~0ULL << (7 * bytes)); + } else { + v = get_unaligned_le64(++in); + } + + *out = v; + return bytes; +} diff --git a/fs/bcachefs/varint.h b/fs/bcachefs/varint.h new file mode 100644 index 000000000..92a182fb3 --- /dev/null +++ b/fs/bcachefs/varint.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_VARINT_H +#define _BCACHEFS_VARINT_H + +int bch2_varint_encode(u8 *, u64); +int bch2_varint_decode(const u8 *, const u8 *, u64 *); + +int bch2_varint_encode_fast(u8 *, u64); +int bch2_varint_decode_fast(const u8 *, const u8 *, u64 *); + +#endif /* _BCACHEFS_VARINT_H */ diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h new file mode 100644 index 000000000..53a694d71 --- /dev/null +++ b/fs/bcachefs/vstructs.h @@ -0,0 +1,63 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _VSTRUCTS_H +#define _VSTRUCTS_H + +#include "util.h" + +/* + * NOTE: we can't differentiate between __le64 and u64 with type_is - this + * assumes u64 is little endian: + */ +#define __vstruct_u64s(_s) \ +({ \ + ( type_is((_s)->u64s, u64) ? le64_to_cpu((__force __le64) (_s)->u64s) \ + : type_is((_s)->u64s, u32) ? le32_to_cpu((__force __le32) (_s)->u64s) \ + : type_is((_s)->u64s, u16) ? le16_to_cpu((__force __le16) (_s)->u64s) \ + : ((__force u8) ((_s)->u64s))); \ +}) + +#define __vstruct_bytes(_type, _u64s) \ +({ \ + BUILD_BUG_ON(offsetof(_type, _data) % sizeof(u64)); \ + \ + (size_t) (offsetof(_type, _data) + (_u64s) * sizeof(u64)); \ +}) + +#define vstruct_bytes(_s) \ + __vstruct_bytes(typeof(*(_s)), __vstruct_u64s(_s)) + +#define __vstruct_blocks(_type, _sector_block_bits, _u64s) \ + (round_up(__vstruct_bytes(_type, _u64s), \ + 512 << (_sector_block_bits)) >> (9 + (_sector_block_bits))) + +#define vstruct_blocks(_s, _sector_block_bits) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, __vstruct_u64s(_s)) + +#define vstruct_blocks_plus(_s, _sector_block_bits, _u64s) \ + __vstruct_blocks(typeof(*(_s)), _sector_block_bits, \ + __vstruct_u64s(_s) + (_u64s)) + +#define vstruct_sectors(_s, _sector_block_bits) \ + (round_up(vstruct_bytes(_s), 512 << (_sector_block_bits)) >> 9) + +#define vstruct_next(_s) \ + ((typeof(_s)) ((_s)->_data + __vstruct_u64s(_s))) +#define vstruct_last(_s) \ + ((typeof(&(_s)->start[0])) ((_s)->_data + __vstruct_u64s(_s))) +#define vstruct_end(_s) \ + ((void *) ((_s)->_data + __vstruct_u64s(_s))) + +#define vstruct_for_each(_s, _i) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s); \ + _i = vstruct_next(_i)) + +#define vstruct_for_each_safe(_s, _i, _t) \ + for (_i = (_s)->start; \ + _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \ + _i = _t) + +#define vstruct_idx(_s, _idx) \ + ((typeof(&(_s)->start[0])) ((_s)->_data + (_idx))) + +#endif /* _VSTRUCTS_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c new file mode 100644 index 000000000..70f78006d --- /dev/null +++ b/fs/bcachefs/xattr.c @@ -0,0 +1,648 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "bkey_methods.h" +#include "btree_update.h" +#include "extents.h" +#include "fs.h" +#include "rebalance.h" +#include "str_hash.h" +#include "xattr.h" + +#include +#include +#include + +static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned); + 
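/*
 * Editor's note: a minimal sketch, not part of the patch, of how a lookup
 * key for this hash table is built; example_user_xattr_key() and the literal
 * attribute name are made up for illustration. The VFS strips the handler
 * prefix before calling the handlers below, so a getxattr of "user.comment"
 * arrives here as name "comment" with type KEY_TYPE_XATTR_INDEX_USER; only
 * (type, name) are hashed, the value is not (see bch2_xattr_hash() below).
 */
#if 0
static struct xattr_search_key example_user_xattr_key(void)
{
	return X_SEARCH(KEY_TYPE_XATTR_INDEX_USER, "comment",
			strlen("comment"));
}
#endif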
+static u64 bch2_xattr_hash(const struct bch_hash_info *info, + const struct xattr_search_key *key) +{ + struct bch_str_hash_ctx ctx; + + bch2_str_hash_init(&ctx, info); + bch2_str_hash_update(&ctx, info, &key->type, sizeof(key->type)); + bch2_str_hash_update(&ctx, info, key->name.name, key->name.len); + + return bch2_str_hash_end(&ctx, info); +} + +static u64 xattr_hash_key(const struct bch_hash_info *info, const void *key) +{ + return bch2_xattr_hash(info, key); +} + +static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) +{ + struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); + + return bch2_xattr_hash(info, + &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); +} + +static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) +{ + struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); + const struct xattr_search_key *r = _r; + + return l.v->x_type != r->type || + l.v->x_name_len != r->name.len || + memcmp(l.v->x_name, r->name.name, r->name.len); +} + +static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) +{ + struct bkey_s_c_xattr l = bkey_s_c_to_xattr(_l); + struct bkey_s_c_xattr r = bkey_s_c_to_xattr(_r); + + return l.v->x_type != r.v->x_type || + l.v->x_name_len != r.v->x_name_len || + memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); +} + +const struct bch_hash_desc bch2_xattr_hash_desc = { + .btree_id = BTREE_ID_xattrs, + .key_type = KEY_TYPE_xattr, + .hash_key = xattr_hash_key, + .hash_bkey = xattr_hash_bkey, + .cmp_key = xattr_cmp_key, + .cmp_bkey = xattr_cmp_bkey, +}; + +int bch2_xattr_invalid(const struct bch_fs *c, struct bkey_s_c k, + enum bkey_invalid_flags flags, + struct printbuf *err) +{ + const struct xattr_handler *handler; + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); + + if (bkey_val_u64s(k.k) < + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len))) { + prt_printf(err, "value too small (%zu < %u)", + bkey_val_u64s(k.k), + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len))); + return -BCH_ERR_invalid_bkey; + } + + /* XXX why +4 ? 
*/ + if (bkey_val_u64s(k.k) > + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4)) { + prt_printf(err, "value too big (%zu > %u)", + bkey_val_u64s(k.k), + xattr_val_u64s(xattr.v->x_name_len, + le16_to_cpu(xattr.v->x_val_len) + 4)); + return -BCH_ERR_invalid_bkey; + } + + handler = bch2_xattr_type_to_handler(xattr.v->x_type); + if (!handler) { + prt_printf(err, "invalid type (%u)", xattr.v->x_type); + return -BCH_ERR_invalid_bkey; + } + + if (memchr(xattr.v->x_name, '\0', xattr.v->x_name_len)) { + prt_printf(err, "xattr name has invalid characters"); + return -BCH_ERR_invalid_bkey; + } + + return 0; +} + +void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, + struct bkey_s_c k) +{ + const struct xattr_handler *handler; + struct bkey_s_c_xattr xattr = bkey_s_c_to_xattr(k); + + handler = bch2_xattr_type_to_handler(xattr.v->x_type); + if (handler && handler->prefix) + prt_printf(out, "%s", handler->prefix); + else if (handler) + prt_printf(out, "(type %u)", xattr.v->x_type); + else + prt_printf(out, "(unknown type %u)", xattr.v->x_type); + + prt_printf(out, "%.*s:%.*s", + xattr.v->x_name_len, + xattr.v->x_name, + le16_to_cpu(xattr.v->x_val_len), + (char *) xattr_val(xattr.v)); +} + +static int bch2_xattr_get_trans(struct btree_trans *trans, struct bch_inode_info *inode, + const char *name, void *buffer, size_t size, int type) +{ + struct bch_hash_info hash = bch2_hash_info_init(trans->c, &inode->ei_inode); + struct xattr_search_key search = X_SEARCH(type, name, strlen(name)); + struct btree_iter iter; + struct bkey_s_c_xattr xattr; + struct bkey_s_c k; + int ret; + + ret = bch2_hash_lookup(trans, &iter, bch2_xattr_hash_desc, &hash, + inode_inum(inode), &search, 0); + if (ret) + goto err1; + + k = bch2_btree_iter_peek_slot(&iter); + ret = bkey_err(k); + if (ret) + goto err2; + + xattr = bkey_s_c_to_xattr(k); + ret = le16_to_cpu(xattr.v->x_val_len); + if (buffer) { + if (ret > size) + ret = -ERANGE; + else + memcpy(buffer, xattr_val(xattr.v), ret); + } +err2: + bch2_trans_iter_exit(trans, &iter); +err1: + return ret < 0 && bch2_err_matches(ret, ENOENT) ? -ENODATA : ret; +} + +int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode_u, + const struct bch_hash_info *hash_info, + const char *name, const void *value, size_t size, + int type, int flags) +{ + struct bch_fs *c = trans->c; + struct btree_iter inode_iter = { NULL }; + int ret; + + ret = bch2_inode_peek(trans, &inode_iter, inode_u, inum, BTREE_ITER_INTENT); + if (ret) + return ret; + + inode_u->bi_ctime = bch2_current_time(c); + + ret = bch2_inode_write(trans, &inode_iter, inode_u); + bch2_trans_iter_exit(trans, &inode_iter); + + if (ret) + return ret; + + if (value) { + struct bkey_i_xattr *xattr; + unsigned namelen = strlen(name); + unsigned u64s = BKEY_U64s + + xattr_val_u64s(namelen, size); + + if (u64s > U8_MAX) + return -ERANGE; + + xattr = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); + if (IS_ERR(xattr)) + return PTR_ERR(xattr); + + bkey_xattr_init(&xattr->k_i); + xattr->k.u64s = u64s; + xattr->v.x_type = type; + xattr->v.x_name_len = namelen; + xattr->v.x_val_len = cpu_to_le16(size); + memcpy(xattr->v.x_name, name, namelen); + memcpy(xattr_val(&xattr->v), value, size); + + ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, + inum, &xattr->k_i, + (flags & XATTR_CREATE ? BCH_HASH_SET_MUST_CREATE : 0)| + (flags & XATTR_REPLACE ? 
BCH_HASH_SET_MUST_REPLACE : 0)); + } else { + struct xattr_search_key search = + X_SEARCH(type, name, strlen(name)); + + ret = bch2_hash_delete(trans, bch2_xattr_hash_desc, + hash_info, inum, &search); + } + + if (bch2_err_matches(ret, ENOENT)) + ret = flags & XATTR_REPLACE ? -ENODATA : 0; + + return ret; +} + +struct xattr_buf { + char *buf; + size_t len; + size_t used; +}; + +static int __bch2_xattr_emit(const char *prefix, + const char *name, size_t name_len, + struct xattr_buf *buf) +{ + const size_t prefix_len = strlen(prefix); + const size_t total_len = prefix_len + name_len + 1; + + if (buf->buf) { + if (buf->used + total_len > buf->len) + return -ERANGE; + + memcpy(buf->buf + buf->used, prefix, prefix_len); + memcpy(buf->buf + buf->used + prefix_len, + name, name_len); + buf->buf[buf->used + prefix_len + name_len] = '\0'; + } + + buf->used += total_len; + return 0; +} + +static int bch2_xattr_emit(struct dentry *dentry, + const struct bch_xattr *xattr, + struct xattr_buf *buf) +{ + const struct xattr_handler *handler = + bch2_xattr_type_to_handler(xattr->x_type); + + return handler && (!handler->list || handler->list(dentry)) + ? __bch2_xattr_emit(handler->prefix ?: handler->name, + xattr->x_name, xattr->x_name_len, buf) + : 0; +} + +static int bch2_xattr_list_bcachefs(struct bch_fs *c, + struct bch_inode_unpacked *inode, + struct xattr_buf *buf, + bool all) +{ + const char *prefix = all ? "bcachefs_effective." : "bcachefs."; + unsigned id; + int ret = 0; + u64 v; + + for (id = 0; id < Inode_opt_nr; id++) { + v = bch2_inode_opt_get(inode, id); + if (!v) + continue; + + if (!all && + !(inode->bi_fields_set & (1 << id))) + continue; + + ret = __bch2_xattr_emit(prefix, bch2_inode_opts[id], + strlen(bch2_inode_opts[id]), buf); + if (ret) + break; + } + + return ret; +} + +ssize_t bch2_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + struct bch_fs *c = dentry->d_sb->s_fs_info; + struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); + struct btree_trans trans; + struct btree_iter iter; + struct bkey_s_c k; + struct xattr_buf buf = { .buf = buffer, .len = buffer_size }; + u64 offset = 0, inum = inode->ei_inode.bi_inum; + u32 snapshot; + int ret; + + bch2_trans_init(&trans, c, 0, 0); +retry: + bch2_trans_begin(&trans); + iter = (struct btree_iter) { NULL }; + + ret = bch2_subvolume_get_snapshot(&trans, inode->ei_subvol, &snapshot); + if (ret) + goto err; + + for_each_btree_key_upto_norestart(&trans, iter, BTREE_ID_xattrs, + SPOS(inum, offset, snapshot), + POS(inum, U64_MAX), 0, k, ret) { + if (k.k->type != KEY_TYPE_xattr) + continue; + + ret = bch2_xattr_emit(dentry, bkey_s_c_to_xattr(k).v, &buf); + if (ret) + break; + } + + offset = iter.pos.offset; + bch2_trans_iter_exit(&trans, &iter); +err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + goto retry; + + bch2_trans_exit(&trans); + + if (ret) + goto out; + + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, false); + if (ret) + goto out; + + ret = bch2_xattr_list_bcachefs(c, &inode->ei_inode, &buf, true); + if (ret) + goto out; + + return buf.used; +out: + return bch2_err_class(ret); +} + +static int bch2_xattr_get_handler(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + int ret = bch2_trans_do(c, NULL, NULL, 0, + bch2_xattr_get_trans(&trans, inode, name, buffer, size, handler->flags)); + + return 
bch2_err_class(ret); +} + +static int bch2_xattr_set_handler(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); + struct bch_inode_unpacked inode_u; + struct btree_trans trans; + int ret; + + bch2_trans_init(&trans, c, 0, 0); + + ret = commit_do(&trans, NULL, NULL, 0, + bch2_xattr_set(&trans, inode_inum(inode), &inode_u, + &hash, name, value, size, + handler->flags, flags)); + if (!ret) + bch2_inode_update_after_write(&trans, inode, &inode_u, ATTR_CTIME); + bch2_trans_exit(&trans); + + return bch2_err_class(ret); +} + +static const struct xattr_handler bch_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags = KEY_TYPE_XATTR_INDEX_USER, +}; + +static bool bch2_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); +} + +static const struct xattr_handler bch_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = bch2_xattr_trusted_list, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags = KEY_TYPE_XATTR_INDEX_TRUSTED, +}; + +static const struct xattr_handler bch_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = bch2_xattr_get_handler, + .set = bch2_xattr_set_handler, + .flags = KEY_TYPE_XATTR_INDEX_SECURITY, +}; + +#ifndef NO_BCACHEFS_FS + +static int opt_to_inode_opt(int id) +{ + switch (id) { +#define x(name, ...) \ + case Opt_##name: return Inode_opt_##name; + BCH_INODE_OPTS() +#undef x + default: + return -1; + } +} + +static int __bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size, + bool all) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_opts opts = + bch2_inode_opts_to_opts(&inode->ei_inode); + const struct bch_option *opt; + int id, inode_opt_id; + struct printbuf out = PRINTBUF; + int ret; + u64 v; + + id = bch2_opt_lookup(name); + if (id < 0 || !bch2_opt_is_inode_opt(id)) + return -EINVAL; + + inode_opt_id = opt_to_inode_opt(id); + if (inode_opt_id < 0) + return -EINVAL; + + opt = bch2_opt_table + id; + + if (!bch2_opt_defined_by_id(&opts, id)) + return -ENODATA; + + if (!all && + !(inode->ei_inode.bi_fields_set & (1 << inode_opt_id))) + return -ENODATA; + + v = bch2_opt_get_by_id(&opts, id); + bch2_opt_to_text(&out, c, c->disk_sb.sb, opt, v, 0); + + ret = out.pos; + + if (out.allocation_failure) { + ret = -ENOMEM; + } else if (buffer) { + if (out.pos > size) + ret = -ERANGE; + else + memcpy(buffer, out.buf, out.pos); + } + + printbuf_exit(&out); + return ret; +} + +static int bch2_xattr_bcachefs_get(const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + return __bch2_xattr_bcachefs_get(handler, dentry, vinode, + name, buffer, size, false); +} + +struct inode_opt_set { + int id; + u64 v; + bool defined; +}; + +static int inode_opt_set_fn(struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct inode_opt_set *s = p; + + if (s->defined) + bi->bi_fields_set |= 1U << s->id; + else + bi->bi_fields_set &= ~(1U << s->id); + + bch2_inode_opt_set(bi, s->id, s->v); 
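	/*
	 * Editorial comment: bi_fields_set records which options were set
	 * explicitly on this inode via the "bcachefs." xattrs, as opposed to
	 * inherited/filesystem defaults; bch2_xattr_list_bcachefs() above
	 * consults the same bitmask, so only explicitly-set options appear
	 * under the "bcachefs." prefix while "bcachefs_effective." lists
	 * them all.
	 */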
+ + return 0; +} + +static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *dentry, struct inode *vinode, + const char *name, const void *value, + size_t size, int flags) +{ + struct bch_inode_info *inode = to_bch_ei(vinode); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + const struct bch_option *opt; + char *buf; + struct inode_opt_set s; + int opt_id, inode_opt_id, ret; + + opt_id = bch2_opt_lookup(name); + if (opt_id < 0) + return -EINVAL; + + opt = bch2_opt_table + opt_id; + + inode_opt_id = opt_to_inode_opt(opt_id); + if (inode_opt_id < 0) + return -EINVAL; + + s.id = inode_opt_id; + + if (value) { + u64 v = 0; + + buf = kmalloc(size + 1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, value, size); + buf[size] = '\0'; + + ret = bch2_opt_parse(c, opt, buf, &v, NULL); + kfree(buf); + + if (ret < 0) + return ret; + + ret = bch2_opt_check_may_set(c, opt_id, v); + if (ret < 0) + return ret; + + s.v = v + 1; + s.defined = true; + } else { + if (!IS_ROOT(dentry)) { + struct bch_inode_info *dir = + to_bch_ei(d_inode(dentry->d_parent)); + + s.v = bch2_inode_opt_get(&dir->ei_inode, inode_opt_id); + } else { + s.v = 0; + } + + s.defined = false; + } + + mutex_lock(&inode->ei_update_lock); + if (inode_opt_id == Inode_opt_project) { + /* + * inode fields accessible via the xattr interface are stored + * with a +1 bias, so that 0 means unset: + */ + ret = bch2_set_projid(c, inode, s.v ? s.v - 1 : 0); + if (ret) + goto err; + } + + ret = bch2_write_inode(c, inode, inode_opt_set_fn, &s, 0); +err: + mutex_unlock(&inode->ei_update_lock); + + if (value && + (opt_id == Opt_background_compression || + opt_id == Opt_background_target)) + bch2_rebalance_add_work(c, inode->v.i_blocks); + + return bch2_err_class(ret); +} + +static const struct xattr_handler bch_xattr_bcachefs_handler = { + .prefix = "bcachefs.", + .get = bch2_xattr_bcachefs_get, + .set = bch2_xattr_bcachefs_set, +}; + +static int bch2_xattr_bcachefs_get_effective( + const struct xattr_handler *handler, + struct dentry *dentry, struct inode *vinode, + const char *name, void *buffer, size_t size) +{ + return __bch2_xattr_bcachefs_get(handler, dentry, vinode, + name, buffer, size, true); +} + +static const struct xattr_handler bch_xattr_bcachefs_effective_handler = { + .prefix = "bcachefs_effective.", + .get = bch2_xattr_bcachefs_get_effective, + .set = bch2_xattr_bcachefs_set, +}; + +#endif /* NO_BCACHEFS_FS */ + +const struct xattr_handler *bch2_xattr_handlers[] = { + &bch_xattr_user_handler, +#ifdef CONFIG_BCACHEFS_POSIX_ACL + &nop_posix_acl_access, + &nop_posix_acl_default, +#endif + &bch_xattr_trusted_handler, + &bch_xattr_security_handler, +#ifndef NO_BCACHEFS_FS + &bch_xattr_bcachefs_handler, + &bch_xattr_bcachefs_effective_handler, +#endif + NULL +}; + +static const struct xattr_handler *bch_xattr_handler_map[] = { + [KEY_TYPE_XATTR_INDEX_USER] = &bch_xattr_user_handler, + [KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS] = + &nop_posix_acl_access, + [KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &nop_posix_acl_default, + [KEY_TYPE_XATTR_INDEX_TRUSTED] = &bch_xattr_trusted_handler, + [KEY_TYPE_XATTR_INDEX_SECURITY] = &bch_xattr_security_handler, +}; + +static const struct xattr_handler *bch2_xattr_type_to_handler(unsigned type) +{ + return type < ARRAY_SIZE(bch_xattr_handler_map) + ? 
bch_xattr_handler_map[type] + : NULL; +} diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h new file mode 100644 index 000000000..f5a52e3a6 --- /dev/null +++ b/fs/bcachefs/xattr.h @@ -0,0 +1,50 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_XATTR_H +#define _BCACHEFS_XATTR_H + +#include "str_hash.h" + +extern const struct bch_hash_desc bch2_xattr_hash_desc; + +int bch2_xattr_invalid(const struct bch_fs *, struct bkey_s_c, + enum bkey_invalid_flags, struct printbuf *); +void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); + +#define bch2_bkey_ops_xattr ((struct bkey_ops) { \ + .key_invalid = bch2_xattr_invalid, \ + .val_to_text = bch2_xattr_to_text, \ + .min_val_size = 8, \ +}) + +static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) +{ + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + + name_len + val_len, sizeof(u64)); +} + +#define xattr_val(_xattr) \ + ((void *) (_xattr)->x_name + (_xattr)->x_name_len) + +struct xattr_search_key { + u8 type; + struct qstr name; +}; + +#define X_SEARCH(_type, _name, _len) ((struct xattr_search_key) \ + { .type = _type, .name = QSTR_INIT(_name, _len) }) + +struct dentry; +struct xattr_handler; +struct bch_hash_info; +struct bch_inode_info; + +/* Exported for cmd_migrate.c in tools: */ +int bch2_xattr_set(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, const struct bch_hash_info *, + const char *, const void *, size_t, int, int); + +ssize_t bch2_xattr_list(struct dentry *, char *, size_t); + +extern const struct xattr_handler *bch2_xattr_handlers[]; + +#endif /* _BCACHEFS_XATTR_H */ diff --git a/fs/dcache.c b/fs/dcache.c index 52e6d5fda..dbdafa261 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -3249,11 +3249,10 @@ void d_genocide(struct dentry *parent) EXPORT_SYMBOL(d_genocide); -void d_tmpfile(struct file *file, struct inode *inode) +void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; - inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); @@ -3263,6 +3262,15 @@ void d_tmpfile(struct file *file, struct inode *inode) (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); +} +EXPORT_SYMBOL(d_mark_tmpfile); + +void d_tmpfile(struct file *file, struct inode *inode) +{ + struct dentry *dentry = file->f_path.dentry; + + inode_dec_link_count(inode); + d_mark_tmpfile(file, inode); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); diff --git a/fs/inode.c b/fs/inode.c index 577799b78..7a32d6aa4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -57,8 +57,23 @@ static unsigned int i_hash_mask __read_mostly; static unsigned int i_hash_shift __read_mostly; -static struct hlist_head *inode_hashtable __read_mostly; -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); +static struct hlist_bl_head *inode_hashtable __read_mostly; + +static unsigned long hash(struct super_block *sb, unsigned long hashval) +{ + unsigned long tmp; + + tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / + L1_CACHE_BYTES; + tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); + return tmp & i_hash_mask; +} + +static inline struct hlist_bl_head *i_hash_head(struct super_block *sb, + unsigned int hashval) +{ + return inode_hashtable + hash(sb, hashval); +} /* * Empty aops. 
Can be used for the cases where the user does not @@ -417,7 +432,7 @@ EXPORT_SYMBOL(address_space_init_once); void inode_init_once(struct inode *inode) { memset(inode, 0, sizeof(*inode)); - INIT_HLIST_NODE(&inode->i_hash); + INIT_HLIST_BL_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); @@ -506,14 +521,15 @@ static inline void inode_sb_list_del(struct inode *inode) } } -static unsigned long hash(struct super_block *sb, unsigned long hashval) +/* + * Ensure that we store the hash head in the inode when we insert the inode into + * the hlist_bl_head... + */ +static inline void +__insert_inode_hash_head(struct inode *inode, struct hlist_bl_head *b) { - unsigned long tmp; - - tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) / - L1_CACHE_BYTES; - tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> i_hash_shift); - return tmp & i_hash_mask; + hlist_bl_add_head_rcu(&inode->i_hash, b); + inode->i_hash_head = b; } /** @@ -526,13 +542,13 @@ static unsigned long hash(struct super_block *sb, unsigned long hashval) */ void __insert_inode_hash(struct inode *inode, unsigned long hashval) { - struct hlist_head *b = inode_hashtable + hash(inode->i_sb, hashval); + struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); - spin_lock(&inode_hash_lock); + hlist_bl_lock(b); spin_lock(&inode->i_lock); - hlist_add_head_rcu(&inode->i_hash, b); + __insert_inode_hash_head(inode, b); spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); } EXPORT_SYMBOL(__insert_inode_hash); @@ -544,11 +560,44 @@ EXPORT_SYMBOL(__insert_inode_hash); */ void __remove_inode_hash(struct inode *inode) { - spin_lock(&inode_hash_lock); - spin_lock(&inode->i_lock); - hlist_del_init_rcu(&inode->i_hash); - spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + struct hlist_bl_head *b = inode->i_hash_head; + + /* + * There are some callers that come through here without synchronisation + * and potentially with multiple references to the inode. Hence we have + * to handle the case that we might race with a remove and insert to a + * different list. Coda, in particular, seems to have a userspace API + * that can directly trigger "unhash/rehash to different list" behaviour + * without any serialisation at all. + * + * Hence we have to handle the situation where the inode->i_hash_head + * might point to a different list than what we expect, indicating that + * we raced with another unhash and potentially a new insertion. This + * means we have to retest the head once we have everything locked up + * and loop again if it doesn't match. + */ + while (b) { + hlist_bl_lock(b); + spin_lock(&inode->i_lock); + if (b != inode->i_hash_head) { + hlist_bl_unlock(b); + b = inode->i_hash_head; + spin_unlock(&inode->i_lock); + continue; + } + /* + * Need to set the pprev pointer to NULL after list removal so + * that both RCU traversals and hlist_bl_unhashed() work + * correctly at this point. + */ + hlist_bl_del_rcu(&inode->i_hash); + inode->i_hash.pprev = NULL; + inode->i_hash_head = NULL; + spin_unlock(&inode->i_lock); + hlist_bl_unlock(b); + break; + } + } EXPORT_SYMBOL(__remove_inode_hash); @@ -897,26 +946,28 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc) return freed; } -static void __wait_on_freeing_inode(struct inode *inode); +static void __wait_on_freeing_inode(struct hlist_bl_head *b, + struct inode *inode); /* * Called with the inode lock held. 
*/ static struct inode *find_inode(struct super_block *sb, - struct hlist_head *head, + struct hlist_bl_head *b, int (*test)(struct inode *, void *), void *data) { + struct hlist_bl_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, head, i_hash) { + hlist_bl_for_each_entry(inode, node, b, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode); + __wait_on_freeing_inode(b, inode); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { @@ -935,19 +986,20 @@ static struct inode *find_inode(struct super_block *sb, * iget_locked for details. */ static struct inode *find_inode_fast(struct super_block *sb, - struct hlist_head *head, unsigned long ino) + struct hlist_bl_head *b, unsigned long ino) { + struct hlist_bl_node *node; struct inode *inode = NULL; repeat: - hlist_for_each_entry(inode, head, i_hash) { + hlist_bl_for_each_entry(inode, node, b, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) continue; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE)) { - __wait_on_freeing_inode(inode); + __wait_on_freeing_inode(b, inode); goto repeat; } if (unlikely(inode->i_state & I_CREATING)) { @@ -1155,25 +1207,25 @@ EXPORT_SYMBOL(unlock_two_nondirectories); * return it locked, hashed, and with the I_NEW flag set. The file system gets * to fill it in before unlocking it via unlock_new_inode(). * - * Note both @test and @set are called with the inode_hash_lock held, so can't - * sleep. + * Note both @test and @set are called with the inode hash chain lock held, + * so can't sleep. */ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(inode->i_sb, hashval); + struct hlist_bl_head *b = i_hash_head(inode->i_sb, hashval); struct inode *old; again: - spin_lock(&inode_hash_lock); - old = find_inode(inode->i_sb, head, test, data); + hlist_bl_lock(b); + old = find_inode(inode->i_sb, b, test, data); if (unlikely(old)) { /* * Uhhuh, somebody else created the same inode under us. * Use the old inode instead of the preallocated one. 
*/ - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); if (IS_ERR(old)) return NULL; wait_on_inode(old); @@ -1195,7 +1247,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, */ spin_lock(&inode->i_lock); inode->i_state |= I_NEW; - hlist_add_head_rcu(&inode->i_hash, head); + __insert_inode_hash_head(inode, b); spin_unlock(&inode->i_lock); /* @@ -1205,7 +1257,7 @@ struct inode *inode_insert5(struct inode *inode, unsigned long hashval, if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); unlock: - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); return inode; } @@ -1266,12 +1318,12 @@ EXPORT_SYMBOL(iget5_locked); */ struct inode *iget_locked(struct super_block *sb, unsigned long ino) { - struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); struct inode *inode; again: - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino); - spin_unlock(&inode_hash_lock); + hlist_bl_lock(b); + inode = find_inode_fast(sb, b, ino); + hlist_bl_unlock(b); if (inode) { if (IS_ERR(inode)) return NULL; @@ -1287,17 +1339,17 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) if (inode) { struct inode *old; - spin_lock(&inode_hash_lock); + hlist_bl_lock(b); /* We released the lock, so.. */ - old = find_inode_fast(sb, head, ino); + old = find_inode_fast(sb, b, ino); if (!old) { inode->i_ino = ino; spin_lock(&inode->i_lock); inode->i_state = I_NEW; - hlist_add_head_rcu(&inode->i_hash, head); + __insert_inode_hash_head(inode, b); spin_unlock(&inode->i_lock); inode_sb_list_add(inode); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -1310,7 +1362,7 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino) * us. Use the old inode instead of the one we just * allocated. */ - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); destroy_inode(inode); if (IS_ERR(old)) return NULL; @@ -1334,10 +1386,11 @@ EXPORT_SYMBOL(iget_locked); */ static int test_inode_iunique(struct super_block *sb, unsigned long ino) { - struct hlist_head *b = inode_hashtable + hash(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); + struct hlist_bl_node *node; struct inode *inode; - hlist_for_each_entry_rcu(inode, b, i_hash) { + hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb) return 0; } @@ -1421,12 +1474,12 @@ EXPORT_SYMBOL(igrab); struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct hlist_bl_head *b = i_hash_head(sb, hashval); struct inode *inode; - spin_lock(&inode_hash_lock); - inode = find_inode(sb, head, test, data); - spin_unlock(&inode_hash_lock); + hlist_bl_lock(b); + inode = find_inode(sb, b, test, data); + hlist_bl_unlock(b); return IS_ERR(inode) ? 
NULL : inode; } @@ -1476,12 +1529,12 @@ EXPORT_SYMBOL(ilookup5); */ struct inode *ilookup(struct super_block *sb, unsigned long ino) { - struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); struct inode *inode; again: - spin_lock(&inode_hash_lock); - inode = find_inode_fast(sb, head, ino); - spin_unlock(&inode_hash_lock); + hlist_bl_lock(b); + inode = find_inode_fast(sb, b, ino); + hlist_bl_unlock(b); if (inode) { if (IS_ERR(inode)) @@ -1525,12 +1578,13 @@ struct inode *find_inode_nowait(struct super_block *sb, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct hlist_bl_head *b = i_hash_head(sb, hashval); + struct hlist_bl_node *node; struct inode *inode, *ret_inode = NULL; int mval; - spin_lock(&inode_hash_lock); - hlist_for_each_entry(inode, head, i_hash) { + hlist_bl_lock(b); + hlist_bl_for_each_entry(inode, node, b, i_hash) { if (inode->i_sb != sb) continue; mval = match(inode, hashval, data); @@ -1541,7 +1595,7 @@ struct inode *find_inode_nowait(struct super_block *sb, goto out; } out: - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); return ret_inode; } EXPORT_SYMBOL(find_inode_nowait); @@ -1570,13 +1624,14 @@ EXPORT_SYMBOL(find_inode_nowait); struct inode *find_inode_rcu(struct super_block *sb, unsigned long hashval, int (*test)(struct inode *, void *), void *data) { - struct hlist_head *head = inode_hashtable + hash(sb, hashval); + struct hlist_bl_head *b = i_hash_head(sb, hashval); + struct hlist_bl_node *node; struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_rcu() usage"); - hlist_for_each_entry_rcu(inode, head, i_hash) { + hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { if (inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE)) && test(inode, data)) @@ -1608,13 +1663,14 @@ EXPORT_SYMBOL(find_inode_rcu); struct inode *find_inode_by_ino_rcu(struct super_block *sb, unsigned long ino) { - struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); + struct hlist_bl_node *node; struct inode *inode; RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious find_inode_by_ino_rcu() usage"); - hlist_for_each_entry_rcu(inode, head, i_hash) { + hlist_bl_for_each_entry_rcu(inode, node, b, i_hash) { if (inode->i_ino == ino && inode->i_sb == sb && !(READ_ONCE(inode->i_state) & (I_FREEING | I_WILL_FREE))) @@ -1628,39 +1684,42 @@ int insert_inode_locked(struct inode *inode) { struct super_block *sb = inode->i_sb; ino_t ino = inode->i_ino; - struct hlist_head *head = inode_hashtable + hash(sb, ino); + struct hlist_bl_head *b = i_hash_head(sb, ino); while (1) { - struct inode *old = NULL; - spin_lock(&inode_hash_lock); - hlist_for_each_entry(old, head, i_hash) { - if (old->i_ino != ino) + struct hlist_bl_node *node; + struct inode *old = NULL, *t; + + hlist_bl_lock(b); + hlist_bl_for_each_entry(t, node, b, i_hash) { + if (t->i_ino != ino) continue; - if (old->i_sb != sb) + if (t->i_sb != sb) continue; - spin_lock(&old->i_lock); - if (old->i_state & (I_FREEING|I_WILL_FREE)) { - spin_unlock(&old->i_lock); + spin_lock(&t->i_lock); + if (t->i_state & (I_FREEING|I_WILL_FREE)) { + spin_unlock(&t->i_lock); continue; } + old = t; break; } if (likely(!old)) { spin_lock(&inode->i_lock); inode->i_state |= I_NEW | I_CREATING; - hlist_add_head_rcu(&inode->i_hash, head); + __insert_inode_hash_head(inode, b); spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); return 0; 
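One property worth noting in the conversions above: find_inode_rcu() and find_inode_by_ino_rcu() stay completely lockless because insertion and removal go through hlist_bl_add_head_rcu()/hlist_bl_del_rcu() and pprev is cleared on removal, so RCU readers still see a consistent chain. A sketch of the usual calling convention (hypothetical caller, error handling trimmed):

/* Hypothetical caller of the RCU-side lookup; not part of the patch. */
static struct inode *grab_inode_rcu(struct super_block *sb, unsigned long ino)
{
	struct inode *inode;

	rcu_read_lock();
	inode = find_inode_by_ino_rcu(sb, ino);
	/* igrab() takes i_lock and refuses inodes that are being freed */
	if (inode && !igrab(inode))
		inode = NULL;
	rcu_read_unlock();
	return inode;
}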
} if (unlikely(old->i_state & I_CREATING)) { spin_unlock(&old->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); return -EBUSY; } __iget(old); spin_unlock(&old->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); wait_on_inode(old); if (unlikely(!inode_unhashed(old))) { iput(old); @@ -2185,17 +2244,18 @@ EXPORT_SYMBOL(inode_needs_sync); * wake_up_bit(&inode->i_state, __I_NEW) after removing from the hash list * will DTRT. */ -static void __wait_on_freeing_inode(struct inode *inode) +static void __wait_on_freeing_inode(struct hlist_bl_head *b, + struct inode *inode) { wait_queue_head_t *wq; DEFINE_WAIT_BIT(wait, &inode->i_state, __I_NEW); wq = bit_waitqueue(&inode->i_state, __I_NEW); prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); spin_unlock(&inode->i_lock); - spin_unlock(&inode_hash_lock); + hlist_bl_unlock(b); schedule(); finish_wait(wq, &wait.wq_entry); - spin_lock(&inode_hash_lock); + hlist_bl_lock(b); } static __initdata unsigned long ihash_entries; @@ -2221,7 +2281,7 @@ void __init inode_init_early(void) inode_hashtable = alloc_large_system_hash("Inode-cache", - sizeof(struct hlist_head), + sizeof(struct hlist_bl_head), ihash_entries, 14, HASH_EARLY | HASH_ZERO, @@ -2247,7 +2307,7 @@ void __init inode_init(void) inode_hashtable = alloc_large_system_hash("Inode-cache", - sizeof(struct hlist_head), + sizeof(struct hlist_bl_head), ihash_entries, 14, HASH_ZERO, diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 063133ec7..13c40c09d 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -292,8 +292,12 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, gfp_t orig_gfp = gfp; unsigned int nr_vecs = DIV_ROUND_UP(length, PAGE_SIZE); - if (ctx->bio) - submit_bio(ctx->bio); + if (ctx->bio) { + if (iomap->flags & IOMAP_F_NOSUBMIT) + bio_endio(ctx->bio); + else + submit_bio(ctx->bio); + } if (ctx->rac) /* same as readahead_gfp_mask */ gfp |= __GFP_NORETRY | __GFP_NOWARN; @@ -346,7 +350,10 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) folio_set_error(folio); if (ctx.bio) { - submit_bio(ctx.bio); + if (iter.iomap.flags & IOMAP_F_NOSUBMIT) + bio_endio(ctx.bio); + else + submit_bio(ctx.bio); WARN_ON_ONCE(!ctx.cur_folio_in_bio); } else { WARN_ON_ONCE(ctx.cur_folio_in_bio); @@ -418,8 +425,12 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) while (iomap_iter(&iter, ops) > 0) iter.processed = iomap_readahead_iter(&iter, &ctx); - if (ctx.bio) - submit_bio(ctx.bio); + if (ctx.bio) { + if (iter.iomap.flags & IOMAP_F_NOSUBMIT) + bio_endio(ctx.bio); + else + submit_bio(ctx.bio); + } if (ctx.cur_folio) { if (!ctx.cur_folio_in_bio) folio_unlock(ctx.cur_folio); @@ -536,11 +547,17 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, { struct bio_vec bvec; struct bio bio; + int ret = 0; bio_init(&bio, iomap->bdev, &bvec, 1, REQ_OP_READ); bio.bi_iter.bi_sector = iomap_sector(iomap, block_start); bio_add_folio(&bio, folio, plen, poff); - return submit_bio_wait(&bio); + + if (iomap->flags & IOMAP_F_NOSUBMIT) + bio_endio(&bio); + else + ret = submit_bio_wait(&bio); + return ret; } static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, @@ -1486,7 +1503,10 @@ iomap_submit_ioend(struct iomap_writepage_ctx *wpc, struct iomap_ioend *ioend, return error; } - submit_bio(ioend->io_bio); + if (wpc->iomap.flags & IOMAP_F_NOSUBMIT) + bio_endio(ioend->io_bio); + else + submit_bio(ioend->io_bio); return 0; } @@ -1524,8 +1544,9 @@ 
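The iomap changes above repeat the same two-line branch -- complete the bio with bio_endio() when IOMAP_F_NOSUBMIT is set, otherwise submit_bio() -- in the readpage, read_folio, readahead, writeback-ioend and bio-chaining paths. If the pattern keeps spreading, a small helper would keep each call site to one line; this is only a suggested refactor, not something the patch contains:

/* Hypothetical helper, not part of this patch. */
static void iomap_submit_or_end_bio(const struct iomap *iomap, struct bio *bio)
{
	if (iomap->flags & IOMAP_F_NOSUBMIT)
		bio_endio(bio);		/* complete immediately, no device I/O */
	else
		submit_bio(bio);
}

The synchronous iomap_read_folio_sync() path would still need its own branch, since it relies on the return value of submit_bio_wait().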
iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, * traversal in iomap_finish_ioend(). */ static struct bio * -iomap_chain_bio(struct bio *prev) +iomap_chain_bio(struct iomap_writepage_ctx *wpc) { + struct bio *prev = wpc->ioend->io_bio; struct bio *new; new = bio_alloc(prev->bi_bdev, BIO_MAX_VECS, prev->bi_opf, GFP_NOFS); @@ -1534,7 +1555,11 @@ iomap_chain_bio(struct bio *prev) bio_chain(prev, new); bio_get(prev); /* for iomap_finish_ioend */ - submit_bio(prev); + + if (wpc->iomap.flags & IOMAP_F_NOSUBMIT) + bio_endio(prev); + else + submit_bio(prev); return new; } @@ -1581,7 +1606,7 @@ iomap_add_to_ioend(struct inode *inode, loff_t pos, struct folio *folio, } if (!bio_add_folio(wpc->ioend->io_bio, folio, len, poff)) { - wpc->ioend->io_bio = iomap_chain_bio(wpc->ioend->io_bio); + wpc->ioend->io_bio = iomap_chain_bio(wpc); bio_add_folio(wpc->ioend->io_bio, folio, len, poff); } diff --git a/fs/super.c b/fs/super.c index 04bc62ab7..a2decce02 100644 --- a/fs/super.c +++ b/fs/super.c @@ -791,14 +791,7 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -/** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ -struct super_block *get_super(struct block_device *bdev) +static struct super_block *__get_super(struct block_device *bdev, bool try) { struct super_block *sb; @@ -813,7 +806,12 @@ struct super_block *get_super(struct block_device *bdev) if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); + + if (!try) + down_read(&sb->s_umount); + else if (!down_read_trylock(&sb->s_umount)) + return NULL; + /* still alive? */ if (sb->s_root && (sb->s_flags & SB_BORN)) return sb; @@ -828,6 +826,30 @@ struct super_block *get_super(struct block_device *bdev) return NULL; } +/** + * get_super - get the superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. %NULL is returned if no match is found. + */ +struct super_block *get_super(struct block_device *bdev) +{ + return __get_super(bdev, false); +} + +/** + * try_get_super - get the superblock of a device, using trylock on sb->s_umount + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device given. %NULL is returned if no match is found. 
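try_get_super(), defined just below, differs from get_super() only in using down_read_trylock() on s_umount, so a caller has to treat NULL as meaning either "no matching superblock" or "s_umount currently contended". A hedged sketch of such a caller (hypothetical; the names are made up):

/* Hypothetical caller; not part of this patch. */
static void example_poke_super(struct block_device *bdev)
{
	struct super_block *sb = try_get_super(bdev);

	if (!sb)
		return;		/* no sb, or s_umount was contended: just skip */
	/* ... inspect or flush sb here ... */
	drop_super(sb);		/* releases s_umount and the s_count reference */
}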
+ */ +struct super_block *try_get_super(struct block_device *bdev) +{ + return __get_super(bdev, true); +} + /** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 18c8f168b..f0003446f 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -99,6 +99,9 @@ xfs_bmbt_to_iomap( struct xfs_mount *mp = ip->i_mount; struct xfs_buftarg *target = xfs_inode_buftarg(ip); + if (xfs_has_nodataio(mp)) + iomap_flags |= IOMAP_F_NOSUBMIT; + if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock))) return xfs_alert_fsblock_zero(ip, imap); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 6c09f8953..2733c5484 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -284,6 +284,7 @@ typedef struct xfs_mount { #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ /* Mount features */ +#define XFS_FEAT_NODATAIO (1ULL << 47) /* skip all data I/O */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ @@ -353,6 +354,7 @@ __XFS_HAS_FEAT(large_extent_counts, NREXT64) * bit inodes and read-only state, are kept as operational state rather than * features. */ +__XFS_HAS_FEAT(nodataio, NODATAIO) __XFS_HAS_FEAT(noattr2, NOATTR2) __XFS_HAS_FEAT(noalign, NOALIGN) __XFS_HAS_FEAT(allocsize, ALLOCSIZE) diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 4120bd1cb..83a0a043b 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -121,7 +121,7 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_nodataio, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -166,6 +166,7 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_flag("nodataio", Opt_nodataio), {} }; @@ -1376,6 +1377,9 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; + case Opt_nodataio: + parsing_mp->m_features |= XFS_FEAT_NODATAIO; + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h new file mode 100644 index 000000000..16fbf74ed --- /dev/null +++ b/include/asm-generic/codetag.lds.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef __ASM_GENERIC_CODETAG_LDS_H +#define __ASM_GENERIC_CODETAG_LDS_H + +#define SECTION_WITH_BOUNDARIES(_name) \ + . 
= ALIGN(8); \ + __start_##_name = .; \ + KEEP(*(_name)) \ + __stop_##_name = .; + +#define CODETAG_SECTIONS() \ + SECTION_WITH_BOUNDARIES(alloc_tags) \ + SECTION_WITH_BOUNDARIES(dynamic_fault_tags) + +#endif /* __ASM_GENERIC_CODETAG_LDS_H */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index da9e5629e..47dd57ca7 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -50,6 +50,8 @@ * [__nosave_begin, __nosave_end] for the nosave data */ +#include + #ifndef LOAD_OFFSET #define LOAD_OFFSET 0 #endif @@ -374,6 +376,7 @@ . = ALIGN(8); \ BOUNDED_SECTION_BY(__dyndbg_classes, ___dyndbg_classes) \ BOUNDED_SECTION_BY(__dyndbg, ___dyndbg) \ + CODETAG_SECTIONS() \ LIKELY_PROFILE() \ BRANCH_PROFILE() \ TRACE_PRINTKS() \ diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h new file mode 100644 index 000000000..6c1b7e1dc --- /dev/null +++ b/include/linux/alloc_tag.h @@ -0,0 +1,160 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * allocation tagging + */ +#ifndef _LINUX_ALLOC_TAG_H +#define _LINUX_ALLOC_TAG_H + +#include +#include +#include +#include +#include +#include +#include + +/* + * An instance of this structure is created in a special ELF section at every + * allocation callsite. At runtime, the special section is treated as + * an array of these. Embedded codetag utilizes codetag framework. + */ +struct alloc_tag { + struct codetag ct; + u64 __percpu *bytes_allocated; +} __aligned(8); + +#ifdef CONFIG_MEM_ALLOC_PROFILING + +void alloc_tags_show_mem_report(struct seq_buf *s); + +static inline struct alloc_tag *ct_to_alloc_tag(struct codetag *ct) +{ + return container_of(ct, struct alloc_tag, ct); +} + +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) \ + static struct alloc_tag _alloc_tag __used __aligned(8) \ + __section("alloc_tags") = { .ct = CODE_TAG_INIT }; \ + struct alloc_tag * __maybe_unused _old = alloc_tag_save(&_alloc_tag) + +DECLARE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, + mem_alloc_profiling_key); + +static inline bool mem_alloc_profiling_enabled(void) +{ + return static_branch_maybe(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, + &mem_alloc_profiling_key); +} + +static inline u64 alloc_tag_read(struct alloc_tag *tag) +{ + u64 v = 0; + int cpu; + + for_each_possible_cpu(cpu) + v += *per_cpu_ptr(tag->bytes_allocated, cpu); + + return v; +} + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + +#define CODETAG_EMPTY (void *)1 + +static inline bool is_codetag_empty(union codetag_ref *ref) +{ + return ref->ct == CODETAG_EMPTY; +} + +static inline void set_codetag_empty(union codetag_ref *ref) +{ + if (ref) + ref->ct = CODETAG_EMPTY; +} + +#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + +static inline bool is_codetag_empty(union codetag_ref *ref) { return false; } +static inline void set_codetag_empty(union codetag_ref *ref) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + +static inline void __alloc_tag_sub(union codetag_ref *ref, size_t bytes) +{ + struct alloc_tag *tag; + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN_ONCE(ref && !ref->ct, "alloc_tag was not set\n"); +#endif + if (!ref || !ref->ct) + return; + + if (is_codetag_empty(ref)) { + ref->ct = NULL; + return; + } + + tag = ct_to_alloc_tag(ref->ct); + + this_cpu_add(*tag->bytes_allocated, -bytes); + ref->ct = NULL; +} + +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) +{ + __alloc_tag_sub(ref, bytes); +} + +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) +{ + 
__alloc_tag_sub(ref, bytes); +} + +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, size_t bytes) +{ +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN_ONCE(ref && ref->ct, + "alloc_tag was not cleared (got tag for %s:%u)\n",\ + ref->ct->filename, ref->ct->lineno); + + WARN_ONCE(!tag, "current->alloc_tag not set"); +#endif + if (!ref || !tag) + return; + + ref->ct = &tag->ct; + this_cpu_add(*tag->bytes_allocated, bytes); +} + +#else + +#define DEFINE_ALLOC_TAG(_alloc_tag, _old) +static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {} +static inline void alloc_tag_sub_noalloc(union codetag_ref *ref, size_t bytes) {} +static inline void alloc_tag_add(union codetag_ref *ref, struct alloc_tag *tag, + size_t bytes) {} +static inline void set_codetag_empty(union codetag_ref *ref) {} + +#endif + +typedef struct mempool_s mempool_t; + +#define res_type_to_err(_res) _Generic((_res), \ + struct folio *: NULL, \ + struct page *: NULL, \ + mempool_t *: NULL, \ + void *: NULL, \ + unsigned long: 0, \ + int: -ENOMEM) + +#define alloc_hooks(_do_alloc) \ +({ \ + typeof(_do_alloc) _res; \ + DEFINE_ALLOC_TAG(_alloc_tag, _old); \ + \ + _res = !memory_fault() ? _do_alloc : res_type_to_err(_res); \ + alloc_tag_restore(&_alloc_tag, _old); \ + _res; \ +}) + +#endif /* _LINUX_ALLOC_TAG_H */ diff --git a/include/linux/bio.h b/include/linux/bio.h index b3e7529ff..f2620f8d1 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -484,7 +484,12 @@ extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, extern void bio_copy_data(struct bio *dst, struct bio *src); extern void bio_free_pages(struct bio *bio); void guard_bio_eod(struct bio *bio); -void zero_fill_bio(struct bio *bio); +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter iter); + +static inline void zero_fill_bio(struct bio *bio) +{ + zero_fill_bio_iter(bio, bio->bi_iter); +} static inline void bio_release_pages(struct bio *bio, bool mark_dirty) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c0ffe203a..7a32dc98e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -854,6 +854,7 @@ extern const char *blk_op_str(enum req_op op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); +const char *blk_status_to_str(blk_status_t status); /* only poll the hardware once, don't continue until a completion was found */ #define BLK_POLL_ONESHOT (1 << 0) diff --git a/drivers/md/bcache/closure.h b/include/linux/closure.h similarity index 93% rename from drivers/md/bcache/closure.h rename to include/linux/closure.h index c88cdc4ae..722a586bb 100644 --- a/drivers/md/bcache/closure.h +++ b/include/linux/closure.h @@ -155,7 +155,7 @@ struct closure { atomic_t remaining; -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#ifdef CONFIG_DEBUG_CLOSURES #define CLOSURE_MAGIC_DEAD 0xc054dead #define CLOSURE_MAGIC_ALIVE 0xc054a11e @@ -172,6 +172,11 @@ void __closure_wake_up(struct closure_waitlist *list); bool closure_wait(struct closure_waitlist *list, struct closure *cl); void __closure_sync(struct closure *cl); +static inline unsigned closure_nr_remaining(struct closure *cl) +{ + return atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK; +} + /** * closure_sync - sleep until a closure a closure has nothing left to wait on * @@ -180,19 +185,17 @@ void __closure_sync(struct closure *cl); */ static inline void closure_sync(struct closure *cl) { - if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1) + if (closure_nr_remaining(cl) 
!= 1) __closure_sync(cl); } -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#ifdef CONFIG_DEBUG_CLOSURES -void closure_debug_init(void); void closure_debug_create(struct closure *cl); void closure_debug_destroy(struct closure *cl); #else -static inline void closure_debug_init(void) {} static inline void closure_debug_create(struct closure *cl) {} static inline void closure_debug_destroy(struct closure *cl) {} @@ -200,21 +203,21 @@ static inline void closure_debug_destroy(struct closure *cl) {} static inline void closure_set_ip(struct closure *cl) { -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#ifdef CONFIG_DEBUG_CLOSURES cl->ip = _THIS_IP_; #endif } static inline void closure_set_ret_ip(struct closure *cl) { -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#ifdef CONFIG_DEBUG_CLOSURES cl->ip = _RET_IP_; #endif } static inline void closure_set_waiting(struct closure *cl, unsigned long f) { -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#ifdef CONFIG_DEBUG_CLOSURES cl->waiting_on = f; #endif } @@ -243,6 +246,7 @@ static inline void closure_queue(struct closure *cl) */ BUILD_BUG_ON(offsetof(struct closure, fn) != offsetof(struct work_struct, func)); + if (wq) { INIT_WORK(&cl->work, cl->work.func); BUG_ON(!queue_work(wq, &cl->work)); @@ -255,7 +259,7 @@ static inline void closure_queue(struct closure *cl) */ static inline void closure_get(struct closure *cl) { -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#ifdef CONFIG_DEBUG_CLOSURES BUG_ON((atomic_inc_return(&cl->remaining) & CLOSURE_REMAINING_MASK) <= 1); #else @@ -271,7 +275,7 @@ static inline void closure_get(struct closure *cl) */ static inline void closure_init(struct closure *cl, struct closure *parent) { - memset(cl, 0, sizeof(struct closure)); + cl->fn = NULL; cl->parent = parent; if (parent) closure_get(parent); @@ -375,4 +379,26 @@ static inline void closure_call(struct closure *cl, closure_fn fn, continue_at_nobarrier(cl, fn, wq); } +#define __closure_wait_event(waitlist, _cond) \ +do { \ + struct closure cl; \ + \ + closure_init_stack(&cl); \ + \ + while (1) { \ + closure_wait(waitlist, &cl); \ + if (_cond) \ + break; \ + closure_sync(&cl); \ + } \ + closure_wake_up(waitlist); \ + closure_sync(&cl); \ +} while (0) + +#define closure_wait_event(waitlist, _cond) \ +do { \ + if (!(_cond)) \ + __closure_wait_event(waitlist, _cond); \ +} while (0) + #endif /* _LINUX_CLOSURE_H */ diff --git a/include/linux/codetag.h b/include/linux/codetag.h new file mode 100644 index 000000000..87207f199 --- /dev/null +++ b/include/linux/codetag.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * code tagging framework + */ +#ifndef _LINUX_CODETAG_H +#define _LINUX_CODETAG_H + +#include + +struct codetag_iterator; +struct codetag_type; +struct seq_buf; +struct module; + +/* + * An instance of this structure is created in a special ELF section at every + * code location being tagged. At runtime, the special section is treated as + * an array of these. 
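The codetag framework introduced here is what both the allocation tags and the dynamic fault sites hang off: each tagged location contributes one struct codetag to a named ELF section, and consumers walk those sections through the iterator API. A sketch of such a consumer, using only functions declared in this header (the caller itself is hypothetical):

/* Hypothetical consumer of the iterator API declared below. */
static void dump_codetags(struct codetag_type *cttype)
{
	struct codetag_iterator iter = codetag_get_ct_iter(cttype);
	struct codetag *ct;

	codetag_lock_module_list(cttype, true);
	while ((ct = codetag_next_ct(&iter)))
		pr_info("%s:%u [%s] %s()\n",
			ct->filename, ct->lineno, ct->modname, ct->function);
	codetag_lock_module_list(cttype, false);
}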
+ */ +struct codetag { + unsigned int flags; /* used in later patches */ + unsigned int lineno; + const char *modname; + const char *function; + const char *filename; +} __aligned(8); + +union codetag_ref { + struct codetag *ct; +}; + +struct codetag_range { + struct codetag *start; + struct codetag *stop; +}; + +struct codetag_module { + struct module *mod; + struct codetag_range range; +}; + +struct codetag_type_desc { + const char *section; + size_t tag_size; + void (*module_load)(struct codetag_type *cttype, + struct codetag_module *cmod); + bool (*module_unload)(struct codetag_type *cttype, + struct codetag_module *cmod); +}; + +struct codetag_iterator { + struct codetag_type *cttype; + struct codetag_module *cmod; + unsigned long mod_id; + struct codetag *ct; +}; + +#define CODE_TAG_INIT { \ + .modname = KBUILD_MODNAME, \ + .function = __func__, \ + .filename = __FILE__, \ + .lineno = __LINE__, \ + .flags = 0, \ +} + +void codetag_lock_module_list(struct codetag_type *cttype, bool lock); +struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype); +struct codetag *codetag_next_ct(struct codetag_iterator *iter); + +void codetag_to_text(struct seq_buf *out, struct codetag *ct); + +struct codetag_type * +codetag_register_type(const struct codetag_type_desc *desc); + +#ifdef CONFIG_CODE_TAGGING +void codetag_load_module(struct module *mod); +bool codetag_unload_module(struct module *mod); +#else +static inline void codetag_load_module(struct module *mod) {} +static inline bool codetag_unload_module(struct module *mod) { return true; } +#endif + +/* Codetag query parsing */ + +struct codetag_query { + const char *filename; + const char *module; + const char *function; + const char *class; + unsigned int first_line, last_line; + unsigned int first_index, last_index; + unsigned int cur_index; + + bool match_line:1; + bool match_index:1; + + unsigned int set_enabled:1; + unsigned int enabled:2; + + unsigned int set_frequency:1; + unsigned int frequency; +}; + +char *codetag_query_parse(struct codetag_query *q, char *buf); +bool codetag_matches_query(struct codetag_query *q, + const struct codetag *ct, + const struct codetag_module *mod, + const char *class); + +#endif /* _LINUX_CODETAG_H */ diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6b351e009..3da2f0545 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -251,6 +251,7 @@ extern struct dentry * d_make_root(struct inode *); /* - the ramfs-type tree */ extern void d_genocide(struct dentry *); +extern void d_mark_tmpfile(struct file *, struct inode *); extern void d_tmpfile(struct file *, struct inode *); extern struct dentry *d_find_alias(struct inode *); diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index 31f114f48..d741940dc 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -27,7 +27,7 @@ struct dma_map_ops { unsigned long attrs); void (*free)(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, unsigned long attrs); - struct page *(*alloc_pages)(struct device *dev, size_t size, + struct page *(*alloc_pages_op)(struct device *dev, size_t size, dma_addr_t *dma_handle, enum dma_data_direction dir, gfp_t gfp); void (*free_pages)(struct device *dev, size_t size, struct page *vaddr, diff --git a/include/linux/dynamic_fault.h b/include/linux/dynamic_fault.h new file mode 100644 index 000000000..526a33209 --- /dev/null +++ b/include/linux/dynamic_fault.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef 
_LINUX_DYNAMIC_FAULT_H +#define _LINUX_DYNAMIC_FAULT_H + +/* + * Dynamic/code tagging fault injection: + * + * Originally based on the dynamic debug trick of putting types in a special elf + * section, then rewritten using code tagging: + * + * To use, simply insert a call to dynamic_fault("fault_class"), which will + * return true if an error should be injected. + * + * Fault injection sites may be listed and enabled via debugfs, under + * /sys/kernel/debug/dynamic_faults. + */ + +#ifdef CONFIG_CODETAG_FAULT_INJECTION + +#include +#include + +#define DFAULT_STATES() \ + x(disabled) \ + x(enabled) \ + x(oneshot) + +enum dfault_enabled { +#define x(n) DFAULT_##n, + DFAULT_STATES() +#undef x +}; + +union dfault_state { + struct { + unsigned int enabled:2; + unsigned int count:30; + }; + + struct { + unsigned int v; + }; +}; + +struct dfault { + struct codetag tag; + const char *class; + unsigned int frequency; + union dfault_state state; + struct static_key_false enabled; +}; + +bool __dynamic_fault_enabled(struct dfault *df); + +#define dynamic_fault(_class) \ +({ \ + static struct dfault \ + __used \ + __section("dynamic_fault_tags") \ + __aligned(8) df = { \ + .tag = CODE_TAG_INIT, \ + .class = _class, \ + .enabled = STATIC_KEY_FALSE_INIT, \ + }; \ + \ + static_key_false(&df.enabled.key) && \ + __dynamic_fault_enabled(&df); \ +}) + +#else + +#define dynamic_fault(_class) false + +#endif /* CODETAG_FAULT_INJECTION */ + +#define memory_fault() dynamic_fault("memory") + +#endif /* _LINUX_DYNAMIC_FAULT_H */ diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h index 9edb29101..4bf7c8466 100644 --- a/include/linux/exportfs.h +++ b/include/linux/exportfs.h @@ -98,6 +98,12 @@ enum fid_type { */ FILEID_FAT_WITH_PARENT = 0x72, + /* + * 64 bit inode number, 32 bit subvolume, 32 bit generation number: + */ + FILEID_BCACHEFS_WITHOUT_PARENT = 0x80, + FILEID_BCACHEFS_WITH_PARENT = 0x81, + /* * 128 bit child FID (struct lu_fid) * 128 bit parent FID (struct lu_fid) diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index c9de1f59e..6f36fff09 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -689,9 +689,9 @@ __FORTIFY_INLINE void *memchr_inv(const void * const POS0 p, int c, size_t size) return __real_memchr_inv(p, c, size); } -extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup) +extern void *__real_kmemdup(const void *src, size_t len, gfp_t gfp) __RENAME(kmemdup_noprof) __realloc_size(2); -__FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp) +__FORTIFY_INLINE void *kmemdup_noprof(const void * const POS0 p, size_t size, gfp_t gfp) { size_t p_size = __struct_size(p); @@ -701,6 +701,7 @@ __FORTIFY_INLINE void *kmemdup(const void * const POS0 p, size_t size, gfp_t gfp fortify_panic(__func__); return __real_kmemdup(p, size, gfp); } +#define kmemdup(...) 
alloc_hooks(kmemdup_noprof(__VA_ARGS__)) /** * strcpy - Copy a string into another string buffer diff --git a/include/linux/fs.h b/include/linux/fs.h index 133f0640f..f04872975 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -664,7 +664,8 @@ struct inode { unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; - struct hlist_node i_hash; + struct hlist_bl_node i_hash; + struct hlist_bl_head *i_hash_head; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ @@ -730,7 +731,7 @@ static inline unsigned int i_blocksize(const struct inode *node) static inline int inode_unhashed(struct inode *inode) { - return hlist_unhashed(&inode->i_hash); + return hlist_bl_unhashed(&inode->i_hash); } /* @@ -741,7 +742,7 @@ static inline int inode_unhashed(struct inode *inode) */ static inline void inode_fake_hash(struct inode *inode) { - hlist_add_fake(&inode->i_hash); + hlist_bl_add_fake(&inode->i_hash); } /* @@ -2699,11 +2700,7 @@ int setattr_should_drop_sgid(struct mnt_idmap *idmap, * This must be used for allocating filesystems specific inodes to set * up the inode reclaim context correctly. */ -static inline void * -alloc_inode_sb(struct super_block *sb, struct kmem_cache *cache, gfp_t gfp) -{ - return kmem_cache_alloc_lru(cache, &sb->s_inode_lru, gfp); -} +#define alloc_inode_sb(_sb, _cache, _gfp) kmem_cache_alloc_lru(_cache, &_sb->s_inode_lru, _gfp) extern void __insert_inode_hash(struct inode *, unsigned long hashval); static inline void insert_inode_hash(struct inode *inode) @@ -2714,7 +2711,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { - if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) + if (!inode_unhashed(inode) && !hlist_bl_fake(&inode->i_hash)) __remove_inode_hash(inode); } @@ -2897,6 +2894,7 @@ extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern struct super_block *get_super(struct block_device *); +extern struct super_block *try_get_super(struct block_device *); extern struct super_block *get_active_super(struct block_device *bdev); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 107613f7d..c74b73769 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -38,6 +38,7 @@ #include #include +#include #include #include #include @@ -116,6 +117,11 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_cast(_radix) (typeof((_radix)->type[0]) *) #define __genradix_obj_size(_radix) sizeof((_radix)->type[0]) +#define __genradix_objs_per_page(_radix) \ + (PAGE_SIZE / sizeof((_radix)->type[0])) +#define __genradix_page_remainder(_radix) \ + (PAGE_SIZE % sizeof((_radix)->type[0])) + #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) @@ -179,11 +185,35 @@ void *__genradix_iter_peek(struct genradix_iter *, struct __genradix *, size_t); #define genradix_iter_peek(_iter, _radix) \ (__genradix_cast(_radix) \ __genradix_iter_peek(_iter, &(_radix)->tree, \ - PAGE_SIZE / __genradix_obj_size(_radix))) + 
__genradix_objs_per_page(_radix))) + +void *__genradix_iter_peek_prev(struct genradix_iter *, struct __genradix *, + size_t, size_t); + +/** + * genradix_iter_peek - get first entry at or below iterator's current + * position + * @_iter: a genradix_iter + * @_radix: genradix being iterated over + * + * If no more entries exist at or below @_iter's current position, returns NULL + */ +#define genradix_iter_peek_prev(_iter, _radix) \ + (__genradix_cast(_radix) \ + __genradix_iter_peek_prev(_iter, &(_radix)->tree, \ + __genradix_objs_per_page(_radix), \ + __genradix_obj_size(_radix) + \ + __genradix_page_remainder(_radix))) static inline void __genradix_iter_advance(struct genradix_iter *iter, size_t obj_size) { + if (iter->offset + obj_size < iter->offset) { + iter->offset = SIZE_MAX; + iter->pos = SIZE_MAX; + return; + } + iter->offset += obj_size; if (!is_power_of_2(obj_size) && @@ -196,6 +226,25 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, #define genradix_iter_advance(_iter, _radix) \ __genradix_iter_advance(_iter, __genradix_obj_size(_radix)) +static inline void __genradix_iter_rewind(struct genradix_iter *iter, + size_t obj_size) +{ + if (iter->offset == 0 || + iter->offset == SIZE_MAX) { + iter->offset = SIZE_MAX; + return; + } + + if ((iter->offset & (PAGE_SIZE - 1)) == 0) + iter->offset -= PAGE_SIZE % obj_size; + + iter->offset -= obj_size; + iter->pos--; +} + +#define genradix_iter_rewind(_iter, _radix) \ + __genradix_iter_rewind(_iter, __genradix_obj_size(_radix)) + #define genradix_for_each_from(_radix, _iter, _p, _start) \ for (_iter = genradix_iter_init(_radix, _start); \ (_p = genradix_iter_peek(&_iter, _radix)) != NULL; \ @@ -213,6 +262,23 @@ static inline void __genradix_iter_advance(struct genradix_iter *iter, #define genradix_for_each(_radix, _iter, _p) \ genradix_for_each_from(_radix, _iter, _p, 0) +#define genradix_last_pos(_radix) \ + (SIZE_MAX / PAGE_SIZE * __genradix_objs_per_page(_radix) - 1) + +/** + * genradix_for_each_reverse - iterate over entry in a genradix, reverse order + * @_radix: genradix to iterate over + * @_iter: a genradix_iter to track current position + * @_p: pointer to genradix entry type + * + * On every iteration, @_p will point to the current entry, and @_iter.pos + * will be the current entry's index. + */ +#define genradix_for_each_reverse(_radix, _iter, _p) \ + for (_iter = genradix_iter_init(_radix, genradix_last_pos(_radix));\ + (_p = genradix_iter_peek_prev(&_iter, _radix)) != NULL;\ + genradix_iter_rewind(&_iter, _radix)) + int __genradix_prealloc(struct __genradix *, size_t, gfp_t); /** diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ed8cb537c..495745c99 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -6,6 +6,8 @@ #include #include +#include +#include struct vm_area_struct; @@ -174,42 +176,43 @@ static inline void arch_free_page(struct page *page, int order) { } static inline void arch_alloc_page(struct page *page, int order) { } #endif -struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, +struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); -struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, +#define __alloc_pages(...) alloc_hooks(__alloc_pages_noprof(__VA_ARGS__)) + +struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask); +#define __folio_alloc(...) 
alloc_hooks(__folio_alloc_noprof(__VA_ARGS__)) -unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, +unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct list_head *page_list, struct page **page_array); +#define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__)) -unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, +unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array); +#define alloc_pages_bulk_array_mempolicy(...) alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__)) /* Bulk allocate order-0 pages */ -static inline unsigned long -alloc_pages_bulk_list(gfp_t gfp, unsigned long nr_pages, struct list_head *list) -{ - return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, list, NULL); -} +#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \ + __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL) -static inline unsigned long -alloc_pages_bulk_array(gfp_t gfp, unsigned long nr_pages, struct page **page_array) -{ - return __alloc_pages_bulk(gfp, numa_mem_id(), NULL, nr_pages, NULL, page_array); -} +#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \ + __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array) static inline unsigned long -alloc_pages_bulk_array_node(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) +alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages, struct page **page_array) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - return __alloc_pages_bulk(gfp, nid, NULL, nr_pages, NULL, page_array); + return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array); } +#define alloc_pages_bulk_array_node(...) alloc_hooks(alloc_pages_bulk_array_node_noprof(__VA_ARGS__)) + static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) { gfp_t warn_gfp = gfp_mask & (__GFP_THISNODE|__GFP_NOWARN); @@ -229,21 +232,23 @@ static inline void warn_if_node_offline(int this_node, gfp_t gfp_mask) * online. For more general interface, see alloc_pages_node(). */ static inline struct page * -__alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +__alloc_pages_node_noprof(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp_mask); - return __alloc_pages(gfp_mask, order, nid, NULL); + return __alloc_pages_noprof(gfp_mask, order, nid, NULL); } +#define __alloc_pages_node(...) alloc_hooks(__alloc_pages_node_noprof(__VA_ARGS__)) + static inline struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); warn_if_node_offline(nid, gfp); - return __folio_alloc(gfp, order, nid, NULL); + return __folio_alloc_noprof(gfp, order, nid, NULL); } /* @@ -251,53 +256,69 @@ struct folio *__folio_alloc_node(gfp_t gfp, unsigned int order, int nid) * prefer the current CPU's closest node. Otherwise node must be valid and * online. */ -static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, - unsigned int order) +static inline struct page *alloc_pages_node_noprof(int nid, gfp_t gfp_mask, + unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); - return __alloc_pages_node(nid, gfp_mask, order); + return __alloc_pages_node_noprof(nid, gfp_mask, order); } +#define alloc_pages_node(...) 
alloc_hooks(alloc_pages_node_noprof(__VA_ARGS__)) + #ifdef CONFIG_NUMA -struct page *alloc_pages(gfp_t gfp, unsigned int order); -struct folio *folio_alloc(gfp_t gfp, unsigned order); -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, +struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); +struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); +struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else -static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) +static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { - return alloc_pages_node(numa_node_id(), gfp_mask, order); + return alloc_pages_node_noprof(numa_node_id(), gfp_mask, order); } -static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) +static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } -#define vma_alloc_folio(gfp, order, vma, addr, hugepage) \ - folio_alloc(gfp, order) +#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ + folio_alloc_noprof(gfp, order) #endif + +#define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) +#define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) +#define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) + #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -static inline struct page *alloc_page_vma(gfp_t gfp, + +static inline struct page *alloc_page_vma_noprof(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false); + struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false); return &folio->page; } +#define alloc_page_vma(...) alloc_hooks(alloc_page_vma_noprof(__VA_ARGS__)) + +extern unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order); +#define __get_free_pages(...) alloc_hooks(get_free_pages_noprof(__VA_ARGS__)) -extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); -extern unsigned long get_zeroed_page(gfp_t gfp_mask); +extern unsigned long get_zeroed_page_noprof(gfp_t gfp_mask); +#define get_zeroed_page(...) alloc_hooks(get_zeroed_page_noprof(__VA_ARGS__)) + +void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) __alloc_size(1); +#define alloc_pages_exact(...) alloc_hooks(alloc_pages_exact_noprof(__VA_ARGS__)) -void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __alloc_size(1); void free_pages_exact(void *virt, size_t size); -__meminit void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); -#define __get_free_page(gfp_mask) \ - __get_free_pages((gfp_mask), 0) +__meminit void *alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) __alloc_size(2); +#define alloc_pages_exact_nid(...) alloc_hooks(alloc_pages_exact_nid_noprof(__VA_ARGS__)) + +#define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask), 0) -#define __get_dma_pages(gfp_mask, order) \ - __get_free_pages((gfp_mask) | GFP_DMA, (order)) +#define __get_dma_pages(gfp_mask, order) \ + __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); @@ -354,10 +375,14 @@ static inline bool pm_suspended_storage(void) #ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. 
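The conversion pattern running through this gfp.h hunk is mechanical: each allocator entry point is renamed to a *_noprof() variant and the old name becomes an alloc_hooks() wrapper, so existing callers compile unchanged while every call site gains its own static alloc_tag. Spelled out for one call site -- an illustrative expansion that follows the alloc_hooks() and DEFINE_ALLOC_TAG() definitions earlier in the patch, with the attribute noise trimmed:

/* Illustrative expansion only; attributes (__used, __aligned, __maybe_unused) trimmed. */
static struct page *example_callsite(void)
{
	/* the caller still just writes: return alloc_pages(GFP_KERNEL, 0); */
	return ({
		struct page *_res;
		static struct alloc_tag _alloc_tag __section("alloc_tags") = {
			.ct = CODE_TAG_INIT,
		};
		struct alloc_tag *_old = alloc_tag_save(&_alloc_tag);

		_res = !memory_fault() ? alloc_pages_noprof(GFP_KERNEL, 0)
				       : NULL;	/* res_type_to_err() for a page pointer */
		alloc_tag_restore(&_alloc_tag, _old);
		_res;
	});
}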
*/ -extern int alloc_contig_range(unsigned long start, unsigned long end, +extern int alloc_contig_range_noprof(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); -extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask); +#define alloc_contig_range(...) alloc_hooks(alloc_contig_range_noprof(__VA_ARGS__)) + +extern struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask); +#define alloc_contig_pages(...) alloc_hooks(alloc_contig_pages_noprof(__VA_ARGS__)) + #endif void free_contig_range(unsigned long pfn, unsigned long nr_pages); diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h index 6583a5867..1c6573d69 100644 --- a/include/linux/gfp_types.h +++ b/include/linux/gfp_types.h @@ -21,44 +21,86 @@ typedef unsigned int __bitwise gfp_t; * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c */ +enum { + ___GFP_DMA_BIT, + ___GFP_HIGHMEM_BIT, + ___GFP_DMA32_BIT, + ___GFP_MOVABLE_BIT, + ___GFP_RECLAIMABLE_BIT, + ___GFP_HIGH_BIT, + ___GFP_IO_BIT, + ___GFP_FS_BIT, + ___GFP_ZERO_BIT, + ___GFP_UNUSED_BIT, /* 0x200u unused */ + ___GFP_DIRECT_RECLAIM_BIT, + ___GFP_KSWAPD_RECLAIM_BIT, + ___GFP_WRITE_BIT, + ___GFP_NOWARN_BIT, + ___GFP_RETRY_MAYFAIL_BIT, + ___GFP_NOFAIL_BIT, + ___GFP_NORETRY_BIT, + ___GFP_MEMALLOC_BIT, + ___GFP_COMP_BIT, + ___GFP_NOMEMALLOC_BIT, + ___GFP_HARDWALL_BIT, + ___GFP_THISNODE_BIT, + ___GFP_ACCOUNT_BIT, + ___GFP_ZEROTAGS_BIT, +#ifdef CONFIG_KASAN_HW_TAGS + ___GFP_SKIP_ZERO_BIT, + ___GFP_SKIP_KASAN_BIT, +#endif +#ifdef CONFIG_LOCKDEP + ___GFP_NOLOCKDEP_BIT, +#endif +#ifdef CONFIG_SLAB_OBJ_EXT + ___GFP_NO_OBJ_EXT_BIT, +#endif + ___GFP_LAST_BIT +}; + /* Plain integer GFP bitmasks. Do not use this directly. 
*/ -#define ___GFP_DMA 0x01u -#define ___GFP_HIGHMEM 0x02u -#define ___GFP_DMA32 0x04u -#define ___GFP_MOVABLE 0x08u -#define ___GFP_RECLAIMABLE 0x10u -#define ___GFP_HIGH 0x20u -#define ___GFP_IO 0x40u -#define ___GFP_FS 0x80u -#define ___GFP_ZERO 0x100u +#define ___GFP_DMA BIT(___GFP_DMA_BIT) +#define ___GFP_HIGHMEM BIT(___GFP_HIGHMEM_BIT) +#define ___GFP_DMA32 BIT(___GFP_DMA32_BIT) +#define ___GFP_MOVABLE BIT(___GFP_MOVABLE_BIT) +#define ___GFP_RECLAIMABLE BIT(___GFP_RECLAIMABLE_BIT) +#define ___GFP_HIGH BIT(___GFP_HIGH_BIT) +#define ___GFP_IO BIT(___GFP_IO_BIT) +#define ___GFP_FS BIT(___GFP_FS_BIT) +#define ___GFP_ZERO BIT(___GFP_ZERO_BIT) /* 0x200u unused */ -#define ___GFP_DIRECT_RECLAIM 0x400u -#define ___GFP_KSWAPD_RECLAIM 0x800u -#define ___GFP_WRITE 0x1000u -#define ___GFP_NOWARN 0x2000u -#define ___GFP_RETRY_MAYFAIL 0x4000u -#define ___GFP_NOFAIL 0x8000u -#define ___GFP_NORETRY 0x10000u -#define ___GFP_MEMALLOC 0x20000u -#define ___GFP_COMP 0x40000u -#define ___GFP_NOMEMALLOC 0x80000u -#define ___GFP_HARDWALL 0x100000u -#define ___GFP_THISNODE 0x200000u -#define ___GFP_ACCOUNT 0x400000u -#define ___GFP_ZEROTAGS 0x800000u +#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT) +#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT) +#define ___GFP_WRITE BIT(___GFP_WRITE_BIT) +#define ___GFP_NOWARN BIT(___GFP_NOWARN_BIT) +#define ___GFP_RETRY_MAYFAIL BIT(___GFP_RETRY_MAYFAIL_BIT) +#define ___GFP_NOFAIL BIT(___GFP_NOFAIL_BIT) +#define ___GFP_NORETRY BIT(___GFP_NORETRY_BIT) +#define ___GFP_MEMALLOC BIT(___GFP_MEMALLOC_BIT) +#define ___GFP_COMP BIT(___GFP_COMP_BIT) +#define ___GFP_NOMEMALLOC BIT(___GFP_NOMEMALLOC_BIT) +#define ___GFP_HARDWALL BIT(___GFP_HARDWALL_BIT) +#define ___GFP_THISNODE BIT(___GFP_THISNODE_BIT) +#define ___GFP_ACCOUNT BIT(___GFP_ACCOUNT_BIT) +#define ___GFP_ZEROTAGS BIT(___GFP_ZEROTAGS_BIT) #ifdef CONFIG_KASAN_HW_TAGS -#define ___GFP_SKIP_ZERO 0x1000000u -#define ___GFP_SKIP_KASAN 0x2000000u +#define ___GFP_SKIP_ZERO BIT(___GFP_SKIP_ZERO_BIT) +#define ___GFP_SKIP_KASAN BIT(___GFP_SKIP_KASAN_BIT) #else #define ___GFP_SKIP_ZERO 0 #define ___GFP_SKIP_KASAN 0 #endif #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x4000000u +#define ___GFP_NOLOCKDEP BIT(___GFP_NOLOCKDEP_BIT) #else #define ___GFP_NOLOCKDEP 0 #endif -/* If the above are modified, __GFP_BITS_SHIFT may need updating */ +#ifdef CONFIG_SLAB_OBJ_EXT +#define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) +#else +#define ___GFP_NO_OBJ_EXT 0 +#endif /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) @@ -99,12 +141,15 @@ typedef unsigned int __bitwise gfp_t; * node with no fallbacks or placement policy enforcements. * * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. + * + * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension. 
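The switch from literal hex masks to BIT(___GFP_*_BIT) is value-preserving, because the enumerators above are declared in exactly the historical bit order (including the hole left at bit 9); that is also what lets __GFP_BITS_SHIFT further down collapse to ___GFP_LAST_BIT instead of being maintained by hand. A couple of illustrative compile-time checks, not part of the patch:

/* Illustrative only; not part of the patch. */
static_assert(___GFP_DMA      == 0x01u,     "___GFP_DMA changed value");
static_assert(___GFP_ZERO     == 0x100u,    "___GFP_ZERO changed value");
static_assert(___GFP_ZEROTAGS == 0x800000u, "___GFP_ZEROTAGS changed value");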
*/ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) #define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) +#define __GFP_NO_OBJ_EXT ((__force gfp_t)___GFP_NO_OBJ_EXT) /** * DOC: Watermark modifiers @@ -249,7 +294,7 @@ typedef unsigned int __bitwise gfp_t; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (26 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT ___GFP_LAST_BIT #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 0ee140176..e67349e84 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/include/linux/iomap.h b/include/linux/iomap.h index e2b836c2e..a774d074b 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -60,6 +60,7 @@ struct vm_fault; #define IOMAP_F_MERGED (1U << 3) #define IOMAP_F_BUFFER_HEAD (1U << 4) #define IOMAP_F_XATTR (1U << 5) +#define IOMAP_F_NOSUBMIT (1U << 6) /* * Flags set by the core iomap code during operations: diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h index ae1b54144..8ee2bf5af 100644 --- a/include/linux/list_bl.h +++ b/include/linux/list_bl.h @@ -143,6 +143,28 @@ static inline void hlist_bl_del_init(struct hlist_bl_node *n) } } +/** + * hlist_bl_add_fake - create a fake list consisting of a single headless node + * @n: Node to make a fake list out of + * + * This makes @n appear to be its own predecessor on a headless hlist. + * The point of this is to allow things like hlist_bl_del() to work correctly + * in cases where there is no list. + */ +static inline void hlist_bl_add_fake(struct hlist_bl_node *n) +{ + n->pprev = &n->next; +} + +/** + * hlist_fake: Is this node a fake hlist_bl? + * @h: Node to check for being a self-referential fake hlist. + */ +static inline bool hlist_bl_fake(struct hlist_bl_node *n) +{ + return n->pprev == &n->next; +} + static inline void hlist_bl_lock(struct hlist_bl_head *b) { bit_spin_lock(0, (unsigned long *)b); diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 74bd269a8..3bb30499d 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -344,6 +344,8 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) +int lock_class_is_held(struct lock_class_key *key); + /* * Must use lock_map_aquire_try() with override maps to avoid * lockdep thinking they participate in the block chain. 
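The hlist_bl_add_fake()/hlist_bl_fake() helpers added to list_bl.h above deliberately mirror the existing hlist_add_fake()/hlist_fake() pair; they are what allow inode_fake_hash() and remove_inode_hash() in fs.h to keep their old behaviour once i_hash becomes an hlist_bl_node. A minimal illustration (hypothetical standalone use):

/* Hypothetical standalone use of the new helpers. */
static void hlist_bl_fake_demo(void)
{
	struct hlist_bl_node n = {};

	hlist_bl_add_fake(&n);		/* n.pprev = &n.next: self-referential */
	WARN_ON(hlist_bl_unhashed(&n));	/* now reports "hashed" without any real list */
	WARN_ON(!hlist_bl_fake(&n));	/* and is recognisably fake, so removal skips it */
}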
@@ -442,6 +444,8 @@ extern int lockdep_is_held(const void *); #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) +static inline int lock_class_is_held(struct lock_class_key *key) { return 0; } + #define DEFINE_WAIT_OVERRIDE_MAP(_name, _wait_type) \ struct lockdep_map __maybe_unused _name = {} @@ -681,4 +685,10 @@ lockdep_rcu_suspicious(const char *file, const int line, const char *s) } #endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void lockdep_set_no_check_recursion(struct lockdep_map *map); +#else +static inline void lockdep_set_no_check_recursion(struct lockdep_map *map) {} +#endif + #endif /* __LINUX_LOCKDEP_H */ diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 59f4fb162..f90c779e4 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -129,7 +129,7 @@ struct lock_class { u8 wait_type_inner; u8 wait_type_outer; u8 lock_type; - /* u8 hole; */ + u8 no_check_recursion; #ifdef CONFIG_LOCK_STAT unsigned long contention_point[LOCKSTAT_POINTS]; diff --git a/include/linux/mean_and_variance.h b/include/linux/mean_and_variance.h new file mode 100644 index 000000000..647505010 --- /dev/null +++ b/include/linux/mean_and_variance.h @@ -0,0 +1,198 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef MEAN_AND_VARIANCE_H_ +#define MEAN_AND_VARIANCE_H_ + +#include +#include +#include +#include + +#define SQRT_U64_MAX 4294967295ULL + +/* + * u128_u: u128 user mode, because not all architectures support a real int128 + * type + */ + +#ifdef __SIZEOF_INT128__ + +typedef struct { + unsigned __int128 v; +} __aligned(16) u128_u; + +static inline u128_u u64_to_u128(u64 a) +{ + return (u128_u) { .v = a }; +} + +static inline u64 u128_lo(u128_u a) +{ + return a.v; +} + +static inline u64 u128_hi(u128_u a) +{ + return a.v >> 64; +} + +static inline u128_u u128_add(u128_u a, u128_u b) +{ + a.v += b.v; + return a; +} + +static inline u128_u u128_sub(u128_u a, u128_u b) +{ + a.v -= b.v; + return a; +} + +static inline u128_u u128_shl(u128_u a, s8 shift) +{ + a.v <<= shift; + return a; +} + +static inline u128_u u128_square(u64 a) +{ + u128_u b = u64_to_u128(a); + + b.v *= b.v; + return b; +} + +#else + +typedef struct { + u64 hi, lo; +} __aligned(16) u128_u; + +/* conversions */ + +static inline u128_u u64_to_u128(u64 a) +{ + return (u128_u) { .lo = a }; +} + +static inline u64 u128_lo(u128_u a) +{ + return a.lo; +} + +static inline u64 u128_hi(u128_u a) +{ + return a.hi; +} + +/* arithmetic */ + +static inline u128_u u128_add(u128_u a, u128_u b) +{ + u128_u c; + + c.lo = a.lo + b.lo; + c.hi = a.hi + b.hi + (c.lo < a.lo); + return c; +} + +static inline u128_u u128_sub(u128_u a, u128_u b) +{ + u128_u c; + + c.lo = a.lo - b.lo; + c.hi = a.hi - b.hi - (c.lo > a.lo); + return c; +} + +static inline u128_u u128_shl(u128_u i, s8 shift) +{ + u128_u r; + + r.lo = i.lo << shift; + if (shift < 64) + r.hi = (i.hi << shift) | (i.lo >> (64 - shift)); + else { + r.hi = i.lo << (shift - 64); + r.lo = 0; + } + return r; +} + +static inline u128_u u128_square(u64 i) +{ + u128_u r; + u64 h = i >> 32, l = i & U32_MAX; + + r = u128_shl(u64_to_u128(h*h), 64); + r = u128_add(r, u128_shl(u64_to_u128(h*l), 32)); + r = u128_add(r, u128_shl(u64_to_u128(l*h), 32)); + r = u128_add(r, u64_to_u128(l*l)); + return r; +} + +#endif + +static inline u128_u u64s_to_u128(u64 hi, u64 lo) +{ + u128_u c = u64_to_u128(hi); + + c = u128_shl(c, 64); + c = u128_add(c, u64_to_u128(lo)); + return c; +} + +u128_u 
u128_div(u128_u n, u64 d); +struct mean_and_variance { + s64 n; + s64 sum; + u128_u sum_squares; +}; + +/* exponentially weighted variant */ +struct mean_and_variance_weighted { + bool init; + u8 weight; /* base 2 logarithm */ + s64 mean; + u64 variance; +}; + +/** + * fast_divpow2() - fast approximation for n / (1 << d) + * @n: numerator + * @d: the power of 2 denominator. + * + * note: this rounds towards 0. + */ +static inline s64 fast_divpow2(s64 n, u8 d) +{ + return (n + ((n < 0) ? ((1 << d) - 1) : 0)) >> d; +} + +/** + * mean_and_variance_update() - update a mean_and_variance struct @s with a new sample @v + * @s: the mean_and_variance to update. + * @v: the new sample. + * + * see linked pdf equation 12. + */ +static inline void +mean_and_variance_update(struct mean_and_variance *s, s64 v) +{ + s->n++; + s->sum += v; + s->sum_squares = u128_add(s->sum_squares, u128_square(abs(v))); +} + +s64 mean_and_variance_get_mean(struct mean_and_variance s); +u64 mean_and_variance_get_variance(struct mean_and_variance s1); +u32 mean_and_variance_get_stddev(struct mean_and_variance s); + +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 v); + +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s); +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s); +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s); + +#endif // MEAN_AND_VARIANCE_H_ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 222d73701..3eb8975c1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -339,15 +339,32 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; enum page_memcg_data_flags { - /* page->memcg_data is a pointer to an objcgs vector */ - MEMCG_DATA_OBJCGS = (1UL << 0), + /* page->memcg_data is a pointer to a slabobj_ext vector */ + MEMCG_DATA_OBJEXTS = (1UL << 0), /* page has been accounted as a non-slab kernel page */ MEMCG_DATA_KMEM = (1UL << 1), /* the next bit after the last actual flag */ __NR_MEMCG_DATA_FLAGS = (1UL << 2), }; -#define MEMCG_DATA_FLAGS_MASK (__NR_MEMCG_DATA_FLAGS - 1) +#define __FIRST_OBJEXT_FLAG __NR_MEMCG_DATA_FLAGS + +#else /* CONFIG_MEMCG */ + +#define __FIRST_OBJEXT_FLAG (1UL << 0) + +#endif /* CONFIG_MEMCG */ + +enum objext_flags { + /* slabobj_ext vector failed to allocate */ + OBJEXTS_ALLOC_FAIL = __FIRST_OBJEXT_FLAG, + /* the next bit after the last actual flag */ + __NR_OBJEXTS_FLAGS = (__FIRST_OBJEXT_FLAG << 1), +}; + +#define OBJEXTS_FLAGS_MASK (__NR_OBJEXTS_FLAGS - 1) + +#ifdef CONFIG_MEMCG static inline bool folio_memcg_kmem(struct folio *folio); @@ -378,10 +395,10 @@ static inline struct mem_cgroup *__folio_memcg(struct folio *folio) unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_KMEM, folio); - return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* @@ -399,10 +416,10 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) unsigned long memcg_data = folio->memcg_data; VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJCGS, folio); + VM_BUG_ON_FOLIO(memcg_data & MEMCG_DATA_OBJEXTS, folio); VM_BUG_ON_FOLIO(!(memcg_data & MEMCG_DATA_KMEM), folio); -
return (struct obj_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return (struct obj_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* @@ -459,11 +476,11 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; - objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); return obj_cgroup_memcg(objcg); } - return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } /* @@ -496,17 +513,17 @@ static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) */ unsigned long memcg_data = READ_ONCE(folio->memcg_data); - if (memcg_data & MEMCG_DATA_OBJCGS) + if (memcg_data & MEMCG_DATA_OBJEXTS) return NULL; if (memcg_data & MEMCG_DATA_KMEM) { struct obj_cgroup *objcg; - objcg = (void *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); return obj_cgroup_memcg(objcg); } - return (struct mem_cgroup *)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); + return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); } static inline struct mem_cgroup *page_memcg_check(struct page *page) @@ -542,7 +559,7 @@ static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *ob static inline bool folio_memcg_kmem(struct folio *folio) { VM_BUG_ON_PGFLAGS(PageTail(&folio->page), &folio->page); - VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJCGS, folio); + VM_BUG_ON_FOLIO(folio->memcg_data & MEMCG_DATA_OBJEXTS, folio); return folio->memcg_data & MEMCG_DATA_KMEM; } @@ -1606,6 +1623,19 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, } #endif /* CONFIG_MEMCG */ +/* + * Extended information for slab objects stored as an array in page->memcg_data + * if MEMCG_DATA_OBJEXTS is set. + */ +struct slabobj_ext { +#ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *objcg; +#endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + union codetag_ref ref; +#endif +} __aligned(8); + static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { __mod_lruvec_kmem_state(p, idx, 1); diff --git a/include/linux/mempool.h b/include/linux/mempool.h index 4aae6c06c..9fa126aa1 100644 --- a/include/linux/mempool.h +++ b/include/linux/mempool.h @@ -5,6 +5,8 @@ #ifndef _LINUX_MEMPOOL_H #define _LINUX_MEMPOOL_H +#include +#include #include #include @@ -39,18 +41,32 @@ void mempool_exit(mempool_t *pool); int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id); -int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + +int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data); +#define mempool_init(...) \ + alloc_hooks(mempool_init_noprof(__VA_ARGS__)) extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data); -extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, + +extern mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int nid); +#define mempool_create_node(...) 
\ + alloc_hooks(mempool_create_node_noprof(__VA_ARGS__)) + +#define mempool_create(_min_nr, _alloc_fn, _free_fn, _pool_data) \ + mempool_create_node(_min_nr, _alloc_fn, _free_fn, _pool_data, \ + GFP_KERNEL, NUMA_NO_NODE) extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); -extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; + +extern void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) __malloc; +#define mempool_alloc(...) \ + alloc_hooks(mempool_alloc_noprof(__VA_ARGS__)) + extern void mempool_free(void *element, mempool_t *pool); /* @@ -61,19 +77,10 @@ extern void mempool_free(void *element, mempool_t *pool); void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data); void mempool_free_slab(void *element, void *pool_data); -static inline int -mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc) -{ - return mempool_init(pool, min_nr, mempool_alloc_slab, - mempool_free_slab, (void *) kc); -} - -static inline mempool_t * -mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) -{ - return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab, - (void *) kc); -} +#define mempool_init_slab_pool(_pool, _min_nr, _kc) \ + mempool_init(_pool, (_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc)) +#define mempool_create_slab_pool(_min_nr, _kc) \ + mempool_create((_min_nr), mempool_alloc_slab, mempool_free_slab, (void *)(_kc)) /* * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the @@ -82,17 +89,12 @@ mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data); void mempool_kfree(void *element, void *pool_data); -static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size) -{ - return mempool_init(pool, min_nr, mempool_kmalloc, - mempool_kfree, (void *) size); -} - -static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) -{ - return mempool_create(min_nr, mempool_kmalloc, mempool_kfree, - (void *) size); -} +#define mempool_init_kmalloc_pool(_pool, _min_nr, _size) \ + mempool_init(_pool, (_min_nr), mempool_kmalloc, mempool_kfree, \ + (void *)(unsigned long)(_size)) +#define mempool_create_kmalloc_pool(_min_nr, _size) \ + mempool_create((_min_nr), mempool_kmalloc, mempool_kfree, \ + (void *)(unsigned long)(_size)) /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that @@ -101,16 +103,11 @@ static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data); void mempool_free_pages(void *element, void *pool_data); -static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order) -{ - return mempool_init(pool, min_nr, mempool_alloc_pages, - mempool_free_pages, (void *)(long)order); -} - -static inline mempool_t *mempool_create_page_pool(int min_nr, int order) -{ - return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages, - (void *)(long)order); -} +#define mempool_init_page_pool(_pool, _min_nr, _order) \ + mempool_init(_pool, (_min_nr), mempool_alloc_pages, \ + mempool_free_pages, (void *)(long)(_order)) +#define mempool_create_page_pool(_min_nr, _order) \ + mempool_create((_min_nr), mempool_alloc_pages, \ + mempool_free_pages, (void *)(long)(_order)) #endif /* _LINUX_MEMPOOL_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 3c6c4c836..88b45fb4f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -5,6 +5,7 @@ #include #include 
#include +#include #include #include #include @@ -2925,6 +2926,13 @@ extern void reserve_bootmem_region(phys_addr_t start, phys_addr_t end); /* Free the reserved page into the buddy system, so it gets managed. */ static inline void free_reserved_page(struct page *page) { + union codetag_ref *ref; + + ref = get_page_tag_ref(page); + if (ref) { + set_codetag_empty(ref); + put_page_tag_ref(ref); + } ClearPageReserved(page); init_page_count(page); __free_page(page); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 306a3d1a0..e79303e1e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -194,7 +194,7 @@ struct page { /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ atomic_t _refcount; -#ifdef CONFIG_MEMCG +#ifdef CONFIG_SLAB_OBJ_EXT unsigned long memcg_data; #endif @@ -320,7 +320,7 @@ struct folio { void *private; atomic_t _mapcount; atomic_t _refcount; -#ifdef CONFIG_MEMCG +#ifdef CONFIG_SLAB_OBJ_EXT unsigned long memcg_data; #endif /* private: the union with struct page is transitional */ diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index bb0ee8052..fda37b6df 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -93,10 +93,10 @@ #include #include #include +#include #include #include -typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; /** diff --git a/include/linux/nodemask_types.h b/include/linux/nodemask_types.h new file mode 100644 index 000000000..84c2f47c4 --- /dev/null +++ b/include/linux/nodemask_types.h @@ -0,0 +1,9 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_NODEMASK_TYPES_H +#define __LINUX_NODEMASK_TYPES_H + +#include + +typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; + +#endif /* __LINUX_NODEMASK_TYPES_H */ diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index 67314f648..cff15ee54 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -4,7 +4,6 @@ #include #include -#include struct pglist_data; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 08328b579..347ba7f86 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -467,14 +467,17 @@ static inline void *detach_page_private(struct page *page) } #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order); +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order); #else -static inline struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) +static inline struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { - return folio_alloc(gfp, order); + return folio_alloc_noprof(gfp, order); } #endif +#define filemap_alloc_folio(...) 
\ + alloc_hooks(filemap_alloc_folio_noprof(__VA_ARGS__)) + static inline struct page *__page_cache_alloc(gfp_t gfp) { return &filemap_alloc_folio(gfp, 0)->page; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 1338ea2aa..dc50dedb0 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -2,12 +2,14 @@ #ifndef __LINUX_PERCPU_H #define __LINUX_PERCPU_H +#include #include #include #include #include #include #include +#include #include @@ -116,7 +118,6 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); #endif -extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); extern bool is_kernel_percpu_address(unsigned long addr); @@ -124,10 +125,15 @@ extern bool is_kernel_percpu_address(unsigned long addr); extern void __init setup_per_cpu_areas(void); #endif -extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __alloc_size(1); -extern void __percpu *__alloc_percpu(size_t size, size_t align) __alloc_size(1); -extern void free_percpu(void __percpu *__pdata); -extern phys_addr_t per_cpu_ptr_to_phys(void *addr); +extern void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, + gfp_t gfp) __alloc_size(1); + +#define __alloc_percpu_gfp(_size, _align, _gfp) \ + alloc_hooks(pcpu_alloc_noprof(_size, _align, false, _gfp)) +#define __alloc_percpu(_size, _align) \ + alloc_hooks(pcpu_alloc_noprof(_size, _align, false, GFP_KERNEL)) +#define __alloc_reserved_percpu(_size, _align) \ + alloc_hooks(pcpu_alloc_noprof(_size, _align, true, GFP_KERNEL)) #define alloc_percpu_gfp(type, gfp) \ (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ @@ -136,6 +142,9 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) +extern void free_percpu(void __percpu *__pdata); +extern phys_addr_t per_cpu_ptr_to_phys(void *addr); + extern unsigned long pcpu_nr_pages(void); #endif /* __LINUX_PERCPU_H */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h new file mode 100644 index 000000000..ae9b0f359 --- /dev/null +++ b/include/linux/pgalloc_tag.h @@ -0,0 +1,105 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * page allocation tagging + */ +#ifndef _LINUX_PGALLOC_TAG_H +#define _LINUX_PGALLOC_TAG_H + +#include + +#ifdef CONFIG_MEM_ALLOC_PROFILING + +#include + +extern struct page_ext_operations page_alloc_tagging_ops; +extern struct page_ext *page_ext_get(struct page *page); +extern void page_ext_put(struct page_ext *page_ext); + +static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) +{ + return (void *)page_ext + page_alloc_tagging_ops.offset; +} + +static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) +{ + return (void *)ref - page_alloc_tagging_ops.offset; +} + +static inline union codetag_ref *get_page_tag_ref(struct page *page) +{ + if (page && mem_alloc_profiling_enabled()) { + struct page_ext *page_ext = page_ext_get(page); + + if (page_ext) + return codetag_ref_from_page_ext(page_ext); + } + return NULL; +} + +static inline void put_page_tag_ref(union codetag_ref *ref) +{ + page_ext_put(page_ext_from_codetag_ref(ref)); +} + +static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int order) +{ + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + alloc_tag_add(ref, 
task->alloc_tag, PAGE_SIZE << order); + put_page_tag_ref(ref); + } +} + +static inline void pgalloc_tag_sub(struct page *page, unsigned int order) +{ + union codetag_ref *ref = get_page_tag_ref(page); + + if (ref) { + alloc_tag_sub(ref, PAGE_SIZE << order); + put_page_tag_ref(ref); + } +} + +static inline void pgalloc_tag_split(struct page *page, unsigned int nr) +{ + int i; + struct page_ext *page_ext; + union codetag_ref *ref; + struct alloc_tag *tag; + + if (!mem_alloc_profiling_enabled()) + return; + + page_ext = page_ext_get(page); + if (unlikely(!page_ext)) + return; + + ref = codetag_ref_from_page_ext(page_ext); + if (!ref->ct) + goto out; + + tag = ct_to_alloc_tag(ref->ct); + page_ext = page_ext_next(page_ext); + for (i = 1; i < nr; i++) { + /* New reference with 0 bytes accounted */ + alloc_tag_add(codetag_ref_from_page_ext(page_ext), tag, 0); + page_ext = page_ext_next(page_ext); + } +out: + page_ext_put(page_ext); +} + +#else /* CONFIG_MEM_ALLOC_PROFILING */ + +static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } +static inline void put_page_tag_ref(union codetag_ref *ref) {} +static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, + unsigned int order) {} +static inline void pgalloc_tag_sub(struct page *page, unsigned int order) {} +static inline void pgalloc_tag_split(struct page *page, unsigned int nr) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + +#endif /* _LINUX_PGALLOC_TAG_H */ diff --git a/include/linux/prandom.h b/include/linux/prandom.h index f2ed5b72b..f7f1e5251 100644 --- a/include/linux/prandom.h +++ b/include/linux/prandom.h @@ -10,7 +10,6 @@ #include #include -#include #include struct rnd_state { diff --git a/include/linux/rhashtable-types.h b/include/linux/rhashtable-types.h index 57467cbf4..92a8e670c 100644 --- a/include/linux/rhashtable-types.h +++ b/include/linux/rhashtable-types.h @@ -9,6 +9,7 @@ #ifndef _LINUX_RHASHTABLE_TYPES_H #define _LINUX_RHASHTABLE_TYPES_H +#include #include #include #include @@ -88,6 +89,7 @@ struct rhashtable { struct mutex mutex; spinlock_t lock; atomic_t nelems; + struct alloc_tag *alloc_tag; }; /** @@ -127,9 +129,12 @@ struct rhashtable_iter { bool end_of_table; }; -int rhashtable_init(struct rhashtable *ht, +int rhashtable_init_noprof(struct rhashtable *ht, const struct rhashtable_params *params); -int rhltable_init(struct rhltable *hlt, +#define rhashtable_init(...) alloc_hooks(rhashtable_init_noprof(__VA_ARGS__)) + +int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params); +#define rhltable_init(...) alloc_hooks(rhltable_init_noprof(__VA_ARGS__)) #endif /* _LINUX_RHASHTABLE_TYPES_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 847332470..5c359b8b2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -763,6 +763,10 @@ struct task_struct { unsigned int flags; unsigned int ptrace; +#ifdef CONFIG_MEM_ALLOC_PROFILING + struct alloc_tag *alloc_tag; +#endif + #ifdef CONFIG_SMP int on_cpu; struct __call_single_node wake_entry; @@ -802,6 +806,7 @@ struct task_struct { struct task_group *sched_task_group; #endif + #ifdef CONFIG_UCLAMP_TASK /* * Clamp values requested for a scheduling entity. 
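As a usage note for the pgalloc_tag helpers above: any reader of a page's tag must bracket the access with get_page_tag_ref()/put_page_tag_ref(), since the reference pins the underlying page_ext. A minimal sketch using only the API shown in this hunk; demo_page_has_alloc_tag() is a hypothetical debugging helper, not part of this patch.

#include <linux/pgalloc_tag.h>

static bool demo_page_has_alloc_tag(struct page *page)
{
	union codetag_ref *ref = get_page_tag_ref(page);
	bool tagged = false;

	if (ref) {
		/* ref->ct is NULL if the allocation was never tagged */
		tagged = ref->ct != NULL;
		/* releases the page_ext reference taken by get_page_tag_ref() */
		put_page_tag_ref(ref);
	}
	return tagged;
}

With CONFIG_MEM_ALLOC_PROFILING disabled, get_page_tag_ref() is a stub returning NULL, so the helper compiles away to "return false".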
@@ -871,6 +876,7 @@ struct task_struct { struct mm_struct *mm; struct mm_struct *active_mm; + struct address_space *faults_disabled_mapping; int exit_state; int exit_code; @@ -1163,7 +1169,7 @@ struct task_struct { #endif #ifdef CONFIG_LOCKDEP -# define MAX_LOCK_DEPTH 48UL +# define MAX_LOCK_DEPTH 63UL u64 curr_chain_key; int lockdep_depth; unsigned int lockdep_recursion; @@ -2446,4 +2452,23 @@ static inline int sched_core_idle_cpu(int cpu) { return idle_cpu(cpu); } extern void sched_set_stop_task(int cpu, struct task_struct *stop); +#ifdef CONFIG_MEM_ALLOC_PROFILING +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) +{ + swap(current->alloc_tag, tag); + return tag; +} + +static inline void alloc_tag_restore(struct alloc_tag *tag, struct alloc_tag *old) +{ +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + WARN(current->alloc_tag != tag, "current->alloc_tag was changed:\n"); +#endif + current->alloc_tag = old; +} +#else +static inline struct alloc_tag *alloc_tag_save(struct alloc_tag *tag) { return NULL; } +#define alloc_tag_restore(_tag, _old) +#endif + #endif diff --git a/include/linux/seq_buf.h b/include/linux/seq_buf.h index 515d7fcb9..cc02410f2 100644 --- a/include/linux/seq_buf.h +++ b/include/linux/seq_buf.h @@ -161,4 +161,6 @@ seq_buf_bprintf(struct seq_buf *s, const char *fmt, const u32 *binary); void seq_buf_do_printk(struct seq_buf *s, const char *lvl); +void seq_buf_human_readable_u64(struct seq_buf *, u64); + #endif /* _LINUX_SEQ_BUF_H */ diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 224293b2d..a15a45d06 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -5,6 +5,8 @@ #include #include +struct seq_buf; + /* * This struct is used to pass information from page reclaim to the shrinkers. * We consolidate the values for easier extension later. @@ -61,10 +63,12 @@ struct shrink_control { * @flags determine the shrinker abilities, like numa awareness */ struct shrinker { + const char *name; unsigned long (*count_objects)(struct shrinker *, struct shrink_control *sc); unsigned long (*scan_objects)(struct shrinker *, struct shrink_control *sc); + void (*to_text)(struct seq_buf *, struct shrinker *); long batch; /* reclaim batch size, 0 = default */ int seeks; /* seeks to recreate an obj */ @@ -78,11 +82,13 @@ struct shrinker { #endif #ifdef CONFIG_SHRINKER_DEBUG int debugfs_id; - const char *name; struct dentry *debugfs_entry; #endif /* objs pending delete, per node */ atomic_long_t *nr_deferred; + + atomic_long_t objects_requested_to_free; + atomic_long_t objects_freed; }; #define DEFAULT_SEEKS 2 /* A good number if you don't know better. 
*/ @@ -104,6 +110,7 @@ extern int __printf(2, 3) register_shrinker(struct shrinker *shrinker, extern void unregister_shrinker(struct shrinker *shrinker); extern void free_prealloced_shrinker(struct shrinker *shrinker); extern void synchronize_shrinkers(void); +void shrinkers_to_text(struct seq_buf *); #ifdef CONFIG_SHRINKER_DEBUG extern int shrinker_debugfs_add(struct shrinker *shrinker); diff --git a/include/linux/six.h b/include/linux/six.h new file mode 100644 index 000000000..394da423c --- /dev/null +++ b/include/linux/six.h @@ -0,0 +1,388 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_SIX_H +#define _LINUX_SIX_H + +/** + * DOC: SIX locks overview + * + * Shared/intent/exclusive locks: sleepable read/write locks, like rw semaphores + * but with an additional state: read/shared, intent, exclusive/write + * + * The purpose of the intent state is to allow for greater concurrency on tree + * structures without deadlocking. In general, a read can't be upgraded to a + * write lock without deadlocking, so an operation that updates multiple nodes + * will have to take write locks for the full duration of the operation. + * + * But by adding an intent state, which is exclusive with other intent locks but + * not with readers, we can take intent locks at the start of the operation, + * and then take write locks only for the actual update to each individual + * node, without deadlocking. + * + * Example usage: + * six_lock_read(&foo->lock); + * six_unlock_read(&foo->lock); + * + * An intent lock must be held before taking a write lock: + * six_lock_intent(&foo->lock); + * six_lock_write(&foo->lock); + * six_unlock_write(&foo->lock); + * six_unlock_intent(&foo->lock); + * + * Other operations: + * six_trylock_read() + * six_trylock_intent() + * six_trylock_write() + * + * six_lock_downgrade() convert from intent to read + * six_lock_tryupgrade() attempt to convert from read to intent, may fail + * + * There are also interfaces that take the lock type as an enum: + * + * six_lock_type(&foo->lock, SIX_LOCK_read); + * six_trylock_convert(&foo->lock, SIX_LOCK_read, SIX_LOCK_intent) + * six_lock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_write); + * six_unlock_type(&foo->lock, SIX_LOCK_intent); + * + * Lock sequence numbers - unlock(), relock(): + * + * Locks embed sequence numbers, which are incremented on write lock/unlock. + * This allows locks to be dropped and then retaken iff the state they protect + * hasn't changed; this makes it much easier to avoid holding locks while e.g. + * doing IO or allocating memory. + * + * Example usage: + * six_lock_read(&foo->lock); + * u32 seq = six_lock_seq(&foo->lock); + * six_unlock_read(&foo->lock); + * + * some_operation_that_may_block(); + * + * if (six_relock_read(&foo->lock, seq)) { ... } + * + * If the relock operation succeeds, it is as if the lock was never unlocked. + * + * Reentrancy: + * + * Six locks are not by themselves reentrant, but have counters for both the + * read and intent states that can be used to provide reentrancy by an upper + * layer that tracks held locks. If a lock is known to already be held in the + * read or intent state, six_lock_increment() can be used to bump the "lock + * held in this state" counter, increasing the number of unlock calls that + * will be required to fully unlock it.
+ * + * Example usage: + * six_lock_read(&foo->lock); + * six_lock_increment(&foo->lock, SIX_LOCK_read); + * six_unlock_read(&foo->lock); + * six_unlock_read(&foo->lock); + * foo->lock is now fully unlocked. + * + * Since the intent state supersedes read, it's legal to increment the read + * counter when holding an intent lock, but not the reverse. + * + * A lock may only be held once for write: six_lock_increment(.., SIX_LOCK_write) + * is not legal. + * + * should_sleep_fn: + * + * There is a six_lock() variant that takes a function pointer that is called + * immediately prior to schedule() when blocking, and may return an error to + * abort. + * + * One possible use for this feature is when objects being locked are part of + * a cache and may be reused, and lock ordering is based on a property of the + * object that will change when the object is reused - i.e. logical key order. + * + * If looking up an object in the cache may race with object reuse, and lock + * ordering is required to prevent deadlock, object reuse may change the + * correct lock order for that object and cause a deadlock. should_sleep_fn + * can be used to check if the object is still the object we want and avoid + * this deadlock. + * + * Wait list entry interface: + * + * There is a six_lock() variant, six_lock_waiter(), that takes a pointer to a + * wait list entry. By embedding six_lock_waiter into another object, and by + * traversing lock waitlists, it is then possible for an upper layer to + * implement full cycle detection for deadlock avoidance. + * + * should_sleep_fn should be used for invoking the cycle detector, walking the + * graph of held locks to check for a deadlock. The upper layer must track + * held locks for each thread, and each thread's held locks must be reachable + * from its six_lock_waiter object. + * + * six_lock_waiter() will add the wait object to the waitlist before re-trying + * the lock and before calling should_sleep_fn, and the wait object will not + * be removed from the waitlist until either the lock has been successfully + * acquired, or we aborted because should_sleep_fn returned an error. + * + * Also, six_lock_waiter contains a timestamp, and waiters on a waitlist will + * have timestamps in strictly ascending order - this is so the timestamp can + * be used as a cursor for lock graph traversal. + */ + +#include +#include +#include +#include + +enum six_lock_type { + SIX_LOCK_read, + SIX_LOCK_intent, + SIX_LOCK_write, +}; + +struct six_lock { + atomic_t state; + u32 seq; + unsigned intent_lock_recurse; + struct task_struct *owner; + unsigned __percpu *readers; + struct optimistic_spin_queue osq; + raw_spinlock_t wait_lock; + struct list_head wait_list; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +struct six_lock_waiter { + struct list_head list; + struct task_struct *task; + enum six_lock_type lock_want; + bool lock_acquired; + u64 start_time; +}; + +typedef int (*six_lock_should_sleep_fn)(struct six_lock *lock, void *); + +void six_lock_exit(struct six_lock *lock); + +enum six_lock_init_flags { + SIX_LOCK_INIT_PCPU = 1U << 0, +}; + +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags); + +/** + * six_lock_init - initialize a six lock + * @lock: lock to initialize + * @flags: optional flags, i.e.
SIX_LOCK_INIT_PCPU + */ +#define six_lock_init(lock, flags) \ +do { \ + static struct lock_class_key __key; \ + \ + __six_lock_init((lock), #lock, &__key, flags); \ +} while (0) + +/** + * six_lock_seq - obtain current lock sequence number + * @lock: six_lock to obtain sequence number for + * + * @lock should be held for read or intent, and not write + * + * By saving the lock sequence number, we can unlock @lock and then (typically + * after some blocking operation) attempt to relock it: the relock will succeed + * if the sequence number hasn't changed, meaning no write locks have been taken + * and state corresponding to what @lock protects is still valid. + */ +static inline u32 six_lock_seq(const struct six_lock *lock) +{ + return lock->seq; +} + +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); + +/** + * six_trylock_type - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * Return: true on success, false on failure. + */ +static inline bool six_trylock_type(struct six_lock *lock, enum six_lock_type type) +{ + return six_trylock_ip(lock, type, _THIS_IP_); +} + +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip); + +/** + * six_lock_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * This is a convenience wrapper around six_lock_ip_waiter(), see that function + * for full documentation. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + return six_lock_ip_waiter(lock, type, wait, should_sleep_fn, p, _THIS_IP_); +} + +/** + * six_lock_ip - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +static inline int six_lock_ip(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + struct six_lock_waiter wait; + + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, ip); +} + +/** + * six_lock_type - take a six lock lock + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. 
+ */ +static inline int six_lock_type(struct six_lock *lock, enum six_lock_type type, + six_lock_should_sleep_fn should_sleep_fn, void *p) +{ + struct six_lock_waiter wait; + + return six_lock_ip_waiter(lock, type, &wait, should_sleep_fn, p, _THIS_IP_); +} + +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip); + +/** + * six_relock_type - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * + * Return: true on success, false on failure. + */ +static inline bool six_relock_type(struct six_lock *lock, enum six_lock_type type, + unsigned seq) +{ + return six_relock_ip(lock, type, seq, _THIS_IP_); +} + +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip); + +/** + * six_unlock_type - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. + * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ +static inline void six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + six_unlock_ip(lock, type, _THIS_IP_); +} + +#define __SIX_LOCK(type) \ +static inline bool six_trylock_ip_##type(struct six_lock *lock, unsigned long ip)\ +{ \ + return six_trylock_ip(lock, SIX_LOCK_##type, ip); \ +} \ + \ +static inline bool six_trylock_##type(struct six_lock *lock) \ +{ \ + return six_trylock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ +} \ + \ +static inline int six_lock_ip_waiter_##type(struct six_lock *lock, \ + struct six_lock_waiter *wait, \ + six_lock_should_sleep_fn should_sleep_fn, void *p,\ + unsigned long ip) \ +{ \ + return six_lock_ip_waiter(lock, SIX_LOCK_##type, wait, should_sleep_fn, p, ip);\ +} \ + \ +static inline int six_lock_ip_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn should_sleep_fn, void *p, \ + unsigned long ip) \ +{ \ + return six_lock_ip(lock, SIX_LOCK_##type, should_sleep_fn, p, ip);\ +} \ + \ +static inline bool six_relock_ip_##type(struct six_lock *lock, u32 seq, unsigned long ip)\ +{ \ + return six_relock_ip(lock, SIX_LOCK_##type, seq, ip); \ +} \ + \ +static inline bool six_relock_##type(struct six_lock *lock, u32 seq) \ +{ \ + return six_relock_ip(lock, SIX_LOCK_##type, seq, _THIS_IP_); \ +} \ + \ +static inline int six_lock_##type(struct six_lock *lock, \ + six_lock_should_sleep_fn fn, void *p)\ +{ \ + return six_lock_ip_##type(lock, fn, p, _THIS_IP_); \ +} \ + \ +static inline void six_unlock_ip_##type(struct six_lock *lock, unsigned long ip) \ +{ \ + six_unlock_ip(lock, SIX_LOCK_##type, ip); \ +} \ + \ +static inline void six_unlock_##type(struct six_lock *lock) \ +{ \ + six_unlock_ip(lock, SIX_LOCK_##type, _THIS_IP_); \ +} + +__SIX_LOCK(read) +__SIX_LOCK(intent) +__SIX_LOCK(write) +#undef __SIX_LOCK + +void six_lock_downgrade(struct six_lock *); +bool six_lock_tryupgrade(struct six_lock *); +bool six_trylock_convert(struct six_lock *, enum six_lock_type, + enum six_lock_type); + +void six_lock_increment(struct six_lock *, enum six_lock_type); + +void six_lock_wakeup_all(struct six_lock *); + +struct six_lock_count { 
+ unsigned n[3]; +}; + +struct six_lock_count six_lock_counts(struct six_lock *); +void six_lock_readers_add(struct six_lock *, int); + +#endif /* _LINUX_SIX_H */ diff --git a/include/linux/slab.h b/include/linux/slab.h index 6b3e155b7..f7bc3ab70 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -147,6 +147,13 @@ #endif #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ +#ifdef CONFIG_SLAB_OBJ_EXT +/* Slab created using create_boot_cache */ +#define SLAB_NO_OBJ_EXT ((slab_flags_t __force)0x20000000U) +#else +#define SLAB_NO_OBJ_EXT 0 +#endif + /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * @@ -206,7 +213,9 @@ int kmem_cache_shrink(struct kmem_cache *s); /* * Common kmalloc functions provided by all allocators */ -void * __must_check krealloc(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); +void * __must_check krealloc_noprof(const void *objp, size_t new_size, gfp_t flags) __realloc_size(2); +#define krealloc(...) alloc_hooks(krealloc_noprof(__VA_ARGS__)) + void kfree(const void *objp); void kfree_sensitive(const void *objp); size_t __ksize(const void *objp); @@ -444,7 +453,10 @@ static __always_inline unsigned int __kmalloc_index(size_t size, static_assert(PAGE_SHIFT <= 20); #define kmalloc_index(s) __kmalloc_index(s, true) -void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); +#include + +void *__kmalloc_noprof(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_size(1); +#define __kmalloc(...) alloc_hooks(__kmalloc_noprof(__VA_ARGS__)) /** * kmem_cache_alloc - Allocate an object @@ -456,9 +468,13 @@ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __alloc_siz * * Return: pointer to the new object or %NULL in case of error */ -void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; -void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, - gfp_t gfpflags) __assume_slab_alignment __malloc; +void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) __assume_slab_alignment __malloc; +#define kmem_cache_alloc(...) alloc_hooks(kmem_cache_alloc_noprof(__VA_ARGS__)) + +void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, + gfp_t gfpflags) __assume_slab_alignment __malloc; +#define kmem_cache_alloc_lru(...) alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__)) + void kmem_cache_free(struct kmem_cache *s, void *objp); /* @@ -469,29 +485,40 @@ void kmem_cache_free(struct kmem_cache *s, void *objp); * Note that interrupts must be enabled when calling these functions. */ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p); -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p); + +int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, void **p); +#define kmem_cache_alloc_bulk(...) alloc_hooks(kmem_cache_alloc_bulk_noprof(__VA_ARGS__)) static __always_inline void kfree_bulk(size_t size, void **p) { kmem_cache_free_bulk(NULL, size, p); } -void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment +void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __alloc_size(1); -void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment - __malloc; +#define __kmalloc_node(...) 
alloc_hooks(__kmalloc_node_noprof(__VA_ARGS__)) -void *kmalloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) +void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t flags, int node) __assume_slab_alignment + __malloc; +#define kmem_cache_alloc_node(...) alloc_hooks(kmem_cache_alloc_node_noprof(__VA_ARGS__)) + +void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t flags, size_t size) __assume_kmalloc_alignment __alloc_size(3); -void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node, size_t size) __assume_kmalloc_alignment +void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, + int node, size_t size) __assume_kmalloc_alignment __alloc_size(4); -void *kmalloc_large(size_t size, gfp_t flags) __assume_page_alignment +#define kmalloc_trace(...) alloc_hooks(kmalloc_trace_noprof(__VA_ARGS__)) + +#define kmalloc_node_trace(...) alloc_hooks(kmalloc_node_trace_noprof(__VA_ARGS__)) + +void *kmalloc_large_noprof(size_t size, gfp_t flags) __assume_page_alignment __alloc_size(1); +#define kmalloc_large(...) alloc_hooks(kmalloc_large_noprof(__VA_ARGS__)) -void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_alignment +void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) __assume_page_alignment __alloc_size(1); +#define kmalloc_large_node(...) alloc_hooks(kmalloc_large_node_noprof(__VA_ARGS__)) /** * kmalloc - allocate kernel memory @@ -547,37 +574,39 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) __assume_page_align * Try really hard to succeed the allocation but fail * eventually. */ -static __always_inline __alloc_size(1) void *kmalloc(size_t size, gfp_t flags) +static __always_inline __alloc_size(1) void *kmalloc_noprof(size_t size, gfp_t flags) { if (__builtin_constant_p(size) && size) { unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) - return kmalloc_large(size, flags); + return kmalloc_large_noprof(size, flags); index = kmalloc_index(size); - return kmalloc_trace( + return kmalloc_trace_noprof( kmalloc_caches[kmalloc_type(flags)][index], flags, size); } - return __kmalloc(size, flags); + return __kmalloc_noprof(size, flags); } +#define kmalloc(...) alloc_hooks(kmalloc_noprof(__VA_ARGS__)) -static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t flags, int node) +static __always_inline __alloc_size(1) void *kmalloc_node_noprof(size_t size, gfp_t flags, int node) { if (__builtin_constant_p(size) && size) { unsigned int index; if (size > KMALLOC_MAX_CACHE_SIZE) - return kmalloc_large_node(size, flags, node); + return kmalloc_large_node_noprof(size, flags, node); index = kmalloc_index(size); - return kmalloc_node_trace( + return kmalloc_node_trace_noprof( kmalloc_caches[kmalloc_type(flags)][index], flags, node, size); } - return __kmalloc_node(size, flags, node); + return __kmalloc_node_noprof(size, flags, node); } +#define kmalloc_node(...) alloc_hooks(kmalloc_node_noprof(__VA_ARGS__)) /** * kmalloc_array - allocate memory for an array. @@ -585,16 +614,17 @@ static __always_inline __alloc_size(1) void *kmalloc_node(size_t size, gfp_t fla * @size: element size. * @flags: the type of memory to allocate (see kmalloc). 
*/ -static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_t flags) +static inline __alloc_size(1, 2) void *kmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; if (__builtin_constant_p(n) && __builtin_constant_p(size)) - return kmalloc(bytes, flags); - return __kmalloc(bytes, flags); + return kmalloc_noprof(bytes, flags); + return kmalloc_noprof(bytes, flags); } +#define kmalloc_array(...) alloc_hooks(kmalloc_array_noprof(__VA_ARGS__)) /** * krealloc_array - reallocate memory for an array. @@ -603,18 +633,19 @@ static inline __alloc_size(1, 2) void *kmalloc_array(size_t n, size_t size, gfp_ * @new_size: new size of a single member of the array * @flags: the type of memory to allocate (see kmalloc) */ -static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, - size_t new_n, - size_t new_size, - gfp_t flags) +static inline __realloc_size(2, 3) void * __must_check krealloc_array_noprof(void *p, + size_t new_n, + size_t new_size, + gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(new_n, new_size, &bytes))) return NULL; - return krealloc(p, bytes, flags); + return krealloc_noprof(p, bytes, flags); } +#define krealloc_array(...) alloc_hooks(krealloc_array_noprof(__VA_ARGS__)) /** * kcalloc - allocate memory for an array. The memory is set to zero. @@ -622,16 +653,11 @@ static inline __realloc_size(2, 3) void * __must_check krealloc_array(void *p, * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -static inline __alloc_size(1, 2) void *kcalloc(size_t n, size_t size, gfp_t flags) -{ - return kmalloc_array(n, size, flags | __GFP_ZERO); -} +#define kcalloc(_n, _size, _flags) kmalloc_array(_n, _size, (_flags) | __GFP_ZERO) -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, +void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, int node, unsigned long caller) __alloc_size(1); -#define kmalloc_node_track_caller(size, flags, node) \ - __kmalloc_node_track_caller(size, flags, node, \ - _RET_IP_) +#define kmalloc_node_track_caller(...) alloc_hooks(kmalloc_node_track_caller_noprof(__VA_ARGS__, _RET_IP_)) /* * kmalloc_track_caller is a special version of kmalloc that records the @@ -641,11 +667,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t flags, int node, * allocator where we care about the real place the memory allocation * request comes from. */ -#define kmalloc_track_caller(size, flags) \ - __kmalloc_node_track_caller(size, flags, \ - NUMA_NO_NODE, _RET_IP_) +#define kmalloc_track_caller(...) kmalloc_node_track_caller(__VA_ARGS__, NUMA_NO_NODE) -static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, +static inline __alloc_size(1, 2) void *kmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) { size_t bytes; @@ -653,75 +677,51 @@ static inline __alloc_size(1, 2) void *kmalloc_array_node(size_t n, size_t size, if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; if (__builtin_constant_p(n) && __builtin_constant_p(size)) - return kmalloc_node(bytes, flags, node); - return __kmalloc_node(bytes, flags, node); + return kmalloc_node_noprof(bytes, flags, node); + return __kmalloc_node_noprof(bytes, flags, node); } +#define kmalloc_array_node(...) 
alloc_hooks(kmalloc_array_node_noprof(__VA_ARGS__)) -static inline __alloc_size(1, 2) void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) -{ - return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); -} +#define kcalloc_node(_n, _size, _flags, _node) kmalloc_array_node(_n, _size, (_flags) | __GFP_ZERO, _node) /* * Shortcuts */ -static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) -{ - return kmem_cache_alloc(k, flags | __GFP_ZERO); -} +#define kmem_cache_zalloc(_k, _flags) kmem_cache_alloc(_k, (_flags)|__GFP_ZERO) /** * kzalloc - allocate memory. The memory is set to zero. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). */ -static inline __alloc_size(1) void *kzalloc(size_t size, gfp_t flags) -{ - return kmalloc(size, flags | __GFP_ZERO); -} - -/** - * kzalloc_node - allocate zeroed memory from a particular memory node. - * @size: how many bytes of memory are required. - * @flags: the type of memory to allocate (see kmalloc). - * @node: memory node from which to allocate - */ -static inline __alloc_size(1) void *kzalloc_node(size_t size, gfp_t flags, int node) +static inline __alloc_size(1) void *kzalloc_noprof(size_t size, gfp_t flags) { - return kmalloc_node(size, flags | __GFP_ZERO, node); + return kmalloc_noprof(size, flags | __GFP_ZERO); } +#define kzalloc(...) alloc_hooks(kzalloc_noprof(__VA_ARGS__)) +#define kzalloc_node(_size, _flags, _node) kmalloc_node(_size, (_flags)|__GFP_ZERO, _node) -extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __alloc_size(1); -static inline __alloc_size(1) void *kvmalloc(size_t size, gfp_t flags) -{ - return kvmalloc_node(size, flags, NUMA_NO_NODE); -} -static inline __alloc_size(1) void *kvzalloc_node(size_t size, gfp_t flags, int node) -{ - return kvmalloc_node(size, flags | __GFP_ZERO, node); -} -static inline __alloc_size(1) void *kvzalloc(size_t size, gfp_t flags) -{ - return kvmalloc(size, flags | __GFP_ZERO); -} +extern void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) __alloc_size(1); +#define kvmalloc_node(...) alloc_hooks(kvmalloc_node_noprof(__VA_ARGS__)) -static inline __alloc_size(1, 2) void *kvmalloc_array(size_t n, size_t size, gfp_t flags) -{ - size_t bytes; +#define kvmalloc(_size, _flags) kvmalloc_node(_size, _flags, NUMA_NO_NODE) +#define kvzalloc(_size, _flags) kvmalloc(_size, _flags|__GFP_ZERO) - if (unlikely(check_mul_overflow(n, size, &bytes))) - return NULL; +#define kvzalloc_node(_size, _flags, _node) kvmalloc_node(_size, _flags|__GFP_ZERO, _node) - return kvmalloc(bytes, flags); -} +#define kvmalloc_array(_n, _size, _flags) \ +({ \ + size_t _bytes; \ + \ + !check_mul_overflow(_n, _size, &_bytes) ? kvmalloc(_bytes, _flags) : NULL; \ +}) -static inline __alloc_size(1, 2) void *kvcalloc(size_t n, size_t size, gfp_t flags) -{ - return kvmalloc_array(n, size, flags | __GFP_ZERO); -} +#define kvcalloc(_n, _size, _flags) kvmalloc_array(_n, _size, _flags|__GFP_ZERO) -extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) __realloc_size(3); +#define kvrealloc(...) 
alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) + extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index a61e7d55d..23f14dcb8 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -107,7 +107,7 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *sla * reciprocal_divide(offset, cache->reciprocal_buffer_size) */ static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) + const struct slab *slab, const void *obj) { u32 offset = (obj - slab->s_mem); return reciprocal_divide(offset, cache->reciprocal_buffer_size); diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index f6df03f93..e8be5b368 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -176,14 +176,14 @@ static inline void *nearest_obj(struct kmem_cache *cache, const struct slab *sla /* Determine object index from a given position */ static inline unsigned int __obj_to_index(const struct kmem_cache *cache, - void *addr, void *obj) + void *addr, const void *obj) { return reciprocal_divide(kasan_reset_tag(obj) - addr, cache->reciprocal_size); } static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct slab *slab, void *obj) + const struct slab *slab, const void *obj) { if (is_kfence_address(obj)) return 0; diff --git a/include/linux/string.h b/include/linux/string.h index c062c581a..198ca51ed 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -96,6 +96,7 @@ extern char * strpbrk(const char *,const char *); #ifndef __HAVE_ARCH_STRSEP extern char * strsep(char **,const char *); #endif +extern char *strsep_no_empty(char **, const char *); #ifndef __HAVE_ARCH_STRSPN extern __kernel_size_t strspn(const char *,const char *); #endif @@ -176,7 +177,9 @@ extern void kfree_const(const void *x); extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); -extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); +extern void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) __realloc_size(2); +#define kmemdup(...) alloc_hooks(kmemdup_noprof(__VA_ARGS__)) + extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h index fae6beaaa..ae51580b9 100644 --- a/include/linux/string_helpers.h +++ b/include/linux/string_helpers.h @@ -16,15 +16,14 @@ static inline bool string_is_terminated(const char *s, int len) return memchr(s, '\0', len) ? 
true : false; } -/* Descriptions of the types of units to - * print in */ -enum string_size_units { - STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ - STRING_UNITS_2, /* use binary powers of 2^10 */ +enum string_size_flags { + STRING_SIZE_BASE2 = (1 << 0), + STRING_SIZE_NOSPACE = (1 << 1), + STRING_SIZE_NOBYTES = (1 << 2), }; -void string_get_size(u64 size, u64 blk_size, enum string_size_units units, - char *buf, int len); +int string_get_size(u64 size, u64 blk_size, enum string_size_flags flags, + char *buf, int len); int parse_int_array_user(const char __user *from, size_t count, int **array); diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index bb9d3f554..d8e0cacfc 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -11,6 +11,8 @@ struct user_namespace; extern struct user_namespace init_user_ns; +struct vm_area_struct; + struct timens_offsets { struct timespec64 monotonic; struct timespec64 boottime; diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c720be70c..106d78e75 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -2,6 +2,8 @@ #ifndef _LINUX_VMALLOC_H #define _LINUX_VMALLOC_H +#include +#include #include #include #include @@ -137,26 +139,54 @@ extern unsigned long vmalloc_nr_pages(void); static inline unsigned long vmalloc_nr_pages(void) { return 0; } #endif -extern void *vmalloc(unsigned long size) __alloc_size(1); -extern void *vzalloc(unsigned long size) __alloc_size(1); -extern void *vmalloc_user(unsigned long size) __alloc_size(1); -extern void *vmalloc_node(unsigned long size, int node) __alloc_size(1); -extern void *vzalloc_node(unsigned long size, int node) __alloc_size(1); -extern void *vmalloc_32(unsigned long size) __alloc_size(1); -extern void *vmalloc_32_user(unsigned long size) __alloc_size(1); -extern void *__vmalloc(unsigned long size, gfp_t gfp_mask) __alloc_size(1); -extern void *__vmalloc_node_range(unsigned long size, unsigned long align, +extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); +#define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) + +extern void *vzalloc_noprof(unsigned long size) __alloc_size(1); +#define vzalloc(...) alloc_hooks(vzalloc_noprof(__VA_ARGS__)) + +extern void *vmalloc_user_noprof(unsigned long size) __alloc_size(1); +#define vmalloc_user(...) alloc_hooks(vmalloc_user_noprof(__VA_ARGS__)) + +extern void *vmalloc_node_noprof(unsigned long size, int node) __alloc_size(1); +#define vmalloc_node(...) alloc_hooks(vmalloc_node_noprof(__VA_ARGS__)) + +extern void *vzalloc_node_noprof(unsigned long size, int node) __alloc_size(1); +#define vzalloc_node(...) alloc_hooks(vzalloc_node_noprof(__VA_ARGS__)) + +extern void *vmalloc_32_noprof(unsigned long size) __alloc_size(1); +#define vmalloc_32(...) alloc_hooks(vmalloc_32_noprof(__VA_ARGS__)) + +extern void *vmalloc_32_user_noprof(unsigned long size) __alloc_size(1); +#define vmalloc_32_user(...) alloc_hooks(vmalloc_32_user_noprof(__VA_ARGS__)) + +extern void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +#define __vmalloc(...) alloc_hooks(__vmalloc_noprof(__VA_ARGS__)) + +extern void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) __alloc_size(1); -void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, +#define __vmalloc_node_range(...) 
alloc_hooks(__vmalloc_node_range_noprof(__VA_ARGS__)) + +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) __alloc_size(1); -void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +#define __vmalloc_node(...) alloc_hooks(__vmalloc_node_noprof(__VA_ARGS__)) + +void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) __alloc_size(1); +#define vmalloc_huge(...) alloc_hooks(vmalloc_huge_noprof(__VA_ARGS__)) + +extern void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); +#define __vmalloc_array(...) alloc_hooks(__vmalloc_array_noprof(__VA_ARGS__)) + +extern void *vmalloc_array_noprof(size_t n, size_t size) __alloc_size(1, 2); +#define vmalloc_array(...) alloc_hooks(vmalloc_array_noprof(__VA_ARGS__)) + +extern void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); +#define __vcalloc(...) alloc_hooks(__vcalloc_noprof(__VA_ARGS__)) -extern void *__vmalloc_array(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); -extern void *vmalloc_array(size_t n, size_t size) __alloc_size(1, 2); -extern void *__vcalloc(size_t n, size_t size, gfp_t flags) __alloc_size(1, 2); -extern void *vcalloc(size_t n, size_t size) __alloc_size(1, 2); +extern void *vcalloc_noprof(size_t n, size_t size) __alloc_size(1, 2); +#define vcalloc(...) alloc_hooks(vcalloc_noprof(__VA_ARGS__)) extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/init/Kconfig b/init/Kconfig index b6d38eccc..cec6bac1a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -940,10 +940,14 @@ config CGROUP_FAVOR_DYNMODS Say N if unsure. +config SLAB_OBJ_EXT + bool + config MEMCG bool "Memory controller" select PAGE_COUNTER select EVENTFD + select SLAB_OBJ_EXT help Provides control over the memory footprint of tasks in a cgroup. 
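To illustrate the *_noprof()/alloc_hooks() conversion applied to the vmalloc and kvmalloc interfaces above from the caller's side: call sites keep their existing spelling, and the macro layer forwards to the _noprof() variants through alloc_hooks(), which is defined in include/linux/alloc_tag.h elsewhere in this patch and is assumed here to attribute the allocation to the invoking line when CONFIG_MEM_ALLOC_PROFILING is enabled. demo_table_alloc()/demo_table_free() are hypothetical callers, not part of this patch.

#include <linux/slab.h>

static u64 *demo_table_alloc(size_t nr_entries)
{
	/*
	 * Expands (via the macros above) to
	 * alloc_hooks(kvmalloc_node_noprof(...)) after an overflow-checked
	 * size calculation; the call site itself is unchanged.
	 */
	return kvmalloc_array(nr_entries, sizeof(u64), GFP_KERNEL | __GFP_ZERO);
}

static void demo_table_free(u64 *tbl)
{
	kvfree(tbl);	/* freeing is untouched by the conversion */
}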
diff --git a/init/init_task.c b/init/init_task.c index ff6c4b9bf..f703116e0 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -85,6 +85,7 @@ struct task_struct init_task .nr_cpus_allowed= NR_CPUS, .mm = NULL, .active_mm = &init_mm, + .faults_disabled_mapping = NULL, .restart_block = { .fn = do_no_restart_syscall, }, diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 4198f0273..b2abd9a5d 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -259,3 +259,6 @@ config ARCH_HAS_MMIOWB config MMIOWB def_bool y if ARCH_HAS_MMIOWB depends on SMP + +config SIXLOCKS + bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 9a4db5cce..fc42930af 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -570,9 +570,9 @@ static struct page *__dma_alloc_pages(struct device *dev, size_t size, size = PAGE_ALIGN(size); if (dma_alloc_direct(dev, ops)) return dma_direct_alloc_pages(dev, size, dma_handle, dir, gfp); - if (!ops->alloc_pages) + if (!ops->alloc_pages_op) return NULL; - return ops->alloc_pages(dev, size, dma_handle, dir, gfp); + return ops->alloc_pages_op(dev, size, dma_handle, dir, gfp); } struct page *dma_alloc_pages(struct device *dev, size_t size, diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 0db4093d1..a095dbbf0 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -32,3 +32,4 @@ obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o obj-$(CONFIG_LOCK_EVENT_COUNTS) += lock_events.o +obj-$(CONFIG_SIXLOCKS) += six.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4dfd2f3e0..0463302e2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3039,6 +3039,9 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) if ((next->read == 2) && prev->read) continue; + if (hlock_class(next)->no_check_recursion) + continue; + /* * We're holding the nest_lock, which serializes this lock's * nesting behaviour. @@ -3100,6 +3103,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 2; } + if (hlock_class(prev) == hlock_class(next) && + hlock_class(prev)->no_check_recursion) + return 2; + /* * Prove that the new -> dependency would not * create a circular dependency in the graph. 
(We do this by @@ -6551,6 +6558,26 @@ void debug_check_no_locks_held(void) } EXPORT_SYMBOL_GPL(debug_check_no_locks_held); +#ifdef CONFIG_LOCKDEP +int lock_class_is_held(struct lock_class_key *key) +{ + struct task_struct *curr = current; + struct held_lock *hlock; + + if (unlikely(!debug_locks)) + return 0; + + for (hlock = curr->held_locks; + hlock < curr->held_locks + curr->lockdep_depth; + hlock++) + if (hlock->instance->key == key) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(lock_class_is_held); +#endif + #ifdef __KERNEL__ void debug_show_all_locks(void) { @@ -6664,3 +6691,22 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) warn_rcu_exit(rcu); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void lockdep_set_no_check_recursion(struct lockdep_map *lock) +{ + struct lock_class *class = lock->class_cache[0]; + unsigned long flags; + + raw_local_irq_save(flags); + lockdep_recursion_inc(); + + if (!class) + class = register_lock_class(lock, 0, 0); + if (class) + class->no_check_recursion = true; + lockdep_recursion_finish(); + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lockdep_set_no_check_recursion); +#endif diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index d5610ad52..b752ec5cc 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -203,6 +203,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) return false; } +EXPORT_SYMBOL_GPL(osq_lock); void osq_unlock(struct optimistic_spin_queue *lock) { @@ -230,3 +231,4 @@ void osq_unlock(struct optimistic_spin_queue *lock) if (next) WRITE_ONCE(next->locked, 1); } +EXPORT_SYMBOL_GPL(osq_unlock); diff --git a/kernel/locking/six.c b/kernel/locking/six.c new file mode 100644 index 000000000..0b9c4bb7c --- /dev/null +++ b/kernel/locking/six.c @@ -0,0 +1,893 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef DEBUG +#define EBUG_ON(cond) BUG_ON(cond) +#else +#define EBUG_ON(cond) do {} while (0) +#endif + +#define six_acquire(l, t, r, ip) lock_acquire(l, 0, t, r, 1, NULL, ip) +#define six_release(l, ip) lock_release(l, ip) + +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type); + +#define SIX_LOCK_HELD_read_OFFSET 0 +#define SIX_LOCK_HELD_read ~(~0U << 26) +#define SIX_LOCK_HELD_intent (1U << 26) +#define SIX_LOCK_HELD_write (1U << 27) +#define SIX_LOCK_WAITING_read (1U << (28 + SIX_LOCK_read)) +#define SIX_LOCK_WAITING_intent (1U << (28 + SIX_LOCK_intent)) +#define SIX_LOCK_WAITING_write (1U << (28 + SIX_LOCK_write)) +#define SIX_LOCK_NOSPIN (1U << 31) + +struct six_lock_vals { + /* Value we add to the lock in order to take the lock: */ + u32 lock_val; + + /* If the lock has this value (used as a mask), taking the lock fails: */ + u32 lock_fail; + + /* Mask that indicates lock is held for this type: */ + u32 held_mask; + + /* Waitlist we wakeup when releasing the lock: */ + enum six_lock_type unlock_wakeup; +}; + +static const struct six_lock_vals l[] = { + [SIX_LOCK_read] = { + .lock_val = 1U << SIX_LOCK_HELD_read_OFFSET, + .lock_fail = SIX_LOCK_HELD_write, + .held_mask = SIX_LOCK_HELD_read, + .unlock_wakeup = SIX_LOCK_write, + }, + [SIX_LOCK_intent] = { + .lock_val = SIX_LOCK_HELD_intent, + .lock_fail = SIX_LOCK_HELD_intent, + .held_mask = SIX_LOCK_HELD_intent, + .unlock_wakeup = SIX_LOCK_intent, + }, + [SIX_LOCK_write] = { + .lock_val = SIX_LOCK_HELD_write, + .lock_fail = SIX_LOCK_HELD_read, + 
.held_mask = SIX_LOCK_HELD_write, + .unlock_wakeup = SIX_LOCK_read, + }, +}; + +static inline void six_set_bitmask(struct six_lock *lock, u32 mask) +{ + if ((atomic_read(&lock->state) & mask) != mask) + atomic_or(mask, &lock->state); +} + +static inline void six_clear_bitmask(struct six_lock *lock, u32 mask) +{ + if (atomic_read(&lock->state) & mask) + atomic_and(~mask, &lock->state); +} + +static inline void six_set_owner(struct six_lock *lock, enum six_lock_type type, + u32 old, struct task_struct *owner) +{ + if (type != SIX_LOCK_intent) + return; + + if (!(old & SIX_LOCK_HELD_intent)) { + EBUG_ON(lock->owner); + lock->owner = owner; + } else { + EBUG_ON(lock->owner != current); + } +} + +static inline unsigned pcpu_read_count(struct six_lock *lock) +{ + unsigned read_count = 0; + int cpu; + + for_each_possible_cpu(cpu) + read_count += *per_cpu_ptr(lock->readers, cpu); + return read_count; +} + +/* + * __do_six_trylock() - main trylock routine + * + * Returns 1 on success, 0 on failure + * + * In percpu reader mode, a failed trylock may cause a spurious trylock failure + * for anoter thread taking the competing lock type, and we may havve to do a + * wakeup: when a wakeup is required, we return -1 - wakeup_type. + */ +static int __do_six_trylock(struct six_lock *lock, enum six_lock_type type, + struct task_struct *task, bool try) +{ + int ret; + u32 old; + + EBUG_ON(type == SIX_LOCK_write && lock->owner != task); + EBUG_ON(type == SIX_LOCK_write && + (try != !(atomic_read(&lock->state) & SIX_LOCK_HELD_write))); + + /* + * Percpu reader mode: + * + * The basic idea behind this algorithm is that you can implement a lock + * between two threads without any atomics, just memory barriers: + * + * For two threads you'll need two variables, one variable for "thread a + * has the lock" and another for "thread b has the lock". + * + * To take the lock, a thread sets its variable indicating that it holds + * the lock, then issues a full memory barrier, then reads from the + * other thread's variable to check if the other thread thinks it has + * the lock. If we raced, we backoff and retry/sleep. + * + * Failure to take the lock may cause a spurious trylock failure in + * another thread, because we temporarily set the lock to indicate that + * we held it. This would be a problem for a thread in six_lock(), when + * they are calling trylock after adding themself to the waitlist and + * prior to sleeping. + * + * Therefore, if we fail to get the lock, and there were waiters of the + * type we conflict with, we will have to issue a wakeup. + * + * Since we may be called under wait_lock (and by the wakeup code + * itself), we return that the wakeup has to be done instead of doing it + * here. 
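+ *
+ * A sketch of the two-variable handshake described above (illustrative
+ * pseudocode, not real code):
+ *
+ *	thread A:			thread B:
+ *	  a_held = true			  b_held = true
+ *	  full memory barrier		  full memory barrier
+ *	  if (!b_held) -> A holds it	  if (!a_held) -> B holds it
+ *	  else back off and retry	  else back off and retry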
+ */ + if (type == SIX_LOCK_read && lock->readers) { + preempt_disable(); + this_cpu_inc(*lock->readers); /* signal that we own lock */ + + smp_mb(); + + old = atomic_read(&lock->state); + ret = !(old & l[type].lock_fail); + + this_cpu_sub(*lock->readers, !ret); + preempt_enable(); + + if (!ret && (old & SIX_LOCK_WAITING_write)) + ret = -1 - SIX_LOCK_write; + } else if (type == SIX_LOCK_write && lock->readers) { + if (try) { + atomic_add(SIX_LOCK_HELD_write, &lock->state); + smp_mb__after_atomic(); + } + + ret = !pcpu_read_count(lock); + + if (try && !ret) { + old = atomic_sub_return(SIX_LOCK_HELD_write, &lock->state); + if (old & SIX_LOCK_WAITING_read) + ret = -1 - SIX_LOCK_read; + } + } else { + old = atomic_read(&lock->state); + do { + ret = !(old & l[type].lock_fail); + if (!ret || (type == SIX_LOCK_write && !try)) { + smp_mb(); + break; + } + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, old + l[type].lock_val)); + + EBUG_ON(ret && !(atomic_read(&lock->state) & l[type].held_mask)); + } + + if (ret > 0) + six_set_owner(lock, type, old, task); + + EBUG_ON(type == SIX_LOCK_write && try && ret <= 0 && + (atomic_read(&lock->state) & SIX_LOCK_HELD_write)); + + return ret; +} + +static void __six_lock_wakeup(struct six_lock *lock, enum six_lock_type lock_type) +{ + struct six_lock_waiter *w, *next; + struct task_struct *task; + bool saw_one; + int ret; +again: + ret = 0; + saw_one = false; + raw_spin_lock(&lock->wait_lock); + + list_for_each_entry_safe(w, next, &lock->wait_list, list) { + if (w->lock_want != lock_type) + continue; + + if (saw_one && lock_type != SIX_LOCK_read) + goto unlock; + saw_one = true; + + ret = __do_six_trylock(lock, lock_type, w->task, false); + if (ret <= 0) + goto unlock; + + __list_del(w->list.prev, w->list.next); + task = w->task; + /* + * Do no writes to @w besides setting lock_acquired - otherwise + * we would need a memory barrier: + */ + barrier(); + w->lock_acquired = true; + wake_up_process(task); + } + + six_clear_bitmask(lock, SIX_LOCK_WAITING_read << lock_type); +unlock: + raw_spin_unlock(&lock->wait_lock); + + if (ret < 0) { + lock_type = -ret - 1; + goto again; + } +} + +__always_inline +static void six_lock_wakeup(struct six_lock *lock, u32 state, + enum six_lock_type lock_type) +{ + if (lock_type == SIX_LOCK_write && (state & SIX_LOCK_HELD_read)) + return; + + if (!(state & (SIX_LOCK_WAITING_read << lock_type))) + return; + + __six_lock_wakeup(lock, lock_type); +} + +__always_inline +static bool do_six_trylock(struct six_lock *lock, enum six_lock_type type, bool try) +{ + int ret; + + ret = __do_six_trylock(lock, type, current, try); + if (ret < 0) + __six_lock_wakeup(lock, -ret - 1); + + return ret > 0; +} + +/** + * six_trylock_ip - attempt to take a six lock without blocking + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. 
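+ *
+ * Typical use (illustrative; &foo->lock stands for any struct six_lock):
+ *
+ *	if (six_trylock_ip(&foo->lock, SIX_LOCK_read, _THIS_IP_)) {
+ *		... read the protected structure ...
+ *		six_unlock_ip(&foo->lock, SIX_LOCK_read, _THIS_IP_);
+ *	}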
+ */ +bool six_trylock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) +{ + if (!do_six_trylock(lock, type, true)) + return false; + + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 1, type == SIX_LOCK_read, ip); + return true; +} +EXPORT_SYMBOL_GPL(six_trylock_ip); + +/** + * six_relock_ip - attempt to re-take a lock that was held previously + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @seq: lock sequence number obtained from six_lock_seq() while lock was + * held previously + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * Return: true on success, false on failure. + */ +bool six_relock_ip(struct six_lock *lock, enum six_lock_type type, + unsigned seq, unsigned long ip) +{ + if (six_lock_seq(lock) != seq || !six_trylock_ip(lock, type, ip)) + return false; + + if (six_lock_seq(lock) != seq) { + six_unlock_ip(lock, type, ip); + return false; + } + + return true; +} +EXPORT_SYMBOL_GPL(six_relock_ip); + +#ifdef CONFIG_LOCK_SPIN_ON_OWNER + +static inline bool six_can_spin_on_owner(struct six_lock *lock) +{ + struct task_struct *owner; + bool ret; + + if (need_resched()) + return false; + + rcu_read_lock(); + owner = READ_ONCE(lock->owner); + ret = !owner || owner_on_cpu(owner); + rcu_read_unlock(); + + return ret; +} + +static inline bool six_spin_on_owner(struct six_lock *lock, + struct task_struct *owner, + u64 end_time) +{ + bool ret = true; + unsigned loop = 0; + + rcu_read_lock(); + while (lock->owner == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner. If that fails, + * owner might point to freed memory. If it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (!owner_on_cpu(owner) || need_resched()) { + ret = false; + break; + } + + if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) { + six_set_bitmask(lock, SIX_LOCK_NOSPIN); + ret = false; + break; + } + + cpu_relax(); + } + rcu_read_unlock(); + + return ret; +} + +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + struct task_struct *task = current; + u64 end_time; + + if (type == SIX_LOCK_write) + return false; + + preempt_disable(); + if (!six_can_spin_on_owner(lock)) + goto fail; + + if (!osq_lock(&lock->osq)) + goto fail; + + end_time = sched_clock() + 10 * NSEC_PER_USEC; + + while (1) { + struct task_struct *owner; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. + */ + owner = READ_ONCE(lock->owner); + if (owner && !six_spin_on_owner(lock, owner, end_time)) + break; + + if (do_six_trylock(lock, type, false)) { + osq_unlock(&lock->osq); + preempt_enable(); + return true; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + cpu_relax(); + } + + osq_unlock(&lock->osq); +fail: + preempt_enable(); + + /* + * If we fell out of the spin path because of need_resched(), + * reschedule now, before we try-lock again. 
This avoids getting + * scheduled out right after we obtained the lock. + */ + if (need_resched()) + schedule(); + + return false; +} + +#else /* CONFIG_LOCK_SPIN_ON_OWNER */ + +static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type) +{ + return false; +} + +#endif + +noinline +static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + int ret = 0; + + if (type == SIX_LOCK_write) { + EBUG_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + atomic_add(SIX_LOCK_HELD_write, &lock->state); + smp_mb__after_atomic(); + } + + trace_contention_begin(lock, 0); + lock_contended(&lock->dep_map, ip); + + if (six_optimistic_spin(lock, type)) + goto out; + + wait->task = current; + wait->lock_want = type; + wait->lock_acquired = false; + + raw_spin_lock(&lock->wait_lock); + six_set_bitmask(lock, SIX_LOCK_WAITING_read << type); + /* + * Retry taking the lock after taking waitlist lock, in case we raced + * with an unlock: + */ + ret = __do_six_trylock(lock, type, current, false); + if (ret <= 0) { + wait->start_time = local_clock(); + + if (!list_empty(&lock->wait_list)) { + struct six_lock_waiter *last = + list_last_entry(&lock->wait_list, + struct six_lock_waiter, list); + + if (time_before_eq64(wait->start_time, last->start_time)) + wait->start_time = last->start_time + 1; + } + + list_add_tail(&wait->list, &lock->wait_list); + } + raw_spin_unlock(&lock->wait_lock); + + if (unlikely(ret > 0)) { + ret = 0; + goto out; + } + + if (unlikely(ret < 0)) { + __six_lock_wakeup(lock, -ret - 1); + ret = 0; + } + + while (1) { + set_current_state(TASK_UNINTERRUPTIBLE); + + if (wait->lock_acquired) + break; + + ret = should_sleep_fn ? should_sleep_fn(lock, p) : 0; + if (unlikely(ret)) { + raw_spin_lock(&lock->wait_lock); + if (!wait->lock_acquired) + list_del(&wait->list); + raw_spin_unlock(&lock->wait_lock); + + if (unlikely(wait->lock_acquired)) + do_six_unlock_type(lock, type); + break; + } + + schedule(); + } + + __set_current_state(TASK_RUNNING); +out: + if (ret && type == SIX_LOCK_write) { + six_clear_bitmask(lock, SIX_LOCK_HELD_write); + six_lock_wakeup(lock, atomic_read(&lock->state), SIX_LOCK_read); + } + trace_contention_end(lock, 0); + + return ret; +} + +/** + * six_lock_ip_waiter - take a lock, with full waitlist interface + * @lock: lock to take + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @wait: pointer to wait object, which will be added to lock's waitlist + * @should_sleep_fn: callback run after adding to waitlist, immediately prior + * to scheduling + * @p: passed through to @should_sleep_fn + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * This is the most general six_lock() variant, with parameters to support full + * cycle detection for deadlock avoidance. + * + * The code calling this function must implement tracking of held locks, and the + * @wait object should be embedded into the struct that tracks held locks - + * which must also be accessible in a thread-safe way. + * + * @should_sleep_fn should invoke the cycle detector; it should walk each + * lock's waiters, and for each waiter recursively walk their held locks. + * + * When this function must block, @wait will be added to @lock's waitlist before + * calling trylock, and before calling @should_sleep_fn, and @wait will not be + * removed from the lock waitlist until the lock has been successfully acquired, + * or we abort. 
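+ *
+ * @should_sleep_fn has the shape "int fn(struct six_lock *lock, void *p)";
+ * returning nonzero aborts the lock attempt and is passed back as this
+ * function's return value.  An illustrative skeleton (the cycle detector
+ * itself, deadlock_would_occur(), is hypothetical):
+ *
+ *	static int check_for_cycle(struct six_lock *lock, void *p)
+ *	{
+ *		return deadlock_would_occur(p) ? -EDEADLK : 0;
+ *	}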
+ * + * @wait.start_time will be monotonically increasing for any given waitlist, and + * thus may be used as a loop cursor. + * + * Return: 0 on success, or the return code from @should_sleep_fn on failure. + */ +int six_lock_ip_waiter(struct six_lock *lock, enum six_lock_type type, + struct six_lock_waiter *wait, + six_lock_should_sleep_fn should_sleep_fn, void *p, + unsigned long ip) +{ + int ret; + + wait->start_time = 0; + + if (type != SIX_LOCK_write) + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, ip); + + ret = do_six_trylock(lock, type, true) ? 0 + : six_lock_slowpath(lock, type, wait, should_sleep_fn, p, ip); + + if (ret && type != SIX_LOCK_write) + six_release(&lock->dep_map, ip); + if (!ret) + lock_acquired(&lock->dep_map, ip); + + return ret; +} +EXPORT_SYMBOL_GPL(six_lock_ip_waiter); + +__always_inline +static void do_six_unlock_type(struct six_lock *lock, enum six_lock_type type) +{ + u32 state; + + if (type == SIX_LOCK_intent) + lock->owner = NULL; + + if (type == SIX_LOCK_read && + lock->readers) { + smp_mb(); /* unlock barrier */ + this_cpu_dec(*lock->readers); + smp_mb(); /* between unlocking and checking for waiters */ + state = atomic_read(&lock->state); + } else { + u32 v = l[type].lock_val; + + if (type != SIX_LOCK_read) + v += atomic_read(&lock->state) & SIX_LOCK_NOSPIN; + + EBUG_ON(!(atomic_read(&lock->state) & l[type].held_mask)); + state = atomic_sub_return_release(v, &lock->state); + } + + six_lock_wakeup(lock, state, l[type].unlock_wakeup); +} + +/** + * six_unlock_ip - drop a six lock + * @lock: lock to unlock + * @type: SIX_LOCK_read, SIX_LOCK_intent, or SIX_LOCK_write + * @ip: ip parameter for lockdep/lockstat, i.e. _THIS_IP_ + * + * When a lock is held multiple times (because six_lock_incement()) was used), + * this decrements the 'lock held' counter by one. 
+ * + * For example: + * six_lock_read(&foo->lock); read count 1 + * six_lock_increment(&foo->lock, SIX_LOCK_read); read count 2 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 1 + * six_lock_unlock(&foo->lock, SIX_LOCK_read); read count 0 + */ +void six_unlock_ip(struct six_lock *lock, enum six_lock_type type, unsigned long ip) +{ + EBUG_ON(type == SIX_LOCK_write && + !(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); + EBUG_ON((type == SIX_LOCK_write || + type == SIX_LOCK_intent) && + lock->owner != current); + + if (type != SIX_LOCK_write) + six_release(&lock->dep_map, ip); + else + lock->seq++; + + if (type == SIX_LOCK_intent && + lock->intent_lock_recurse) { + --lock->intent_lock_recurse; + return; + } + + do_six_unlock_type(lock, type); +} +EXPORT_SYMBOL_GPL(six_unlock_ip); + +/** + * six_lock_downgrade - convert an intent lock to a read lock + * @lock: lock to dowgrade + * + * @lock will have read count incremented and intent count decremented + */ +void six_lock_downgrade(struct six_lock *lock) +{ + six_lock_increment(lock, SIX_LOCK_read); + six_unlock_intent(lock); +} +EXPORT_SYMBOL_GPL(six_lock_downgrade); + +/** + * six_lock_tryupgrade - attempt to convert read lock to an intent lock + * @lock: lock to upgrade + * + * On success, @lock will have intent count incremented and read count + * decremented + * + * Return: true on success, false on failure + */ +bool six_lock_tryupgrade(struct six_lock *lock) +{ + u32 old = atomic_read(&lock->state), new; + + do { + new = old; + + if (new & SIX_LOCK_HELD_intent) + return false; + + if (!lock->readers) { + EBUG_ON(!(new & SIX_LOCK_HELD_read)); + new -= l[SIX_LOCK_read].lock_val; + } + + new |= SIX_LOCK_HELD_intent; + } while (!atomic_try_cmpxchg_acquire(&lock->state, &old, new)); + + if (lock->readers) + this_cpu_dec(*lock->readers); + + six_set_owner(lock, SIX_LOCK_intent, old, current); + + return true; +} +EXPORT_SYMBOL_GPL(six_lock_tryupgrade); + +/** + * six_trylock_convert - attempt to convert a held lock from one type to another + * @lock: lock to upgrade + * @from: SIX_LOCK_read or SIX_LOCK_intent + * @to: SIX_LOCK_read or SIX_LOCK_intent + * + * On success, @lock will have intent count incremented and read count + * decremented + * + * Return: true on success, false on failure + */ +bool six_trylock_convert(struct six_lock *lock, + enum six_lock_type from, + enum six_lock_type to) +{ + EBUG_ON(to == SIX_LOCK_write || from == SIX_LOCK_write); + + if (to == from) + return true; + + if (to == SIX_LOCK_read) { + six_lock_downgrade(lock); + return true; + } else { + return six_lock_tryupgrade(lock); + } +} +EXPORT_SYMBOL_GPL(six_trylock_convert); + +/** + * six_lock_increment - increase held lock count on a lock that is already held + * @lock: lock to increment + * @type: SIX_LOCK_read or SIX_LOCK_intent + * + * @lock must already be held, with a lock type that is greater than or equal to + * @type + * + * A corresponding six_unlock_type() call will be required for @lock to be fully + * unlocked. 
+ */ +void six_lock_increment(struct six_lock *lock, enum six_lock_type type) +{ + six_acquire(&lock->dep_map, 0, type == SIX_LOCK_read, _RET_IP_); + + /* XXX: assert already locked, and that we don't overflow: */ + + switch (type) { + case SIX_LOCK_read: + if (lock->readers) { + this_cpu_inc(*lock->readers); + } else { + EBUG_ON(!(atomic_read(&lock->state) & + (SIX_LOCK_HELD_read| + SIX_LOCK_HELD_intent))); + atomic_add(l[type].lock_val, &lock->state); + } + break; + case SIX_LOCK_intent: + EBUG_ON(!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent)); + lock->intent_lock_recurse++; + break; + case SIX_LOCK_write: + BUG(); + break; + } +} +EXPORT_SYMBOL_GPL(six_lock_increment); + +/** + * six_lock_wakeup_all - wake up all waiters on @lock + * @lock: lock to wake up waiters for + * + * Wakeing up waiters will cause them to re-run should_sleep_fn, which may then + * abort the lock operation. + * + * This function is never needed in a bug-free program; it's only useful in + * debug code, e.g. to determine if a cycle detector is at fault. + */ +void six_lock_wakeup_all(struct six_lock *lock) +{ + u32 state = atomic_read(&lock->state); + struct six_lock_waiter *w; + + six_lock_wakeup(lock, state, SIX_LOCK_read); + six_lock_wakeup(lock, state, SIX_LOCK_intent); + six_lock_wakeup(lock, state, SIX_LOCK_write); + + raw_spin_lock(&lock->wait_lock); + list_for_each_entry(w, &lock->wait_list, list) + wake_up_process(w->task); + raw_spin_unlock(&lock->wait_lock); +} +EXPORT_SYMBOL_GPL(six_lock_wakeup_all); + +/** + * six_lock_counts - return held lock counts, for each lock type + * @lock: lock to return counters for + * + * Return: the number of times a lock is held for read, intent and write. + */ +struct six_lock_count six_lock_counts(struct six_lock *lock) +{ + struct six_lock_count ret; + + ret.n[SIX_LOCK_read] = !lock->readers + ? atomic_read(&lock->state) & SIX_LOCK_HELD_read + : pcpu_read_count(lock); + ret.n[SIX_LOCK_intent] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_intent) + + lock->intent_lock_recurse; + ret.n[SIX_LOCK_write] = !!(atomic_read(&lock->state) & SIX_LOCK_HELD_write); + + return ret; +} +EXPORT_SYMBOL_GPL(six_lock_counts); + +/** + * six_lock_readers_add - directly manipulate reader count of a lock + * @lock: lock to add/subtract readers for + * @nr: reader count to add/subtract + * + * When an upper layer is implementing lock reentrency, we may have both read + * and intent locks on the same lock. + * + * When we need to take a write lock, the read locks will cause self-deadlock, + * because six locks themselves do not track which read locks are held by the + * current thread and which are held by a different thread - it does no + * per-thread tracking of held locks. + * + * The upper layer that is tracking held locks may however, if trylock() has + * failed, count up its own read locks, subtract them, take the write lock, and + * then re-add them. + * + * As in any other situation when taking a write lock, @lock must be held for + * intent one (or more) times, so @lock will never be left unlocked. 
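+ *
+ * Illustrative sequence (my_held_read_locks(), counting the read locks the
+ * upper layer knows it holds, is hypothetical):
+ *
+ *	nr = my_held_read_locks(lock);
+ *	six_lock_readers_add(lock, -nr);	subtract our read counts
+ *	... take the write lock ...
+ *	six_lock_readers_add(lock, nr);		and re-add them afterwards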
+ */ +void six_lock_readers_add(struct six_lock *lock, int nr) +{ + if (lock->readers) { + this_cpu_add(*lock->readers, nr); + } else { + EBUG_ON((int) (atomic_read(&lock->state) & SIX_LOCK_HELD_read) + nr < 0); + /* reader count starts at bit 0 */ + atomic_add(nr, &lock->state); + } +} +EXPORT_SYMBOL_GPL(six_lock_readers_add); + +/** + * six_lock_exit - release resources held by a lock prior to freeing + * @lock: lock to exit + * + * When a lock was initialized in percpu mode (SIX_OLCK_INIT_PCPU), this is + * required to free the percpu read counts. + */ +void six_lock_exit(struct six_lock *lock) +{ + WARN_ON(lock->readers && pcpu_read_count(lock)); + WARN_ON(atomic_read(&lock->state) & SIX_LOCK_HELD_read); + + free_percpu(lock->readers); + lock->readers = NULL; +} +EXPORT_SYMBOL_GPL(six_lock_exit); + +void __six_lock_init(struct six_lock *lock, const char *name, + struct lock_class_key *key, enum six_lock_init_flags flags) +{ + atomic_set(&lock->state, 0); + raw_spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + debug_check_no_locks_freed((void *) lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + + /* + * Don't assume that we have real percpu variables available in + * userspace: + */ +#ifdef __KERNEL__ + if (flags & SIX_LOCK_INIT_PCPU) { + /* + * We don't return an error here on memory allocation failure + * since percpu is an optimization, and locks will work with the + * same semantics in non-percpu mode: callers can check for + * failure if they wish by checking lock->readers, but generally + * will not want to treat it as an error. + */ + lock->readers = alloc_percpu(unsigned); + } +#endif +} +EXPORT_SYMBOL_GPL(__six_lock_init); diff --git a/kernel/module/main.c b/kernel/module/main.c index 4e2cf784c..7f7b5bedf 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1217,15 +1218,19 @@ static void *module_memory_alloc(unsigned int size, enum mod_mem_type type) return module_alloc(size); } -static void module_memory_free(void *ptr, enum mod_mem_type type) +static void module_memory_free(void *ptr, enum mod_mem_type type, + bool unload_codetags) { + if (!unload_codetags && mod_mem_type_is_core_data(type)) + return; + if (mod_mem_use_vmalloc(type)) vfree(ptr); else module_memfree(ptr); } -static void free_mod_mem(struct module *mod) +static void free_mod_mem(struct module *mod, bool unload_codetags) { for_each_mod_mem_type(type) { struct module_memory *mod_mem = &mod->mem[type]; @@ -1236,19 +1241,23 @@ static void free_mod_mem(struct module *mod) /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod_mem->base, type); + module_memory_free(mod_mem->base, type, + unload_codetags); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA); + module_memory_free(mod->mem[MOD_DATA].base, MOD_DATA, unload_codetags); } /* Free a module, remove from lists, etc. 
*/ static void free_module(struct module *mod) { + bool unload_codetags; + trace_module_free(mod); + unload_codetags = codetag_unload_module(mod); mod_sysfs_teardown(mod); /* @@ -1290,7 +1299,7 @@ static void free_module(struct module *mod) kfree(mod->args); percpu_modfree(mod); - free_mod_mem(mod); + free_mod_mem(mod, unload_codetags); } void *__symbol_get(const char *symbol) @@ -2292,7 +2301,7 @@ static int move_module(struct module *mod, struct load_info *info) return 0; out_enomem: for (t--; t >= 0; t--) - module_memory_free(mod->mem[t].base, t); + module_memory_free(mod->mem[t].base, t, true); return ret; } @@ -2422,7 +2431,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) percpu_modfree(mod); module_arch_freeing_init(mod); - free_mod_mem(mod); + free_mod_mem(mod, true); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -2974,6 +2983,8 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Get rid of temporary copy. */ free_copy(info, flags); + codetag_load_module(mod); + /* Done! */ trace_module_load(mod); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 9ed5ce989..4f6582487 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -151,6 +151,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *tsk, unsigned long *store, put_task_stack(tsk); return c.len; } +EXPORT_SYMBOL_GPL(stack_trace_save_tsk); /** * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array @@ -301,6 +302,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task, save_stack_trace_tsk(task, &trace); return trace.nr_entries; } +EXPORT_SYMBOL_GPL(stack_trace_save_tsk); /** * stack_trace_save_regs - Save a stack trace based on pt_regs into a storage array diff --git a/lib/Kconfig b/lib/Kconfig index 5c2da561c..f78bc8b42 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -505,6 +505,9 @@ config ASSOCIATIVE_ARRAY for more information. +config CLOSURES + bool + config HAS_IOMEM bool depends on !NO_IOMEM diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ce51d4dc6..a19ec6fd7 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -957,6 +957,36 @@ config DEBUG_STACKOVERFLOW If in doubt, say "N". +config CODE_TAGGING + bool + select KALLSYMS + +config MEM_ALLOC_PROFILING + bool "Enable memory allocation profiling" + default n + depends on PROC_FS + select CODE_TAGGING + select PAGE_EXTENSION + select SLAB_OBJ_EXT + help + Track allocation source code and record total allocation size + initiated at that code location. The mechanism can be used to track + memory leaks with a low performance and memory impact. + +config MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT + bool "Enable memory allocation profiling by default" + default y + depends on MEM_ALLOC_PROFILING + +config MEM_ALLOC_PROFILING_DEBUG + bool "Memory allocation profiler debugging" + default n + depends on MEM_ALLOC_PROFILING + select MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT + help + Adds warnings with helpful error messages for memory allocation + profiling. + source "lib/Kconfig.kasan" source "lib/Kconfig.kfence" source "lib/Kconfig.kmsan" @@ -1637,6 +1667,15 @@ config DEBUG_NOTIFIERS This is a relatively cheap check but if you care about maximum performance, say N. +config DEBUG_CLOSURES + bool "Debug closures (bcache async widgits)" + depends on CLOSURES + select DEBUG_FS + help + Keeps all active closures in a linked list and provides a debugfs + interface to list them, which makes it possible to see asynchronous + operations that get stuck. 
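CODE_TAGGING, selected by MEM_ALLOC_PROFILING above, works by having each tagged call site emit a small struct into a dedicated ELF section; lib/codetag.c, added further below in this patch, then walks the __start_<section>/__stop_<section> range (see get_section_range() and get_symbol() there, which resolve exactly those names through kallsyms). A compressed, illustrative sketch of the mechanism follows; struct my_tag and the "my_tags" section name are made up, standing in for struct codetag and sections such as "alloc_tags".

/* Stand-in for struct codetag; illustrative only. */
struct my_tag {
	const char *file;
	unsigned int line;
};

/* Each expansion drops one tag into the "my_tags" ELF section. */
#define DEFINE_MY_TAG()						\
({								\
	static struct my_tag _t					\
		__attribute__((section("my_tags"), used)) =	\
		{ __FILE__, __LINE__ };				\
	&_t;							\
})

/*
 * The linker generates start/stop bounds for sections whose names are
 * valid C identifiers; the codetag code resolves the same names at runtime.
 */
extern struct my_tag __start_my_tags[];
extern struct my_tag __stop_my_tags[];

static const struct my_tag *demo(void)
{
	return DEFINE_MY_TAG();		/* records this file and line */
}

static void walk_tags(void (*fn)(const struct my_tag *))
{
	const struct my_tag *t;

	for (t = __start_my_tags; t < __stop_my_tags; t++)
		fn(t);
}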
+ config BUG_ON_DATA_CORRUPTION bool "Trigger a BUG when data corruption is detected" select DEBUG_LIST @@ -1997,6 +2036,12 @@ config FAULT_INJECTION_STACKTRACE_FILTER help Provide stacktrace filter for fault-injection capabilities +config CODETAG_FAULT_INJECTION + bool "Code tagging based fault injection" + select CODE_TAGGING + help + Dynamic fault injection based on code tagging + config ARCH_HAS_KCOV bool help @@ -2123,6 +2168,15 @@ config CPUMASK_KUNIT_TEST If unsure, say N. +config MEAN_AND_VARIANCE_UNIT_TEST + tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS + depends on KUNIT + select MEAN_AND_VARIANCE + default KUNIT_ALL_TESTS + help + This option enables the kunit tests for mean_and_variance module. + If unsure, say N. + config TEST_LIST_SORT tristate "Linked list sorting test" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index 876fcdeae..fb1d20939 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -30,7 +30,7 @@ endif lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o timerqueue.o xarray.o \ maple_tree.o idr.o extable.o irq_regs.o argv_split.o \ - flex_proportions.o ratelimit.o show_mem.o \ + flex_proportions.o ratelimit.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ @@ -226,6 +226,11 @@ obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ of-reconfig-notifier-error-inject.o obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o +obj-$(CONFIG_CODE_TAGGING) += codetag.o +obj-$(CONFIG_MEM_ALLOC_PROFILING) += alloc_tag.o + +obj-$(CONFIG_CODETAG_FAULT_INJECTION) += dynamic_fault.o + lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o @@ -248,6 +253,8 @@ obj-$(CONFIG_ATOMIC64_SELFTEST) += atomic64_test.o obj-$(CONFIG_CPU_RMAP) += cpu_rmap.o +obj-$(CONFIG_CLOSURES) += closure.o + obj-$(CONFIG_DQL) += dynamic_queue_limits.o obj-$(CONFIG_GLOB) += glob.o diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c new file mode 100644 index 000000000..1ca90cff5 --- /dev/null +++ b/lib/alloc_tag.c @@ -0,0 +1,225 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include +#include + +static struct codetag_type *alloc_tag_cttype; + +DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, + mem_alloc_profiling_key); + +static void *allocinfo_start(struct seq_file *m, loff_t *pos) +{ + struct codetag_iterator *iter; + struct codetag *ct; + loff_t node = *pos; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + m->private = iter; + if (!iter) + return NULL; + + codetag_lock_module_list(alloc_tag_cttype, true); + *iter = codetag_get_ct_iter(alloc_tag_cttype); + while ((ct = codetag_next_ct(iter)) != NULL && node) + node--; + + return ct ? 
iter : NULL; +} + +static void *allocinfo_next(struct seq_file *m, void *arg, loff_t *pos) +{ + struct codetag_iterator *iter = (struct codetag_iterator *)arg; + struct codetag *ct = codetag_next_ct(iter); + + (*pos)++; + if (!ct) + return NULL; + + return iter; +} + +static void allocinfo_stop(struct seq_file *m, void *arg) +{ + struct codetag_iterator *iter = (struct codetag_iterator *)m->private; + + if (iter) { + codetag_lock_module_list(alloc_tag_cttype, false); + kfree(iter); + } +} + +static void alloc_tag_to_text(struct seq_buf *out, struct codetag *ct) +{ + struct alloc_tag *tag = ct_to_alloc_tag(ct); + s64 bytes = alloc_tag_read(tag); + char val[10], *p = val; + + if (bytes < 0) { + *p++ = '-'; + bytes = -bytes; + } + + string_get_size(bytes, 1, + STRING_SIZE_BASE2|STRING_SIZE_NOSPACE, + p, val + ARRAY_SIZE(val) - p); + + seq_buf_printf(out, "%8s ", val); + codetag_to_text(out, ct); + seq_buf_putc(out, ' '); + seq_buf_putc(out, '\n'); +} + +static int allocinfo_show(struct seq_file *m, void *arg) +{ + struct codetag_iterator *iter = (struct codetag_iterator *)arg; + char *bufp; + size_t n = seq_get_buf(m, &bufp); + struct seq_buf buf; + + seq_buf_init(&buf, bufp, n); + alloc_tag_to_text(&buf, iter->ct); + seq_commit(m, seq_buf_used(&buf)); + return 0; +} + +static const struct seq_operations allocinfo_seq_op = { + .start = allocinfo_start, + .next = allocinfo_next, + .stop = allocinfo_stop, + .show = allocinfo_show, +}; + +void alloc_tags_show_mem_report(struct seq_buf *s) +{ + struct codetag_iterator iter; + struct codetag *ct; + struct { + struct codetag *tag; + size_t bytes; + } tags[10], n; + unsigned int i, nr = 0; + + codetag_lock_module_list(alloc_tag_cttype, true); + iter = codetag_get_ct_iter(alloc_tag_cttype); + while ((ct = codetag_next_ct(&iter))) { + n.tag = ct; + n.bytes = alloc_tag_read(ct_to_alloc_tag(ct)); + + for (i = 0; i < nr; i++) + if (n.bytes > tags[i].bytes) + break; + + if (i < ARRAY_SIZE(tags)) { + nr -= nr == ARRAY_SIZE(tags); + memmove(&tags[i + 1], + &tags[i], + sizeof(tags[0]) * (nr - i)); + nr++; + tags[i] = n; + } + } + + for (i = 0; i < nr; i++) + alloc_tag_to_text(s, tags[i].tag); + + codetag_lock_module_list(alloc_tag_cttype, false); +} + +static void __init procfs_init(void) +{ + proc_create_seq("allocinfo", 0444, NULL, &allocinfo_seq_op); +} + +static void alloc_tag_module_load(struct codetag_type *cttype, struct codetag_module *cmod) +{ + struct codetag_iterator iter = codetag_get_ct_iter(cttype); + struct codetag *ct; + + for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { + if (iter.cmod != cmod) + continue; + + ct_to_alloc_tag(ct)->bytes_allocated = alloc_percpu(u64); + } +} + +static bool alloc_tag_module_unload(struct codetag_type *cttype, struct codetag_module *cmod) +{ + struct codetag_iterator iter = codetag_get_ct_iter(cttype); + bool module_unused = true; + struct alloc_tag *tag; + struct codetag *ct; + size_t bytes; + + for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { + if (iter.cmod != cmod) + continue; + + tag = ct_to_alloc_tag(ct); + bytes = alloc_tag_read(tag); + + if (!WARN(bytes, "%s:%u module %s func:%s has %zu allocated at module unload", + ct->filename, ct->lineno, ct->modname, ct->function, bytes)) + free_percpu(tag->bytes_allocated); + else + module_unused = false; + } + + return module_unused; +} + +static __init bool need_page_alloc_tagging(void) +{ + return true; +} + +static __init void init_page_alloc_tagging(void) +{ +} + +struct page_ext_operations page_alloc_tagging_ops = { + 
.size = sizeof(union codetag_ref), + .need = need_page_alloc_tagging, + .init = init_page_alloc_tagging, +}; +EXPORT_SYMBOL(page_alloc_tagging_ops); + +static struct ctl_table memory_allocation_profiling_sysctls[] = { + { + .procname = "mem_profiling", + .data = &mem_alloc_profiling_key, +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + .mode = 0444, +#else + .mode = 0644, +#endif + .proc_handler = proc_do_static_key, + }, + { } +}; + +static int __init alloc_tag_init(void) +{ + const struct codetag_type_desc desc = { + .section = "alloc_tags", + .tag_size = sizeof(struct alloc_tag), + .module_load = alloc_tag_module_load, + .module_unload = alloc_tag_module_unload, + }; + + alloc_tag_cttype = codetag_register_type(&desc); + if (IS_ERR_OR_NULL(alloc_tag_cttype)) + return PTR_ERR(alloc_tag_cttype); + + register_sysctl_init("vm", memory_allocation_profiling_sysctls); + procfs_init(); + + return 0; +} +module_init(alloc_tag_init); diff --git a/drivers/md/bcache/closure.c b/lib/closure.c similarity index 88% rename from drivers/md/bcache/closure.c rename to lib/closure.c index d8d9394a6..0855e698c 100644 --- a/drivers/md/bcache/closure.c +++ b/lib/closure.c @@ -6,13 +6,13 @@ * Copyright 2012 Google, Inc. */ +#include #include -#include +#include +#include #include #include -#include "closure.h" - static inline void closure_put_after_sub(struct closure *cl, int flags) { int r = flags & CLOSURE_REMAINING_MASK; @@ -45,6 +45,7 @@ void closure_sub(struct closure *cl, int v) { closure_put_after_sub(cl, atomic_sub_return(v, &cl->remaining)); } +EXPORT_SYMBOL(closure_sub); /* * closure_put - decrement a closure's refcount @@ -53,6 +54,7 @@ void closure_put(struct closure *cl) { closure_put_after_sub(cl, atomic_dec_return(&cl->remaining)); } +EXPORT_SYMBOL(closure_put); /* * closure_wake_up - wake up all closures on a wait list, without memory barrier @@ -74,6 +76,7 @@ void __closure_wake_up(struct closure_waitlist *wait_list) closure_sub(cl, CLOSURE_WAITING + 1); } } +EXPORT_SYMBOL(__closure_wake_up); /** * closure_wait - add a closure to a waitlist @@ -93,6 +96,7 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl) return true; } +EXPORT_SYMBOL(closure_wait); struct closure_syncer { struct task_struct *task; @@ -127,8 +131,9 @@ void __sched __closure_sync(struct closure *cl) __set_current_state(TASK_RUNNING); } +EXPORT_SYMBOL(__closure_sync); -#ifdef CONFIG_BCACHE_CLOSURES_DEBUG +#ifdef CONFIG_DEBUG_CLOSURES static LIST_HEAD(closure_list); static DEFINE_SPINLOCK(closure_list_lock); @@ -144,6 +149,7 @@ void closure_debug_create(struct closure *cl) list_add(&cl->all, &closure_list); spin_unlock_irqrestore(&closure_list_lock, flags); } +EXPORT_SYMBOL(closure_debug_create); void closure_debug_destroy(struct closure *cl) { @@ -156,8 +162,7 @@ void closure_debug_destroy(struct closure *cl) list_del(&cl->all); spin_unlock_irqrestore(&closure_list_lock, flags); } - -static struct dentry *closure_debug; +EXPORT_SYMBOL(closure_debug_destroy); static int debug_show(struct seq_file *f, void *data) { @@ -181,7 +186,7 @@ static int debug_show(struct seq_file *f, void *data) seq_printf(f, " W %pS\n", (void *) cl->waiting_on); - seq_printf(f, "\n"); + seq_puts(f, "\n"); } spin_unlock_irq(&closure_list_lock); @@ -190,18 +195,11 @@ static int debug_show(struct seq_file *f, void *data) DEFINE_SHOW_ATTRIBUTE(debug); -void __init closure_debug_init(void) +static int __init closure_debug_init(void) { - if (!IS_ERR_OR_NULL(bcache_debug)) - /* - * it is unnecessary to check return value of - * 
debugfs_create_file(), we should not care - * about this. - */ - closure_debug = debugfs_create_file( - "closures", 0400, bcache_debug, NULL, &debug_fops); + debugfs_create_file("closures", 0400, NULL, NULL, &debug_fops); + return 0; } -#endif +late_initcall(closure_debug_init) -MODULE_AUTHOR("Kent Overstreet "); -MODULE_LICENSE("GPL"); +#endif diff --git a/lib/codetag.c b/lib/codetag.c new file mode 100644 index 000000000..84f90f3b9 --- /dev/null +++ b/lib/codetag.c @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +struct codetag_type { + struct list_head link; + unsigned int count; + struct idr mod_idr; + struct rw_semaphore mod_lock; /* protects mod_idr */ + struct codetag_type_desc desc; +}; + +static DEFINE_MUTEX(codetag_lock); +static LIST_HEAD(codetag_types); + +void codetag_lock_module_list(struct codetag_type *cttype, bool lock) +{ + if (lock) + down_read(&cttype->mod_lock); + else + up_read(&cttype->mod_lock); +} + +struct codetag_iterator codetag_get_ct_iter(struct codetag_type *cttype) +{ + struct codetag_iterator iter = { + .cttype = cttype, + .cmod = NULL, + .mod_id = 0, + .ct = NULL, + }; + + return iter; +} + +static inline struct codetag *get_first_module_ct(struct codetag_module *cmod) +{ + return cmod->range.start < cmod->range.stop ? cmod->range.start : NULL; +} + +static inline +struct codetag *get_next_module_ct(struct codetag_iterator *iter) +{ + struct codetag *res = (struct codetag *) + ((char *)iter->ct + iter->cttype->desc.tag_size); + + return res < iter->cmod->range.stop ? res : NULL; +} + +struct codetag *codetag_next_ct(struct codetag_iterator *iter) +{ + struct codetag_type *cttype = iter->cttype; + struct codetag_module *cmod; + struct codetag *ct; + + lockdep_assert_held(&cttype->mod_lock); + + if (unlikely(idr_is_empty(&cttype->mod_idr))) + return NULL; + + ct = NULL; + while (true) { + cmod = idr_find(&cttype->mod_idr, iter->mod_id); + + /* If module was removed move to the next one */ + if (!cmod) + cmod = idr_get_next_ul(&cttype->mod_idr, + &iter->mod_id); + + /* Exit if no more modules */ + if (!cmod) + break; + + if (cmod != iter->cmod) { + iter->cmod = cmod; + ct = get_first_module_ct(cmod); + } else + ct = get_next_module_ct(iter); + + if (ct) + break; + + iter->mod_id++; + } + + iter->ct = ct; + return ct; +} + +void codetag_to_text(struct seq_buf *out, struct codetag *ct) +{ + seq_buf_printf(out, "%s:%u module:%s func:%s", + ct->filename, ct->lineno, + ct->modname, ct->function); +} + +static inline size_t range_size(const struct codetag_type *cttype, + const struct codetag_range *range) +{ + return ((char *)range->stop - (char *)range->start) / + cttype->desc.tag_size; +} + +static void *get_symbol(struct module *mod, const char *prefix, const char *name) +{ + char buf[64]; + void *ret; + int res; + + res = snprintf(buf, sizeof(buf), "%s%s", prefix, name); + if (WARN_ON(res < 1 || res > sizeof(buf))) + return NULL; + + preempt_disable(); + ret = mod ? 
+ (void *)find_kallsyms_symbol_value(mod, buf) : + (void *)kallsyms_lookup_name(buf); + preempt_enable(); + + return ret; +} + +static struct codetag_range get_section_range(struct module *mod, + const char *section) +{ + return (struct codetag_range) { + get_symbol(mod, "__start_", section), + get_symbol(mod, "__stop_", section), + }; +} + +static int codetag_module_init(struct codetag_type *cttype, struct module *mod) +{ + struct codetag_range range; + struct codetag_module *cmod; + int err; + + range = get_section_range(mod, cttype->desc.section); + if (!range.start || !range.stop) { + pr_warn("Failed to load code tags of type %s from the module %s\n", + cttype->desc.section, + mod ? mod->name : "(built-in)"); + return -EINVAL; + } + + /* Ignore empty ranges */ + if (range.start == range.stop) + return 0; + + BUG_ON(range.start > range.stop); + + cmod = kmalloc(sizeof(*cmod), GFP_KERNEL); + if (unlikely(!cmod)) + return -ENOMEM; + + cmod->mod = mod; + cmod->range = range; + + down_write(&cttype->mod_lock); + err = idr_alloc(&cttype->mod_idr, cmod, 0, 0, GFP_KERNEL); + if (err >= 0) { + cttype->count += range_size(cttype, &range); + if (cttype->desc.module_load) + cttype->desc.module_load(cttype, cmod); + } + up_write(&cttype->mod_lock); + + if (err < 0) { + kfree(cmod); + return err; + } + + return 0; +} + +struct codetag_type * +codetag_register_type(const struct codetag_type_desc *desc) +{ + struct codetag_type *cttype; + int err; + + BUG_ON(desc->tag_size <= 0); + + cttype = kzalloc(sizeof(*cttype), GFP_KERNEL); + if (unlikely(!cttype)) + return ERR_PTR(-ENOMEM); + + cttype->desc = *desc; + idr_init(&cttype->mod_idr); + init_rwsem(&cttype->mod_lock); + + err = codetag_module_init(cttype, NULL); + if (unlikely(err)) { + kfree(cttype); + return ERR_PTR(err); + } + + mutex_lock(&codetag_lock); + list_add_tail(&cttype->link, &codetag_types); + mutex_unlock(&codetag_lock); + + return cttype; +} + +void codetag_load_module(struct module *mod) +{ + struct codetag_type *cttype; + + if (!mod) + return; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) + codetag_module_init(cttype, mod); + mutex_unlock(&codetag_lock); +} + +bool codetag_unload_module(struct module *mod) +{ + struct codetag_type *cttype; + bool unload_ok = true; + + if (!mod) + return true; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + struct codetag_module *found = NULL; + struct codetag_module *cmod; + unsigned long mod_id, tmp; + + down_write(&cttype->mod_lock); + idr_for_each_entry_ul(&cttype->mod_idr, cmod, tmp, mod_id) { + if (cmod->mod && cmod->mod == mod) { + found = cmod; + break; + } + } + if (found) { + if (cttype->desc.module_unload) + if (!cttype->desc.module_unload(cttype, cmod)) + unload_ok = false; + + cttype->count -= range_size(cttype, &cmod->range); + idr_remove(&cttype->mod_idr, mod_id); + kfree(cmod); + } + up_write(&cttype->mod_lock); + } + mutex_unlock(&codetag_lock); + + return unload_ok; +} + +/* Codetag query parsing */ + +#define CODETAG_QUERY_TOKENS() \ + x(func) \ + x(file) \ + x(line) \ + x(module) \ + x(class) \ + x(index) + +enum tokens { +#define x(name) TOK_##name, + CODETAG_QUERY_TOKENS() +#undef x +}; + +static const char * const token_strs[] = { +#define x(name) #name, + CODETAG_QUERY_TOKENS() +#undef x + NULL +}; + +static int parse_range(char *str, unsigned int *first, unsigned int *last) +{ + char *first_str = str; + char *last_str = strchr(first_str, '-'); + + if (last_str) + *last_str++ = '\0'; + + if 
(kstrtouint(first_str, 10, first)) + return -EINVAL; + + if (!last_str) + *last = *first; + else if (kstrtouint(last_str, 10, last)) + return -EINVAL; + + return 0; +} + +char *codetag_query_parse(struct codetag_query *q, char *buf) +{ + while (1) { + char *p = buf; + char *str1 = strsep_no_empty(&p, " \t\r\n"); + char *str2 = strsep_no_empty(&p, " \t\r\n"); + int ret, token; + + if (!str1 || !str2) + break; + + token = match_string(token_strs, ARRAY_SIZE(token_strs), str1); + if (token < 0) + break; + + switch (token) { + case TOK_func: + q->function = str2; + break; + case TOK_file: + q->filename = str2; + break; + case TOK_line: + ret = parse_range(str2, &q->first_line, &q->last_line); + if (ret) + return ERR_PTR(ret); + q->match_line = true; + break; + case TOK_module: + q->module = str2; + break; + case TOK_class: + q->class = str2; + break; + case TOK_index: + ret = parse_range(str2, &q->first_index, &q->last_index); + if (ret) + return ERR_PTR(ret); + q->match_index = true; + break; + } + + buf = p; + } + + return buf; +} + +bool codetag_matches_query(struct codetag_query *q, + const struct codetag *ct, + const struct codetag_module *mod, + const char *class) +{ + size_t classlen = q->class ? strlen(q->class) : 0; + + if (q->module && + (!mod->mod || + strcmp(q->module, ct->modname))) + return false; + + if (q->filename && + strcmp(q->filename, ct->filename) && + strcmp(q->filename, kbasename(ct->filename))) + return false; + + if (q->function && + strcmp(q->function, ct->function)) + return false; + + /* match against the line number range */ + if (q->match_line && + (ct->lineno < q->first_line || + ct->lineno > q->last_line)) + return false; + + /* match against the class */ + if (classlen && + (strncmp(q->class, class, classlen) || + (class[classlen] && class[classlen] != ':'))) + return false; + + /* match against the fault index */ + if (q->match_index && + (q->cur_index < q->first_index || + q->cur_index > q->last_index)) { + q->cur_index++; + return false; + } + + q->cur_index++; + return true; +} diff --git a/lib/dynamic_fault.c b/lib/dynamic_fault.c new file mode 100644 index 000000000..c92374359 --- /dev/null +++ b/lib/dynamic_fault.c @@ -0,0 +1,371 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include +#include +#include +#include + +static struct codetag_type *cttype; + +bool __dynamic_fault_enabled(struct dfault *df) +{ + union dfault_state old, new; + unsigned int v = df->state.v; + bool ret; + + do { + old.v = new.v = v; + + if (new.enabled == DFAULT_disabled) + return false; + + ret = df->frequency + ? ++new.count >= df->frequency + : true; + if (ret) + new.count = 0; + if (ret && new.enabled == DFAULT_oneshot) + new.enabled = DFAULT_disabled; + } while ((v = cmpxchg(&df->state.v, old.v, new.v)) != old.v); + + if (ret) + pr_debug("returned true for %s:%u", df->tag.filename, df->tag.lineno); + + return ret; +} +EXPORT_SYMBOL(__dynamic_fault_enabled); + +static const char * const dfault_state_strs[] = { +#define x(n) #n, + DFAULT_STATES() +#undef x + NULL +}; + +static void dynamic_fault_to_text(struct seq_buf *out, struct dfault *df) +{ + codetag_to_text(out, &df->tag); + seq_buf_printf(out, "class:%s %s \"", df->class, + dfault_state_strs[df->state.enabled]); +} + +struct dfault_query { + struct codetag_query q; + + bool set_enabled:1; + unsigned int enabled:2; + + bool set_frequency:1; + unsigned int frequency; +}; + +/* + * Search the tables for _dfault's which match the given + * `query' and apply the `flags' and `mask' to them. 
Tells + * the user which dfault's were changed, or whether none + * were matched. + */ +static int dfault_change(struct dfault_query *query) +{ + struct codetag_iterator ct_iter = codetag_get_ct_iter(cttype); + struct codetag *ct; + unsigned int nfound = 0; + + codetag_lock_module_list(cttype, true); + + while ((ct = codetag_next_ct(&ct_iter))) { + struct dfault *df = container_of(ct, struct dfault, tag); + + if (!codetag_matches_query(&query->q, ct, ct_iter.cmod, df->class)) + continue; + + if (query->set_enabled && + query->enabled != df->state.enabled) { + if (query->enabled != DFAULT_disabled) + static_key_slow_inc(&df->enabled.key); + else if (df->state.enabled != DFAULT_disabled) + static_key_slow_dec(&df->enabled.key); + + df->state.enabled = query->enabled; + } + + if (query->set_frequency) + df->frequency = query->frequency; + + pr_debug("changed %s:%d [%s]%s #%d %s", + df->tag.filename, df->tag.lineno, df->tag.modname, + df->tag.function, query->q.cur_index, + dfault_state_strs[df->state.enabled]); + + nfound++; + } + + pr_debug("dfault: %u matches", nfound); + + codetag_lock_module_list(cttype, false); + + return nfound ? 0 : -ENOENT; +} + +#define DFAULT_TOKENS() \ + x(disable, 0) \ + x(enable, 0) \ + x(oneshot, 0) \ + x(frequency, 1) + +enum dfault_token { +#define x(name, nr_args) TOK_##name, + DFAULT_TOKENS() +#undef x +}; + +static const char * const dfault_token_strs[] = { +#define x(name, nr_args) #name, + DFAULT_TOKENS() +#undef x + NULL +}; + +static unsigned int dfault_token_nr_args[] = { +#define x(name, nr_args) nr_args, + DFAULT_TOKENS() +#undef x +}; + +static enum dfault_token str_to_token(const char *word, unsigned int nr_words) +{ + int tok = match_string(dfault_token_strs, ARRAY_SIZE(dfault_token_strs), word); + + if (tok < 0) { + pr_debug("unknown keyword \"%s\"", word); + return tok; + } + + if (nr_words < dfault_token_nr_args[tok]) { + pr_debug("insufficient arguments to \"%s\"", word); + return -EINVAL; + } + + return tok; +} + +static int dfault_parse_command(struct dfault_query *query, + enum dfault_token tok, + char *words[], size_t nr_words) +{ + unsigned int i = 0; + int ret; + + switch (tok) { + case TOK_disable: + query->set_enabled = true; + query->enabled = DFAULT_disabled; + break; + case TOK_enable: + query->set_enabled = true; + query->enabled = DFAULT_enabled; + break; + case TOK_oneshot: + query->set_enabled = true; + query->enabled = DFAULT_oneshot; + break; + case TOK_frequency: + query->set_frequency = 1; + ret = kstrtouint(words[i++], 10, &query->frequency); + if (ret) + return ret; + + if (!query->set_enabled) { + query->set_enabled = 1; + query->enabled = DFAULT_enabled; + } + break; + } + + return i; +} + +static int dynamic_fault_store(char *buf) +{ + struct dfault_query query = { NULL }; +#define MAXWORDS 9 + char *tok, *words[MAXWORDS]; + int ret, nr_words, i = 0; + + buf = codetag_query_parse(&query.q, buf); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + while ((tok = strsep_no_empty(&buf, " \t\r\n"))) { + if (nr_words == ARRAY_SIZE(words)) + return -EINVAL; /* ran out of words[] before bytes */ + words[nr_words++] = tok; + } + + while (i < nr_words) { + const char *tok_str = words[i++]; + enum dfault_token tok = str_to_token(tok_str, nr_words - i); + + if (tok < 0) + return tok; + + ret = dfault_parse_command(&query, tok, words + i, nr_words - i); + if (ret < 0) + return ret; + + i += ret; + BUG_ON(i > nr_words); + } + + pr_debug("q->function=\"%s\" q->filename=\"%s\" " + "q->module=\"%s\" q->line=%u-%u\n q->index=%u-%u", + 
query.q.function, query.q.filename, query.q.module, + query.q.first_line, query.q.last_line, + query.q.first_index, query.q.last_index); + + ret = dfault_change(&query); + if (ret < 0) + return ret; + + return 0; +} + +struct dfault_iter { + struct codetag_iterator ct_iter; + + struct seq_buf buf; + char rawbuf[4096]; +}; + +static int dfault_open(struct inode *inode, struct file *file) +{ + struct dfault_iter *iter; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + codetag_lock_module_list(cttype, true); + iter->ct_iter = codetag_get_ct_iter(cttype); + codetag_lock_module_list(cttype, false); + + file->private_data = iter; + seq_buf_init(&iter->buf, iter->rawbuf, sizeof(iter->rawbuf)); + return 0; +} + +static int dfault_release(struct inode *inode, struct file *file) +{ + struct dfault_iter *iter = file->private_data; + + kfree(iter); + return 0; +} + +struct user_buf { + char __user *buf; /* destination user buffer */ + size_t size; /* size of requested read */ + ssize_t ret; /* bytes read so far */ +}; + +static int flush_ubuf(struct user_buf *dst, struct seq_buf *src) +{ + if (src->len) { + size_t bytes = min_t(size_t, src->len, dst->size); + int err = copy_to_user(dst->buf, src->buffer, bytes); + + if (err) + return err; + + dst->ret += bytes; + dst->buf += bytes; + dst->size -= bytes; + src->len -= bytes; + memmove(src->buffer, src->buffer + bytes, src->len); + } + + return 0; +} + +static ssize_t dfault_read(struct file *file, char __user *ubuf, + size_t size, loff_t *ppos) +{ + struct dfault_iter *iter = file->private_data; + struct user_buf buf = { .buf = ubuf, .size = size }; + struct codetag *ct; + struct dfault *df; + int err; + + codetag_lock_module_list(iter->ct_iter.cttype, true); + while (1) { + err = flush_ubuf(&buf, &iter->buf); + if (err || !buf.size) + break; + + ct = codetag_next_ct(&iter->ct_iter); + if (!ct) + break; + + df = container_of(ct, struct dfault, tag); + dynamic_fault_to_text(&iter->buf, df); + seq_buf_putc(&iter->buf, '\n'); + } + codetag_lock_module_list(iter->ct_iter.cttype, false); + + return err ?: buf.ret; +} + +/* + * File_ops->write method for <debugfs>/dynamic_faults. Gathers the + * command text from userspace, parses and executes it.
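+ *
+ * Illustrative example (editor's sketch, not from the original submission;
+ * the query tokens follow codetag_query_parse() and the commands follow
+ * DFAULT_TOKENS() above): enable every fault injection point declared in
+ * one source file, firing once every 100 calls:
+ *
+ *   echo "file fs/bcachefs/btree_cache.c frequency 100 enable" > /sys/kernel/debug/dynamic_faults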
+ */ +static ssize_t dfault_write(struct file *file, const char __user *ubuf, + size_t len, loff_t *offp) +{ + char tmpbuf[256]; + + if (len == 0) + return 0; + /* we don't check *offp -- multiple writes() are allowed */ + if (len > sizeof(tmpbuf)-1) + return -E2BIG; + if (copy_from_user(tmpbuf, ubuf, len)) + return -EFAULT; + tmpbuf[len] = '\0'; + pr_debug("read %zu bytes from userspace", len); + + dynamic_fault_store(tmpbuf); + + *offp += len; + return len; +} + +static const struct file_operations dfault_ops = { + .owner = THIS_MODULE, + .open = dfault_open, + .release = dfault_release, + .read = dfault_read, + .write = dfault_write +}; + +static int __init dynamic_fault_init(void) +{ + const struct codetag_type_desc desc = { + .section = "dynamic_fault_tags", + .tag_size = sizeof(struct dfault), + }; + struct dentry *debugfs_file; + + cttype = codetag_register_type(&desc); + if (IS_ERR_OR_NULL(cttype)) + return PTR_ERR(cttype); + + debugfs_file = debugfs_create_file("dynamic_faults", 0666, NULL, NULL, &dfault_ops); + if (IS_ERR(debugfs_file)) + return PTR_ERR(debugfs_file); + + return 0; +} +module_init(dynamic_fault_init); diff --git a/lib/errname.c b/lib/errname.c index 67739b174..dd1b99855 100644 --- a/lib/errname.c +++ b/lib/errname.c @@ -228,3 +228,4 @@ const char *errname(int err) return err > 0 ? name + 1 : name; } +EXPORT_SYMBOL(errname); diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index f25eb111c..41f1bcdc4 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -1,4 +1,5 @@ +#include #include #include #include @@ -166,6 +167,10 @@ void *__genradix_iter_peek(struct genradix_iter *iter, struct genradix_root *r; struct genradix_node *n; unsigned level, i; + + if (iter->offset == SIZE_MAX) + return NULL; + restart: r = READ_ONCE(radix->root); if (!r) @@ -184,10 +189,17 @@ void *__genradix_iter_peek(struct genradix_iter *iter, (GENRADIX_ARY - 1); while (!n->children[i]) { + size_t objs_per_ptr = genradix_depth_size(level); + + if (iter->offset + objs_per_ptr < iter->offset) { + iter->offset = SIZE_MAX; + iter->pos = SIZE_MAX; + return NULL; + } + i++; - iter->offset = round_down(iter->offset + - genradix_depth_size(level), - genradix_depth_size(level)); + iter->offset = round_down(iter->offset + objs_per_ptr, + objs_per_ptr); iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; if (i == GENRADIX_ARY) @@ -201,6 +213,64 @@ void *__genradix_iter_peek(struct genradix_iter *iter, } EXPORT_SYMBOL(__genradix_iter_peek); +void *__genradix_iter_peek_prev(struct genradix_iter *iter, + struct __genradix *radix, + size_t objs_per_page, + size_t obj_size_plus_page_remainder) +{ + struct genradix_root *r; + struct genradix_node *n; + unsigned level, i; + + if (iter->offset == SIZE_MAX) + return NULL; + +restart: + r = READ_ONCE(radix->root); + if (!r) + return NULL; + + n = genradix_root_to_node(r); + level = genradix_root_to_depth(r); + + if (ilog2(iter->offset) >= genradix_depth_shift(level)) { + iter->offset = genradix_depth_size(level); + iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + + iter->offset -= obj_size_plus_page_remainder; + iter->pos--; + } + + while (level) { + level--; + + i = (iter->offset >> genradix_depth_shift(level)) & + (GENRADIX_ARY - 1); + + while (!n->children[i]) { + size_t objs_per_ptr = genradix_depth_size(level); + + iter->offset = round_down(iter->offset, objs_per_ptr); + iter->pos = (iter->offset >> PAGE_SHIFT) * objs_per_page; + + if (!iter->offset) + return NULL; + + iter->offset -= 
obj_size_plus_page_remainder; + iter->pos--; + + if (!i) + goto restart; + --i; + } + + n = n->children[i]; + } + + return &n->data[iter->offset & (PAGE_SIZE - 1)]; +} +EXPORT_SYMBOL(__genradix_iter_peek_prev); + static void genradix_free_recurse(struct genradix_node *n, unsigned level) { if (level) { diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 960223ed9..f9c4bba27 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -857,24 +857,37 @@ size_t iov_iter_zero(size_t bytes, struct iov_iter *i) } EXPORT_SYMBOL(iov_iter_zero); -size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes, - struct iov_iter *i) +size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, + size_t bytes, struct iov_iter *i) { - char *kaddr = kmap_atomic(page), *p = kaddr + offset; - if (!page_copy_sane(page, offset, bytes)) { - kunmap_atomic(kaddr); + size_t n, copied = 0; + + if (!page_copy_sane(page, offset, bytes)) return 0; - } - if (WARN_ON_ONCE(!i->data_source)) { - kunmap_atomic(kaddr); + if (WARN_ON_ONCE(!i->data_source)) return 0; - } - iterate_and_advance(i, bytes, base, len, off, - copyin(p + off, base, len), - memcpy_from_iter(i, p + off, base, len) - ) - kunmap_atomic(kaddr); - return bytes; + + do { + char *p; + + n = bytes - copied; + if (PageHighMem(page)) { + page += offset / PAGE_SIZE; + offset %= PAGE_SIZE; + n = min_t(size_t, n, PAGE_SIZE - offset); + } + + p = kmap_atomic(page) + offset; + iterate_and_advance(i, n, base, len, off, + copyin(p + off, base, len), + memcpy_from_iter(i, p + off, base, len) + ) + kunmap_atomic(p); + copied += n; + offset += n; + } while (PageHighMem(page) && copied != bytes && n > 0); + + return copied; } EXPORT_SYMBOL(copy_page_from_iter_atomic); diff --git a/lib/math/Kconfig b/lib/math/Kconfig index 0634b428d..7530ae9a3 100644 --- a/lib/math/Kconfig +++ b/lib/math/Kconfig @@ -15,3 +15,6 @@ config PRIME_NUMBERS config RATIONAL tristate + +config MEAN_AND_VARIANCE + tristate diff --git a/lib/math/Makefile b/lib/math/Makefile index bfac26ddf..2ef1487e0 100644 --- a/lib/math/Makefile +++ b/lib/math/Makefile @@ -4,6 +4,8 @@ obj-y += div64.o gcd.o lcm.o int_pow.o int_sqrt.o reciprocal_div.o obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o obj-$(CONFIG_RATIONAL) += rational.o +obj-$(CONFIG_MEAN_AND_VARIANCE) += mean_and_variance.o obj-$(CONFIG_TEST_DIV64) += test_div64.o obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o +obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o diff --git a/lib/math/mean_and_variance.c b/lib/math/mean_and_variance.c new file mode 100644 index 000000000..eb5f2ba03 --- /dev/null +++ b/lib/math/mean_and_variance.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions for incremental mean and variance. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * Copyright © 2022 Daniel B. Hill + * + * Author: Daniel B. 
Hill + * + * Description: + * + * This includes some incremental algorithms for mean and variance calculation. + * + * Derived from the paper: https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf + * + * Create a struct, and if it's the weighted variant, set the weight field (weight = 2^k). + * + * Use mean_and_variance[_weighted]_update() on the struct to update its state. + * + * Use the mean_and_variance[_weighted]_get_* functions to calculate the mean and variance; some computation + * is deferred to these functions for performance reasons. + * + * See lib/math/mean_and_variance_test.c for examples of usage. + * + * DO NOT access the mean and variance fields of the weighted variants directly. + * DO NOT change the weight after calling update. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +u128_u u128_div(u128_u n, u64 d) +{ + u128_u r; + u64 rem; + u64 hi = u128_hi(n); + u64 lo = u128_lo(n); + u64 h = hi & ((u64) U32_MAX << 32); + u64 l = (hi & (u64) U32_MAX) << 32; + + r = u128_shl(u64_to_u128(div64_u64_rem(h, d, &rem)), 64); + r = u128_add(r, u128_shl(u64_to_u128(div64_u64_rem(l + (rem << 32), d, &rem)), 32)); + r = u128_add(r, u64_to_u128(div64_u64_rem(lo + (rem << 32), d, &rem))); + return r; +} +EXPORT_SYMBOL_GPL(u128_div); + +/** + * mean_and_variance_get_mean() - get mean from @s + */ +s64 mean_and_variance_get_mean(struct mean_and_variance s) +{ + return s.n ? div64_u64(s.sum, s.n) : 0; +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_mean); + +/** + * mean_and_variance_get_variance() - get variance from @s1 + * + * see linked pdf equation 12. + */ +u64 mean_and_variance_get_variance(struct mean_and_variance s1) +{ + if (s1.n) { + u128_u s2 = u128_div(s1.sum_squares, s1.n); + u64 s3 = abs(mean_and_variance_get_mean(s1)); + + return u128_lo(u128_sub(s2, u128_square(s3))); + } else { + return 0; + } +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_variance); + +/** + * mean_and_variance_get_stddev() - get standard deviation from @s + */ +u32 mean_and_variance_get_stddev(struct mean_and_variance s) +{ + return int_sqrt64(mean_and_variance_get_variance(s)); +} +EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev); + +/** + * mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update() + * @s: mean and variance state to update + * @x: new sample + * + * see linked pdf: function derived from equations 140-143 where alpha = 2^w. + * values are stored bitshifted for performance and added precision. + */ +void mean_and_variance_weighted_update(struct mean_and_variance_weighted *s, s64 x) +{ + // previous weighted variance. + u8 w = s->weight; + u64 var_w0 = s->variance; + // new value weighted. + s64 x_w = x << w; + s64 diff_w = x_w - s->mean; + s64 diff = fast_divpow2(diff_w, w); + // new mean weighted.
+ s64 u_w1 = s->mean + diff; + + if (!s->init) { + s->mean = x_w; + s->variance = 0; + } else { + s->mean = u_w1; + s->variance = ((var_w0 << w) - var_w0 + ((diff_w * (x_w - u_w1)) >> w)) >> w; + } + s->init = true; +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update); + +/** + * mean_and_variance_weighted_get_mean() - get mean from @s + */ +s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s) +{ + return fast_divpow2(s.mean, s.weight); +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean); + +/** + * mean_and_variance_weighted_get_variance() -- get variance from @s + */ +u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s) +{ + // always positive don't need fast divpow2 + return s.variance >> s.weight; +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance); + +/** + * mean_and_variance_weighted_get_stddev() - get standard deviation from @s + */ +u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s) +{ + return int_sqrt64(mean_and_variance_weighted_get_variance(s)); +} +EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_stddev); + +MODULE_AUTHOR("Daniel B. Hill"); +MODULE_LICENSE("GPL"); diff --git a/lib/math/mean_and_variance_test.c b/lib/math/mean_and_variance_test.c new file mode 100644 index 000000000..f45591a16 --- /dev/null +++ b/lib/math/mean_and_variance_test.c @@ -0,0 +1,239 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include + +#define MAX_SQR (SQRT_U64_MAX*SQRT_U64_MAX) + +static void mean_and_variance_basic_test(struct kunit *test) +{ + struct mean_and_variance s = {}; + + mean_and_variance_update(&s, 2); + mean_and_variance_update(&s, 2); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 2); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 0); + KUNIT_EXPECT_EQ(test, s.n, 2); + + mean_and_variance_update(&s, 4); + mean_and_variance_update(&s, 4); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(s), 3); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_variance(s), 1); + KUNIT_EXPECT_EQ(test, s.n, 4); +} + +/* + * Test values computed using a spreadsheet from the psuedocode at the bottom: + * https://fanf2.user.srcf.net/hermes/doc/antiforgery/stats.pdf + */ + +static void mean_and_variance_weighted_test(struct kunit *test) +{ + struct mean_and_variance_weighted s = { .weight = 2 }; + + mean_and_variance_weighted_update(&s, 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + + mean_and_variance_weighted_update(&s, 20); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + + mean_and_variance_weighted_update(&s, 30); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); + + s = (struct mean_and_variance_weighted) { .weight = 2 }; + + mean_and_variance_weighted_update(&s, -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -10); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 0); + + mean_and_variance_weighted_update(&s, -20); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -12); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 18); + + mean_and_variance_weighted_update(&s, -30); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -16); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 72); +} + 
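+/*
+ * Illustrative usage sketch (editor's addition, not part of the original
+ * submission): the call sequence exercised by the tests above, with
+ * weight 2 (alpha = 2^2). State is kept pre-shifted by the weight, so
+ * after updates of 10 and 20 the stored mean is 50 and the stored
+ * variance is 75; the getters shift back down, giving a mean of
+ * 50 >> 2 = 12 and a variance of 75 >> 2 = 18, matching the
+ * expectations in mean_and_variance_weighted_test().
+ */
+static void __maybe_unused mean_and_variance_weighted_usage_sketch(void)
+{
+	struct mean_and_variance_weighted s = { .weight = 2 };
+
+	mean_and_variance_weighted_update(&s, 10);
+	mean_and_variance_weighted_update(&s, 20);
+
+	pr_info("mean=%lld variance=%llu stddev=%u\n",
+		mean_and_variance_weighted_get_mean(s),
+		mean_and_variance_weighted_get_variance(s),
+		mean_and_variance_weighted_get_stddev(s));
+}
+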
+static void mean_and_variance_weighted_advanced_test(struct kunit *test) +{ + struct mean_and_variance_weighted s = { .weight = 8 }; + s64 i; + + for (i = 10; i <= 100; i += 10) + mean_and_variance_weighted_update(&s, i); + + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), 11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); + + s = (struct mean_and_variance_weighted) { .weight = 8 }; + + for (i = -10; i >= -100; i -= 10) + mean_and_variance_weighted_update(&s, i); + + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(s), -11); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_variance(s), 107); +} + +static void do_mean_and_variance_test(struct kunit *test, + s64 initial_value, + s64 initial_n, + s64 n, + unsigned weight, + s64 *data, + s64 *mean, + s64 *stddev, + s64 *weighted_mean, + s64 *weighted_stddev) +{ + struct mean_and_variance mv = {}; + struct mean_and_variance_weighted vw = { .weight = weight }; + + for (unsigned i = 0; i < initial_n; i++) { + mean_and_variance_update(&mv, initial_value); + mean_and_variance_weighted_update(&vw, initial_value); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), 0); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), initial_value); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),0); + } + + for (unsigned i = 0; i < n; i++) { + mean_and_variance_update(&mv, data[i]); + mean_and_variance_weighted_update(&vw, data[i]); + + KUNIT_EXPECT_EQ(test, mean_and_variance_get_mean(mv), mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_get_stddev(mv), stddev[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_mean(vw), weighted_mean[i]); + KUNIT_EXPECT_EQ(test, mean_and_variance_weighted_get_stddev(vw),weighted_stddev[i]); + } + + KUNIT_EXPECT_EQ(test, mv.n, initial_n + n); +} + +/* Test behaviour with a single outlier, then back to steady state: */ +static void mean_and_variance_test_1(struct kunit *test) +{ + s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; + s64 mean[] = { 22, 21, 20, 19, 18, 17, 16 }; + s64 stddev[] = { 32, 29, 28, 27, 26, 25, 24 }; + s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; + s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +static void mean_and_variance_test_2(struct kunit *test) +{ + s64 d[] = { 100, 10, 10, 10, 10, 10, 10 }; + s64 mean[] = { 10, 10, 10, 10, 10, 10, 10 }; + s64 stddev[] = { 9, 9, 9, 9, 9, 9, 9 }; + s64 weighted_mean[] = { 32, 27, 22, 19, 17, 15, 14 }; + s64 weighted_stddev[] = { 38, 35, 31, 27, 24, 21, 18 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +/* Test behaviour where we switch from one steady state to another: */ +static void mean_and_variance_test_3(struct kunit *test) +{ + s64 d[] = { 100, 100, 100, 100, 100 }; + s64 mean[] = { 22, 32, 40, 46, 50 }; + s64 stddev[] = { 32, 39, 42, 44, 45 }; + s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; + s64 weighted_stddev[] = { 38, 44, 44, 41, 38 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +static void mean_and_variance_test_4(struct kunit *test) +{ + s64 d[] = { 100, 100, 100, 100, 100 }; + s64 mean[] = { 10, 11, 12, 13, 14 }; + s64 stddev[] = { 9, 13, 15, 17, 19 }; + s64 weighted_mean[] = { 32, 49, 61, 71, 78 }; + s64 
weighted_stddev[] = { 38, 44, 44, 41, 38 }; + + do_mean_and_variance_test(test, 10, 6, ARRAY_SIZE(d), 2, + d, mean, stddev, weighted_mean, weighted_stddev); +} + +static void mean_and_variance_fast_divpow2(struct kunit *test) +{ + s64 i; + u8 d; + + for (i = 0; i < 100; i++) { + d = 0; + KUNIT_EXPECT_EQ(test, fast_divpow2(i, d), div_u64(i, 1LLU << d)); + KUNIT_EXPECT_EQ(test, abs(fast_divpow2(-i, d)), div_u64(i, 1LLU << d)); + for (d = 1; d < 32; d++) { + KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(i, d)), + div_u64(i, 1 << d), "%lld %u", i, d); + KUNIT_EXPECT_EQ_MSG(test, abs(fast_divpow2(-i, d)), + div_u64(i, 1 << d), "%lld %u", -i, d); + } + } +} + +static void mean_and_variance_u128_basic_test(struct kunit *test) +{ + u128_u a = u64s_to_u128(0, U64_MAX); + u128_u a1 = u64s_to_u128(0, 1); + u128_u b = u64s_to_u128(1, 0); + u128_u c = u64s_to_u128(0, 1LLU << 63); + u128_u c2 = u64s_to_u128(U64_MAX, U64_MAX); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a, a1)), 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a, a1)), 0); + KUNIT_EXPECT_EQ(test, u128_hi(u128_add(a1, a)), 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_add(a1, a)), 0); + + KUNIT_EXPECT_EQ(test, u128_lo(u128_sub(b, a1)), U64_MAX); + KUNIT_EXPECT_EQ(test, u128_hi(u128_sub(b, a1)), 0); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_shl(c, 1)), 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_shl(c, 1)), 0); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_square(U64_MAX)), U64_MAX - 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_square(U64_MAX)), 1); + + KUNIT_EXPECT_EQ(test, u128_lo(u128_div(b, 2)), 1LLU << 63); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_div(c2, 2)), U64_MAX >> 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_div(c2, 2)), U64_MAX); + + KUNIT_EXPECT_EQ(test, u128_hi(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U32_MAX >> 1); + KUNIT_EXPECT_EQ(test, u128_lo(u128_div(u128_shl(u64_to_u128(U64_MAX), 32), 2)), U64_MAX << 31); +} + +static struct kunit_case mean_and_variance_test_cases[] = { + KUNIT_CASE(mean_and_variance_fast_divpow2), + KUNIT_CASE(mean_and_variance_u128_basic_test), + KUNIT_CASE(mean_and_variance_basic_test), + KUNIT_CASE(mean_and_variance_weighted_test), + KUNIT_CASE(mean_and_variance_weighted_advanced_test), + KUNIT_CASE(mean_and_variance_test_1), + KUNIT_CASE(mean_and_variance_test_2), + KUNIT_CASE(mean_and_variance_test_3), + KUNIT_CASE(mean_and_variance_test_4), + {} +}; + +static struct kunit_suite mean_and_variance_test_suite = { + .name = "mean and variance tests", + .test_cases = mean_and_variance_test_cases +}; + +kunit_test_suite(mean_and_variance_test_suite); + +MODULE_AUTHOR("Daniel B. 
Hill"); +MODULE_LICENSE("GPL"); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index 6ae2ba8e0..76e5bf9be 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -130,7 +130,7 @@ static union nested_table *nested_table_alloc(struct rhashtable *ht, if (ntbl) return ntbl; - ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC); + ntbl = kmalloc_noprof(PAGE_SIZE, GFP_ATOMIC|__GFP_ZERO); if (ntbl && leaf) { for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++) @@ -157,7 +157,7 @@ static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht, size = sizeof(*tbl) + sizeof(tbl->buckets[0]); - tbl = kzalloc(size, gfp); + tbl = kmalloc_noprof(size, gfp|__GFP_ZERO); if (!tbl) return NULL; @@ -180,8 +180,10 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size_t size; int i; static struct lock_class_key __key; + struct alloc_tag * __maybe_unused old = alloc_tag_save(ht->alloc_tag); - tbl = kvzalloc(struct_size(tbl, buckets, nbuckets), gfp); + tbl = kvmalloc_node_noprof(struct_size(tbl, buckets, nbuckets), + gfp|__GFP_ZERO, NUMA_NO_NODE); size = nbuckets; @@ -190,6 +192,8 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, nbuckets = 0; } + alloc_tag_restore(ht->alloc_tag, old); + if (tbl == NULL) return NULL; @@ -360,9 +364,14 @@ static int rhashtable_rehash_alloc(struct rhashtable *ht, ASSERT_RHT_MUTEX(ht); - new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL); - if (new_tbl == NULL) + new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL|__GFP_NOWARN); + if (new_tbl == NULL) { + WARN("rhashtable bucket table allocation failure for %ps", + (void *) ht->p.hashfn ?: + (void *) ht->p.obj_hashfn ?: + (void *) ht->p.obj_cmpfn); return -ENOMEM; + } err = rhashtable_rehash_attach(ht, old_tbl, new_tbl); if (err) @@ -975,7 +984,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) } /** - * rhashtable_init - initialize a new hash table + * rhashtable_init_noprof - initialize a new hash table * @ht: hash table to be initialized * @params: configuration parameters * @@ -1016,7 +1025,7 @@ static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed) * .obj_hashfn = my_hash_fn, * }; */ -int rhashtable_init(struct rhashtable *ht, +int rhashtable_init_noprof(struct rhashtable *ht, const struct rhashtable_params *params) { struct bucket_table *tbl; @@ -1031,6 +1040,10 @@ int rhashtable_init(struct rhashtable *ht, spin_lock_init(&ht->lock); memcpy(&ht->p, params, sizeof(*params)); +#ifdef CONFIG_MEM_ALLOC_PROFILING + ht->alloc_tag = current->alloc_tag; +#endif + if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); @@ -1076,26 +1089,26 @@ int rhashtable_init(struct rhashtable *ht, return 0; } -EXPORT_SYMBOL_GPL(rhashtable_init); +EXPORT_SYMBOL_GPL(rhashtable_init_noprof); /** - * rhltable_init - initialize a new hash list table + * rhltable_init_noprof - initialize a new hash list table * @hlt: hash list table to be initialized * @params: configuration parameters * * Initializes a new hash list table. * - * See documentation for rhashtable_init. + * See documentation for rhashtable_init_noprof. 
*/ -int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params) +int rhltable_init_noprof(struct rhltable *hlt, const struct rhashtable_params *params) { int err; - err = rhashtable_init(&hlt->ht, params); + err = rhashtable_init_noprof(&hlt->ht, params); hlt->ht.rhlist = true; return err; } -EXPORT_SYMBOL_GPL(rhltable_init); +EXPORT_SYMBOL_GPL(rhltable_init_noprof); static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj, void (*free_fn)(void *ptr, void *arg), @@ -1222,6 +1235,7 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( unsigned int index = hash & ((1 << tbl->nest) - 1); unsigned int size = tbl->size >> tbl->nest; union nested_table *ntbl; + struct alloc_tag * __maybe_unused old = alloc_tag_save(ht->alloc_tag); ntbl = nested_table_top(tbl); hash >>= tbl->nest; @@ -1236,6 +1250,8 @@ struct rhash_lock_head __rcu **rht_bucket_nested_insert( size <= (1 << shift)); } + alloc_tag_restore(ht->alloc_tag, old); + if (!ntbl) return NULL; diff --git a/lib/seq_buf.c b/lib/seq_buf.c index 45c450f42..2b87e9219 100644 --- a/lib/seq_buf.c +++ b/lib/seq_buf.c @@ -427,3 +427,13 @@ int seq_buf_hex_dump(struct seq_buf *s, const char *prefix_str, int prefix_type, } return 0; } + +void seq_buf_human_readable_u64(struct seq_buf *s, u64 v) +{ + char *buf; + size_t size = seq_buf_get_buf(s, &buf); + int wrote = string_get_size(v, 1, false, buf, size); + + seq_buf_commit(s, wrote); +} +EXPORT_SYMBOL(seq_buf_human_readable_u64); diff --git a/lib/string.c b/lib/string.c index 3d55ef890..dd4914baf 100644 --- a/lib/string.c +++ b/lib/string.c @@ -520,6 +520,25 @@ char *strsep(char **s, const char *ct) EXPORT_SYMBOL(strsep); #endif +/** + * strsep_no_empt - Split a string into tokens, but don't return empty tokens + * @s: The string to be searched + * @ct: The characters to search for + * + * strsep() updates @s to point after the token, ready for the next call. + */ +char *strsep_no_empty(char **s, const char *ct) +{ + char *ret; + + do { + ret = strsep(s, ct); + } while (ret && !*ret); + + return ret; +} +EXPORT_SYMBOL_GPL(strsep_no_empty); + #ifndef __HAVE_ARCH_MEMSET /** * memset - Fill a region of memory with the given value diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 230020a2e..d527ce455 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -19,11 +19,17 @@ #include #include +enum string_size_units { + STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ + STRING_UNITS_2, /* use binary powers of 2^10 */ +}; + /** * string_get_size - get the size in the specified units * @size: The size to be converted in blocks * @blk_size: Size of the block (use 1 for size in bytes) - * @units: units to use (powers of 1000 or 1024) + * @flags: units to use (powers of 1000 or 1024), whether to include space + * separator * @buf: buffer to format to * @len: length of buffer * @@ -31,15 +37,19 @@ * giving the size in the required units. @buf should have room for * at least 9 bytes and will always be zero terminated. * + * Return value: number of characters of output that would have been written + * (which may be greater than len, if output was truncated). */ -void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, - char *buf, int len) +int string_get_size(u64 size, u64 blk_size, enum string_size_flags flags, + char *buf, int len) { + enum string_size_units units = flags & flags & STRING_SIZE_BASE2 + ? 
STRING_UNITS_2 : STRING_UNITS_10; static const char *const units_10[] = { - "B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB" + "", "k", "M", "G", "T", "P", "E", "Z", "Y" }; static const char *const units_2[] = { - "B", "KiB", "MiB", "GiB", "TiB", "PiB", "EiB", "ZiB", "YiB" + "", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi", "Yi" }; static const char *const *const units_str[] = { [STRING_UNITS_10] = units_10, @@ -126,8 +136,10 @@ void string_get_size(u64 size, u64 blk_size, const enum string_size_units units, else unit = units_str[units][i]; - snprintf(buf, len, "%u%s %s", (u32)size, - tmp, unit); + return snprintf(buf, len, "%u%s%s%s%s", (u32)size, tmp, + (flags & STRING_SIZE_NOSPACE) ? "" : " ", + unit, + (flags & STRING_SIZE_NOBYTES) ? "" : "B"); } EXPORT_SYMBOL(string_get_size); diff --git a/lib/test-string_helpers.c b/lib/test-string_helpers.c index 9a68849a5..0b01ffca9 100644 --- a/lib/test-string_helpers.c +++ b/lib/test-string_helpers.c @@ -507,8 +507,8 @@ static __init void __test_string_get_size(const u64 size, const u64 blk_size, char buf10[string_get_size_maxbuf]; char buf2[string_get_size_maxbuf]; - string_get_size(size, blk_size, STRING_UNITS_10, buf10, sizeof(buf10)); - string_get_size(size, blk_size, STRING_UNITS_2, buf2, sizeof(buf2)); + string_get_size(size, blk_size, 0, buf10, sizeof(buf10)); + string_get_size(size, blk_size, STRING_SIZE_BASE2, buf2, sizeof(buf2)); test_string_get_size_check("STRING_UNITS_10", exp_result10, buf10, size, blk_size); diff --git a/mm/Makefile b/mm/Makefile index e29afc890..e2ecfe0ea 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -53,7 +53,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ mm_init.o percpu.o slab_common.o \ compaction.o \ interval_tree.o list_lru.o workingset.o \ - debug.o gup.o mmap_lock.o $(mmu-y) + debug.o gup.o mmap_lock.o show_mem.o $(mmu-y) # Give 'page_alloc' its own module-parameter namespace page-alloc-y := page_alloc.o diff --git a/mm/compaction.c b/mm/compaction.c index c8bcdea15..09dd56a94 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1684,8 +1684,8 @@ static void isolate_freepages(struct compact_control *cc) * This is a migrate-callback that "allocates" freepages by taking pages * from the isolated freelists in the block we are migrating to. */ -static struct page *compaction_alloc(struct page *migratepage, - unsigned long data) +static struct page *compaction_alloc_noprof(struct page *migratepage, + unsigned long data) { struct compact_control *cc = (struct compact_control *)data; struct page *freepage; @@ -1704,6 +1704,12 @@ static struct page *compaction_alloc(struct page *migratepage, return freepage; } +static struct page *compaction_alloc(struct page *migratepage, + unsigned long data) +{ + return alloc_hooks(compaction_alloc_noprof(migratepage, data)); +} + /* * This is a migrate-callback that "frees" freepages back to the isolated * freelist. 
All pages on the freelist are from the same zone, so there is no diff --git a/mm/filemap.c b/mm/filemap.c index 83dda76d1..e5c81c0cf 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -958,7 +958,7 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio, EXPORT_SYMBOL_GPL(filemap_add_folio); #ifdef CONFIG_NUMA -struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) +struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order) { int n; struct folio *folio; @@ -973,9 +973,9 @@ struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order) return folio; } - return folio_alloc(gfp, order); + return folio_alloc_noprof(gfp, order); } -EXPORT_SYMBOL(filemap_alloc_folio); +EXPORT_SYMBOL(filemap_alloc_folio_noprof); #endif /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 624671aaa..221cce005 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -2557,6 +2558,7 @@ static void __split_huge_page(struct page *page, struct list_head *list, /* Caller disabled irqs, so they are still disabled here */ split_page_owner(head, nr); + pgalloc_tag_split(head, nr); /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f791076da..3e5a604ee 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3246,7 +3246,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) if (i == h->max_huge_pages_node[nid]) return; - string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + string_get_size(huge_page_size(h), 1, STRING_SIZE_BASE2, buf, 32); pr_warn("HugeTLB: allocating %u of page size %s failed node%d. Only allocated %lu hugepages.\n", h->max_huge_pages_node[nid], buf, nid, i); h->max_huge_pages -= (h->max_huge_pages_node[nid] - i); @@ -3308,7 +3308,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (i < h->max_huge_pages) { char buf[32]; - string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + string_get_size(huge_page_size(h), 1, STRING_SIZE_BASE2, buf, 32); pr_warn("HugeTLB: allocating %lu of page size %s failed. 
Only allocated %lu hugepages.\n", h->max_huge_pages, buf, i); h->max_huge_pages = i; @@ -3354,7 +3354,7 @@ static void __init report_hugepages(void) for_each_hstate(h) { char buf[32]; - string_get_size(huge_page_size(h), 1, STRING_UNITS_2, buf, 32); + string_get_size(huge_page_size(h), 1, STRING_SIZE_BASE2, buf, 32); pr_info("HugeTLB: registered %s page size, pre-allocated %ld pages\n", buf, h->free_huge_pages); pr_info("HugeTLB: %d KiB vmemmap can be freed for a %s page\n", @@ -4245,7 +4245,7 @@ static int __init hugetlb_init(void) char buf[32]; string_get_size(huge_page_size(&default_hstate), - 1, STRING_UNITS_2, buf, 32); + 1, STRING_SIZE_BASE2, buf, 32); pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n", default_hstate.max_huge_pages, buf); pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n", diff --git a/mm/kfence/core.c b/mm/kfence/core.c index dad3c0eb7..aea6fa145 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -590,9 +590,9 @@ static unsigned long kfence_init_pool(void) continue; __folio_set_slab(slab_folio(slab)); -#ifdef CONFIG_MEMCG - slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg | - MEMCG_DATA_OBJCGS; +#ifdef CONFIG_MEMCG_KMEM + slab->obj_exts = (unsigned long)&kfence_metadata[i / 2 - 1].obj_exts | + MEMCG_DATA_OBJEXTS; #endif } @@ -634,8 +634,8 @@ static unsigned long kfence_init_pool(void) if (!i || (i % 2)) continue; -#ifdef CONFIG_MEMCG - slab->memcg_data = 0; +#ifdef CONFIG_MEMCG_KMEM + slab->obj_exts = 0; #endif __folio_clear_slab(slab_folio(slab)); } @@ -1093,8 +1093,8 @@ void __kfence_free(void *addr) { struct kfence_metadata *meta = addr_to_metadata((unsigned long)addr); -#ifdef CONFIG_MEMCG - KFENCE_WARN_ON(meta->objcg); +#ifdef CONFIG_MEMCG_KMEM + KFENCE_WARN_ON(meta->obj_exts.objcg); #endif /* * If the objects of the cache are SLAB_TYPESAFE_BY_RCU, defer freeing diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h index 392fb273e..b02d2cb96 100644 --- a/mm/kfence/kfence.h +++ b/mm/kfence/kfence.h @@ -97,8 +97,8 @@ struct kfence_metadata { struct kfence_track free_track; /* For updating alloc_covered on frees. 
*/ u32 alloc_stack_hash; -#ifdef CONFIG_MEMCG - struct obj_cgroup *objcg; +#ifdef CONFIG_MEMCG_KMEM + struct slabobj_ext obj_exts; #endif }; diff --git a/mm/madvise.c b/mm/madvise.c index b5ffbaf61..e08639a7c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1311,6 +1311,64 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, madvise_vma_anon_name); } #endif /* CONFIG_ANON_VMA_NAME */ + +static noinline unsigned long test_alloc(unsigned long in1, unsigned long in2, size_t size) +{ + switch (in1) + { + case (1): + return __get_free_pages(GFP_KERNEL, 0); + case (2): + return (unsigned long)kmalloc(size, GFP_KERNEL | __GFP_ACCOUNT); + default: + printk("test_alloc invoked with args in1=%lu in2=%lu\n", + in1, in2); + return 0; + } +} + +static noinline void test_free(unsigned long in1, unsigned long in2, unsigned long addr) +{ + switch (in1) + { + case (1): + free_page(addr); + break; + case (2): + kfree((void*)addr); + break; + default: + printk("test_free invoked with args in1=%lu in2=%lu\n", + in1, in2); + break; + } +} + +#define MADV_TEST 25 +static noinline int alloc_bench(unsigned long in1, unsigned long in2) +{ + int i, batch, iter; + unsigned long addr[10]; + + for (iter = 0; iter < 1000000; iter++) { + size_t size = 8; + for (batch = 0; batch < 30; batch++) { + for (i = 0; i < 10; i++) { + addr[i] = test_alloc(in1, in2, size); + } + for (i = 0; i < 10; i++) { + test_free(in1, in2, addr[i]); + } + size += 8; + } + if (fatal_signal_pending(current)) + return -EINTR; + //cond_resched(); + } + + return 0; +} + /* * The madvise(2) system call. * @@ -1390,6 +1448,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh size_t len; struct blk_plug plug; + if (behavior == MADV_TEST) + return alloc_bench(start, len_in); + if (!madvise_behavior_valid(behavior)) return -EINVAL; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b27e245a..f2a7fe718 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2892,13 +2892,6 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) } #ifdef CONFIG_MEMCG_KMEM -/* - * The allocated objcg pointers array is not accounted directly. - * Moreover, it should not come from DMA buffer and is not readily - * reclaimable. So those GFP bits should be masked off. - */ -#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) - /* * mod_objcg_mlstate() may be called with irq enabled, so * mod_memcg_lruvec_state() should be used. @@ -2917,62 +2910,27 @@ static inline void mod_objcg_mlstate(struct obj_cgroup *objcg, rcu_read_unlock(); } -int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, - gfp_t gfp, bool new_slab) -{ - unsigned int objects = objs_per_slab(s, slab); - unsigned long memcg_data; - void *vec; - - gfp &= ~OBJCGS_CLEAR_MASK; - vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp, - slab_nid(slab)); - if (!vec) - return -ENOMEM; - - memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS; - if (new_slab) { - /* - * If the slab is brand new and nobody can yet access its - * memcg_data, no synchronization is required and memcg_data can - * be simply assigned. - */ - slab->memcg_data = memcg_data; - } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) { - /* - * If the slab is already in use, somebody can allocate and - * assign obj_cgroups in parallel. In this case the existing - * objcg vector should be reused. 
- */ - kfree(vec); - return 0; - } - - kmemleak_not_leak(vec); - return 0; -} - static __always_inline struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) { /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in - * slab->memcg_data. + * slab->obj_exts. */ if (folio_test_slab(folio)) { - struct obj_cgroup **objcgs; + struct slabobj_ext *obj_exts; struct slab *slab; unsigned int off; slab = folio_slab(folio); - objcgs = slab_objcgs(slab); - if (!objcgs) + obj_exts = slab_obj_exts(slab); + if (!obj_exts) return NULL; off = obj_to_index(slab->slab_cache, slab, p); - if (objcgs[off]) - return obj_cgroup_memcg(objcgs[off]); + if (obj_exts[off].objcg) + return obj_cgroup_memcg(obj_exts[off].objcg); return NULL; } @@ -2980,7 +2938,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p) /* * folio_memcg_check() is used here, because in theory we can encounter * a folio where the slab flag has been cleared already, but - * slab->memcg_data has not been freed yet + * slab->obj_exts has not been freed yet * folio_memcg_check() will guarantee that a proper memory * cgroup pointer or NULL will be returned. */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 1756389a0..aaf767767 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2109,7 +2109,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, { struct page *page; - page = __alloc_pages(gfp, order, nid, NULL); + page = __alloc_pages_noprof(gfp, order, nid, NULL); /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ if (!static_branch_likely(&vm_numa_stat_key)) return page; @@ -2135,15 +2135,15 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + page = __alloc_pages_noprof(preferred_gfp, order, nid, &pol->nodes); if (!page) - page = __alloc_pages(gfp, order, nid, NULL); + page = __alloc_pages_noprof(gfp, order, nid, NULL); return page; } /** - * vma_alloc_folio - Allocate a folio for a VMA. + * vma_alloc_folio_noprof - Allocate a folio for a VMA. * @gfp: GFP flags. * @order: Order of the folio. * @vma: Pointer to VMA or NULL if not available. @@ -2157,7 +2157,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * * Return: The folio on success or NULL if allocation fails. */ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, +struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage) { struct mempolicy *pol; @@ -2228,7 +2228,7 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, * memory with both reclaim and compact as well. */ if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) - folio = __folio_alloc(gfp, order, hpage_node, + folio = __folio_alloc_noprof(gfp, order, hpage_node, nmask); goto out; @@ -2237,15 +2237,15 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); - folio = __folio_alloc(gfp, order, preferred_nid, nmask); + folio = __folio_alloc_noprof(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: return folio; } -EXPORT_SYMBOL(vma_alloc_folio); +EXPORT_SYMBOL(vma_alloc_folio_noprof); /** - * alloc_pages - Allocate pages. 
+ * alloc_pages_noprof - Allocate pages. * @gfp: GFP flags. * @order: Power of two of number of pages to allocate. * @@ -2258,7 +2258,7 @@ EXPORT_SYMBOL(vma_alloc_folio); * flags are used. * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages(gfp_t gfp, unsigned order) +struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; struct page *page; @@ -2276,23 +2276,23 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) page = alloc_pages_preferred_many(gfp, order, policy_node(gfp, pol, numa_node_id()), pol); else - page = __alloc_pages(gfp, order, + page = __alloc_pages_noprof(gfp, order, policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); return page; } -EXPORT_SYMBOL(alloc_pages); +EXPORT_SYMBOL(alloc_pages_noprof); -struct folio *folio_alloc(gfp_t gfp, unsigned order) +struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { - struct page *page = alloc_pages(gfp | __GFP_COMP, order); + struct page *page = alloc_pages_noprof(gfp | __GFP_COMP, order); if (page && order > 1) prep_transhuge_page(page); return (struct folio *)page; } -EXPORT_SYMBOL(folio_alloc); +EXPORT_SYMBOL(folio_alloc_noprof); static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, struct mempolicy *pol, unsigned long nr_pages, @@ -2311,13 +2311,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp, for (i = 0; i < nodes; i++) { if (delta) { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node + 1, NULL, page_array); delta--; } else { - nr_allocated = __alloc_pages_bulk(gfp, + nr_allocated = alloc_pages_bulk_noprof(gfp, interleave_nodes(pol), NULL, nr_pages_per_node, NULL, page_array); } @@ -2339,11 +2339,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes, + nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes, nr_pages, NULL, page_array); if (nr_allocated < nr_pages) - nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL, + nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL, nr_pages - nr_allocated, NULL, page_array + nr_allocated); return nr_allocated; @@ -2355,7 +2355,7 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid, * It can accelerate memory allocation especially interleaving * allocate memory. */ -unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, +unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; @@ -2371,7 +2371,7 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); - return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), + return alloc_pages_bulk_noprof(gfp, policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol), nr_pages, NULL, page_array); } diff --git a/mm/mempool.c b/mm/mempool.c index 734bcf5af..4fd949178 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -230,17 +230,17 @@ EXPORT_SYMBOL(mempool_init_node); * * Return: %0 on success, negative error code otherwise. 
*/ -int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) +int mempool_init_noprof(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data) { return mempool_init_node(pool, min_nr, alloc_fn, free_fn, pool_data, GFP_KERNEL, NUMA_NO_NODE); } -EXPORT_SYMBOL(mempool_init); +EXPORT_SYMBOL(mempool_init_noprof); /** - * mempool_create - create a memory pool + * mempool_create_node - create a memory pool * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. @@ -255,17 +255,9 @@ EXPORT_SYMBOL(mempool_init); * * Return: pointer to the created memory pool object or %NULL on error. */ -mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data) -{ - return mempool_create_node(min_nr, alloc_fn, free_fn, pool_data, - GFP_KERNEL, NUMA_NO_NODE); -} -EXPORT_SYMBOL(mempool_create); - -mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, - mempool_free_t *free_fn, void *pool_data, - gfp_t gfp_mask, int node_id) +mempool_t *mempool_create_node_noprof(int min_nr, mempool_alloc_t *alloc_fn, + mempool_free_t *free_fn, void *pool_data, + gfp_t gfp_mask, int node_id) { mempool_t *pool; @@ -281,7 +273,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, return pool; } -EXPORT_SYMBOL(mempool_create_node); +EXPORT_SYMBOL(mempool_create_node_noprof); /** * mempool_resize - resize an existing memory pool @@ -377,7 +369,7 @@ EXPORT_SYMBOL(mempool_resize); * * Return: pointer to the allocated element or %NULL on error. */ -void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) +void *mempool_alloc_noprof(mempool_t *pool, gfp_t gfp_mask) { void *element; unsigned long flags; @@ -444,7 +436,7 @@ void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) finish_wait(&pool->wait, &wait); goto repeat_alloc; } -EXPORT_SYMBOL(mempool_alloc); +EXPORT_SYMBOL(mempool_alloc_noprof); /** * mempool_free - return an element to the pool. @@ -515,7 +507,7 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data) { struct kmem_cache *mem = pool_data; VM_BUG_ON(mem->ctor); - return kmem_cache_alloc(mem, gfp_mask); + return kmem_cache_alloc_noprof(mem, gfp_mask); } EXPORT_SYMBOL(mempool_alloc_slab); @@ -533,7 +525,7 @@ EXPORT_SYMBOL(mempool_free_slab); void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { size_t size = (size_t)pool_data; - return kmalloc(size, gfp_mask); + return kmalloc_noprof(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); @@ -550,7 +542,7 @@ EXPORT_SYMBOL(mempool_kfree); void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data) { int order = (int)(long)pool_data; - return alloc_pages(gfp_mask, order); + return alloc_pages_noprof(gfp_mask, order); } EXPORT_SYMBOL(mempool_alloc_pages); diff --git a/mm/mm_init.c b/mm/mm_init.c index 7f7f9c677..42135fad4 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include "internal.h" diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 044e1eed7..f2657245e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -168,27 +168,6 @@ static bool oom_unkillable_task(struct task_struct *p) return false; } -/* - * Check whether unreclaimable slab amount is greater than - * all user memory(LRU pages). - * dump_unreclaimable_slab() could help in the case that - * oom due to too much unreclaimable slab used by kernel. 
-*/ -static bool should_dump_unreclaim_slab(void) -{ - unsigned long nr_lru; - - nr_lru = global_node_page_state(NR_ACTIVE_ANON) + - global_node_page_state(NR_INACTIVE_ANON) + - global_node_page_state(NR_ACTIVE_FILE) + - global_node_page_state(NR_INACTIVE_FILE) + - global_node_page_state(NR_ISOLATED_ANON) + - global_node_page_state(NR_ISOLATED_FILE) + - global_node_page_state(NR_UNEVICTABLE); - - return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru); -} - /** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate @@ -462,8 +441,6 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) mem_cgroup_print_oom_meminfo(oc->memcg); else { __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); - if (should_dump_unreclaim_slab()) - dump_unreclaimable_slab(); } if (sysctl_oom_dump_tasks) dump_tasks(oc); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 47421bedc..e20ef7a00 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include @@ -1259,6 +1260,7 @@ static __always_inline bool free_pages_prepare(struct page *page, __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); page_table_check_free(page, order); + pgalloc_tag_sub(page, order); return false; } @@ -1301,6 +1303,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); page_table_check_free(page, order); + pgalloc_tag_sub(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -1730,6 +1733,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, set_page_owner(page, order, gfp_flags); page_table_check_alloc(page, order); + pgalloc_tag_add(page, current, order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -2790,6 +2794,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, 1 << order); + pgalloc_tag_split(page, 1 << order); split_page_memcg(page, 1 << order); } EXPORT_SYMBOL_GPL(split_page); @@ -4577,7 +4582,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, * * Returns the number of pages on the list or array. */ -unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, +unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid, nodemask_t *nodemask, int nr_pages, struct list_head *page_list, struct page **page_array) @@ -4713,7 +4718,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, pcp_trylock_finish(UP_flags); failed: - page = __alloc_pages(gfp, 0, preferred_nid, nodemask); + page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask); if (page) { if (page_list) list_add(&page->lru, page_list); @@ -4724,13 +4729,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, goto out; } -EXPORT_SYMBOL_GPL(__alloc_pages_bulk); +EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof); /* * This is the 'heart' of the zoned buddy allocator. 
*/ -struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, - nodemask_t *nodemask) +struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order, + int preferred_nid, nodemask_t *nodemask) { struct page *page; unsigned int alloc_flags = ALLOC_WMARK_LOW; @@ -4792,41 +4797,41 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid, return page; } -EXPORT_SYMBOL(__alloc_pages); +EXPORT_SYMBOL(__alloc_pages_noprof); -struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid, +struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_nid, nodemask_t *nodemask) { - struct page *page = __alloc_pages(gfp | __GFP_COMP, order, + struct page *page = __alloc_pages_noprof(gfp | __GFP_COMP, order, preferred_nid, nodemask); if (page && order > 1) prep_transhuge_page(page); return (struct folio *)page; } -EXPORT_SYMBOL(__folio_alloc); +EXPORT_SYMBOL(__folio_alloc_noprof); /* * Common helper functions. Never use with __GFP_HIGHMEM because the returned * address cannot represent highmem pages. Use alloc_pages and then kmap if * you need to access high mem. */ -unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) +unsigned long get_free_pages_noprof(gfp_t gfp_mask, unsigned int order) { struct page *page; - page = alloc_pages(gfp_mask & ~__GFP_HIGHMEM, order); + page = alloc_pages_noprof(gfp_mask & ~__GFP_HIGHMEM, order); if (!page) return 0; return (unsigned long) page_address(page); } -EXPORT_SYMBOL(__get_free_pages); +EXPORT_SYMBOL(get_free_pages_noprof); -unsigned long get_zeroed_page(gfp_t gfp_mask) +unsigned long get_zeroed_page_noprof(gfp_t gfp_mask) { - return __get_free_page(gfp_mask | __GFP_ZERO); + return get_free_pages_noprof(gfp_mask | __GFP_ZERO, 0); } -EXPORT_SYMBOL(get_zeroed_page); +EXPORT_SYMBOL(get_zeroed_page_noprof); /** * __free_pages - Free pages allocated with alloc_pages(). @@ -5006,6 +5011,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, 1 << order); + pgalloc_tag_split(page, 1 << order); split_page_memcg(page, 1 << order); while (page < --last) set_page_refcounted(last); @@ -5018,7 +5024,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, } /** - * alloc_pages_exact - allocate an exact number physically-contiguous pages. + * alloc_pages_exact_noprof - allocate an exact number physically-contiguous pages. * @size: the number of bytes to allocate * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP * @@ -5032,7 +5038,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, * * Return: pointer to the allocated area or %NULL in case of error. */ -void *alloc_pages_exact(size_t size, gfp_t gfp_mask) +void *alloc_pages_exact_noprof(size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); unsigned long addr; @@ -5040,13 +5046,13 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - addr = __get_free_pages(gfp_mask, order); + addr = get_free_pages_noprof(gfp_mask, order); return make_alloc_exact(addr, order, size); } -EXPORT_SYMBOL(alloc_pages_exact); +EXPORT_SYMBOL(alloc_pages_exact_noprof); /** - * alloc_pages_exact_nid - allocate an exact number of physically-contiguous + * alloc_pages_exact_nid_noprof - allocate an exact number of physically-contiguous * pages on a node. 
* @nid: the preferred node ID where memory should be allocated * @size: the number of bytes to allocate @@ -5057,7 +5063,7 @@ EXPORT_SYMBOL(alloc_pages_exact); * * Return: pointer to the allocated area or %NULL in case of error. */ -void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) +void * __meminit alloc_pages_exact_nid_noprof(int nid, size_t size, gfp_t gfp_mask) { unsigned int order = get_order(size); struct page *p; @@ -5065,7 +5071,7 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM))) gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM); - p = alloc_pages_node(nid, gfp_mask, order); + p = alloc_pages_node_noprof(nid, gfp_mask, order); if (!p) return NULL; return make_alloc_exact((unsigned long)page_address(p), order, size); @@ -6738,7 +6744,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, } /** - * alloc_contig_range() -- tries to allocate given range of pages + * alloc_contig_range_noprof() -- tries to allocate given range of pages * @start: start PFN to allocate * @end: one-past-the-last PFN to allocate * @migratetype: migratetype of the underlying pageblocks (either @@ -6758,7 +6764,7 @@ int __alloc_contig_migrate_range(struct compact_control *cc, * pages which PFN is in [start, end) are allocated for the caller and * need to be freed with free_contig_range(). */ -int alloc_contig_range(unsigned long start, unsigned long end, +int alloc_contig_range_noprof(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask) { unsigned long outer_start, outer_end; @@ -6882,15 +6888,15 @@ int alloc_contig_range(unsigned long start, unsigned long end, undo_isolate_page_range(start, end, migratetype); return ret; } -EXPORT_SYMBOL(alloc_contig_range); +EXPORT_SYMBOL(alloc_contig_range_noprof); static int __alloc_contig_pages(unsigned long start_pfn, unsigned long nr_pages, gfp_t gfp_mask) { unsigned long end_pfn = start_pfn + nr_pages; - return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE, - gfp_mask); + return alloc_contig_range_noprof(start_pfn, end_pfn, MIGRATE_MOVABLE, + gfp_mask); } static bool pfn_range_valid_contig(struct zone *z, unsigned long start_pfn, @@ -6925,7 +6931,7 @@ static bool zone_spans_last_pfn(const struct zone *zone, } /** - * alloc_contig_pages() -- tries to find and allocate contiguous range of pages + * alloc_contig_pages_noprof() -- tries to find and allocate contiguous range of pages * @nr_pages: Number of contiguous pages to allocate * @gfp_mask: GFP mask to limit search and used during compaction * @nid: Target node @@ -6945,8 +6951,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, * * Return: pointer to contiguous pages on success, or NULL if not successful. 
*/ -struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, - int nid, nodemask_t *nodemask) +struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { unsigned long ret, pfn, flags; struct zonelist *zonelist; diff --git a/mm/page_ext.c b/mm/page_ext.c index dc1626be4..6c8ad6e12 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -10,6 +10,7 @@ #include #include #include +#include /* * struct page extension @@ -82,6 +83,9 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + &page_alloc_tagging_ops, +#endif #ifdef CONFIG_PAGE_TABLE_CHECK &page_table_check_ops, #endif @@ -92,7 +96,16 @@ unsigned long page_ext_size; static unsigned long total_usage; static struct page_ext *lookup_page_ext(const struct page *page); +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG +/* + * To ensure correct allocation tagging for pages, page_ext should be available + * before the first page allocation. Otherwise early task stacks will be + * allocated before page_ext initialization and missing tags will be flagged. + */ +bool early_page_ext __meminitdata = true; +#else bool early_page_ext __meminitdata; +#endif static int __init setup_early_page_ext(char *str) { early_page_ext = true; diff --git a/mm/page_owner.c b/mm/page_owner.c index 31169b3e7..8b6086c66 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -372,7 +372,7 @@ static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, if (!memcg_data) goto out_unlock; - if (memcg_data & MEMCG_DATA_OBJCGS) + if (memcg_data & MEMCG_DATA_OBJEXTS) ret += scnprintf(kbuf + ret, count - ret, "Slab cache page\n"); diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index f9847c131..c5d1d6723 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -32,6 +32,19 @@ struct pcpu_block_md { int nr_bits; /* total bits responsible for */ }; +struct pcpuobj_ext { +#ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *cgroup; +#endif +#ifdef CONFIG_MEM_ALLOC_PROFILING + union codetag_ref tag; +#endif +}; + +#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MEM_ALLOC_PROFILING) +#define NEED_PCPUOBJ_EXT +#endif + struct pcpu_chunk { #ifdef CONFIG_PERCPU_STATS int nr_alloc; /* # of allocations */ @@ -57,8 +70,8 @@ struct pcpu_chunk { int end_offset; /* additional area required to have the region end page aligned */ -#ifdef CONFIG_MEMCG_KMEM - struct obj_cgroup **obj_cgroups; /* vector of object cgroups */ +#ifdef NEED_PCPUOBJ_EXT + struct pcpuobj_ext *obj_exts; /* vector of object cgroups */ #endif int nr_pages; /* # of pages served by this chunk */ @@ -67,6 +80,15 @@ struct pcpu_chunk { unsigned long populated[]; /* populated bitmap */ }; +static inline bool need_pcpuobj_ext(void) +{ + if (IS_ENABLED(CONFIG_MEM_ALLOC_PROFILING)) + return true; + if (!mem_cgroup_kmem_disabled()) + return true; + return false; +} + extern spinlock_t pcpu_lock; extern struct list_head *pcpu_chunk_lists; diff --git a/mm/percpu.c b/mm/percpu.c index 28e07ede4..2298f38d4 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1392,9 +1392,9 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); -#ifdef CONFIG_MEMCG_KMEM +#ifdef NEED_PCPUOBJ_EXT /* first chunk is free to use */ - chunk->obj_cgroups = NULL; + chunk->obj_exts = NULL; #endif pcpu_init_md_blocks(chunk); @@ -1463,12 +1463,12 @@ static struct 
pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) if (!chunk->md_blocks) goto md_blocks_fail; -#ifdef CONFIG_MEMCG_KMEM - if (!mem_cgroup_kmem_disabled()) { - chunk->obj_cgroups = +#ifdef NEED_PCPUOBJ_EXT + if (need_pcpuobj_ext()) { + chunk->obj_exts = pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * - sizeof(struct obj_cgroup *), gfp); - if (!chunk->obj_cgroups) + sizeof(struct pcpuobj_ext), gfp); + if (!chunk->obj_exts) goto objcg_fail; } #endif @@ -1480,7 +1480,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) return chunk; -#ifdef CONFIG_MEMCG_KMEM +#ifdef NEED_PCPUOBJ_EXT objcg_fail: pcpu_mem_free(chunk->md_blocks); #endif @@ -1498,8 +1498,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; -#ifdef CONFIG_MEMCG_KMEM - pcpu_mem_free(chunk->obj_cgroups); +#ifdef NEED_PCPUOBJ_EXT + pcpu_mem_free(chunk->obj_exts); #endif pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); @@ -1648,8 +1648,8 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, if (!objcg) return; - if (likely(chunk && chunk->obj_cgroups)) { - chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; + if (likely(chunk && chunk->obj_exts)) { + chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = objcg; rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, @@ -1665,13 +1665,13 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { struct obj_cgroup *objcg; - if (unlikely(!chunk->obj_cgroups)) + if (unlikely(!chunk->obj_exts)) return; - objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; + objcg = chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup; if (!objcg) return; - chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; + chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].cgroup = NULL; obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); @@ -1701,8 +1701,34 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) } #endif /* CONFIG_MEMCG_KMEM */ +#ifdef CONFIG_MEM_ALLOC_PROFILING +static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, + size_t size) +{ + if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) { + alloc_tag_add(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, + current->alloc_tag, size); + } +} + +static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ + if (mem_alloc_profiling_enabled() && likely(chunk->obj_exts)) + alloc_tag_sub_noalloc(&chunk->obj_exts[off >> PCPU_MIN_ALLOC_SHIFT].tag, size); +} +#else +static void pcpu_alloc_tag_alloc_hook(struct pcpu_chunk *chunk, int off, + size_t size) +{ +} + +static void pcpu_alloc_tag_free_hook(struct pcpu_chunk *chunk, int off, size_t size) +{ +} +#endif + /** - * pcpu_alloc - the percpu allocator + * pcpu_alloc_noprof - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available @@ -1716,7 +1742,7 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ -static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, +void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, gfp_t gfp) { gfp_t pcpu_gfp; @@ -1883,6 +1909,8 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); + pcpu_alloc_tag_alloc_hook(chunk, off, size); + return ptr; fail_unlock: @@ -1909,61 +1937,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } - -/** - * __alloc_percpu_gfp - allocate dynamic percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * @gfp: allocation flags - * - * Allocate zero-filled percpu area of @size bytes aligned at @align. If - * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can - * be called from any context but is a lot more likely to fail. If @gfp - * has __GFP_NOWARN then no warning will be triggered on invalid or failed - * allocation requests. - * - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. - */ -void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) -{ - return pcpu_alloc(size, align, false, gfp); -} -EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); - -/** - * __alloc_percpu - allocate dynamic percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * - * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). - */ -void __percpu *__alloc_percpu(size_t size, size_t align) -{ - return pcpu_alloc(size, align, false, GFP_KERNEL); -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -/** - * __alloc_reserved_percpu - allocate reserved percpu area - * @size: size of area to allocate in bytes - * @align: alignment of area (max PAGE_SIZE) - * - * Allocate zero-filled percpu area of @size bytes aligned at @align - * from reserved percpu area if arch has set it up; otherwise, - * allocation is served from the same dynamic area. Might sleep. - * Might trigger writeouts. - * - * CONTEXT: - * Does GFP_KERNEL allocation. - * - * RETURNS: - * Percpu pointer to the allocated area on success, NULL on failure. 
- */ -void __percpu *__alloc_reserved_percpu(size_t size, size_t align) -{ - return pcpu_alloc(size, align, true, GFP_KERNEL); -} +EXPORT_SYMBOL_GPL(pcpu_alloc_noprof); /** * pcpu_balance_free - manage the amount of free chunks @@ -2273,6 +2247,8 @@ void free_percpu(void __percpu *ptr) size = pcpu_free_area(chunk, off); + pcpu_alloc_tag_free_hook(chunk, off, size); + pcpu_memcg_free_hook(chunk, off, size); /* diff --git a/lib/show_mem.c b/mm/show_mem.c similarity index 57% rename from lib/show_mem.c rename to mm/show_mem.c index 1485c87be..de209c55d 100644 --- a/lib/show_mem.c +++ b/mm/show_mem.c @@ -7,11 +7,15 @@ #include #include +#include + +#include "slab.h" void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long total = 0, reserved = 0, highmem = 0; struct zone *zone; + char *buf; printk("Mem-Info:\n"); __show_free_areas(filter, nodemask, max_zone_idx); @@ -34,4 +38,37 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) #ifdef CONFIG_MEMORY_FAILURE printk("%lu pages hwpoisoned\n", atomic_long_read(&num_poisoned_pages)); #endif + + buf = kmalloc(4096, GFP_ATOMIC); + if (buf) { + struct seq_buf s; + + printk("Unreclaimable slab info:\n"); + seq_buf_init(&s, buf, 4096); + dump_unreclaimable_slab(&s); + seq_buf_terminate(&s); + printk("%s", buf); + + printk("Shrinkers:\n"); + seq_buf_init(&s, buf, 4096); + shrinkers_to_text(&s); + seq_buf_terminate(&s); + printk("%s", buf); + + kfree(buf); + } +#ifdef CONFIG_MEM_ALLOC_PROFILING + { + struct seq_buf s; + char *buf = kmalloc(4096, GFP_ATOMIC); + + if (buf) { + printk("Memory allocations:\n"); + seq_buf_init(&s, buf, 4096); + alloc_tags_show_mem_report(&s); + printk("%s", buf); + kfree(buf); + } + } +#endif } diff --git a/mm/slab.c b/mm/slab.c index bb57f7fdb..d02d2dd27 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1232,7 +1232,7 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache, "kmem_cache", offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *), - SLAB_HWCACHE_ALIGN, 0, 0); + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); list_add(&kmem_cache->list, &slab_caches); slab_state = PARTIAL; @@ -3367,9 +3367,11 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp, unsigned long caller) { + struct slab *slab = virt_to_slab(objp); bool init; - memcg_slab_free_hook(cachep, virt_to_slab(objp), &objp, 1); + memcg_slab_free_hook(cachep, slab, &objp, 1); + alloc_tagging_slab_free_hook(cachep, slab, &objp, 1); if (is_kfence_address(objp)) { kmemleak_free_recursive(objp, cachep->flags); @@ -3446,18 +3448,18 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, return ret; } -void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) +void *kmem_cache_alloc_noprof(struct kmem_cache *cachep, gfp_t flags) { return __kmem_cache_alloc_lru(cachep, NULL, flags); } -EXPORT_SYMBOL(kmem_cache_alloc); +EXPORT_SYMBOL(kmem_cache_alloc_noprof); -void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, +void *kmem_cache_alloc_lru_noprof(struct kmem_cache *cachep, struct list_lru *lru, gfp_t flags) { return __kmem_cache_alloc_lru(cachep, lru, flags); } -EXPORT_SYMBOL(kmem_cache_alloc_lru); +EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); static __always_inline void cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t flags, @@ -3469,8 +3471,8 @@ cache_alloc_debugcheck_after_bulk(struct kmem_cache *s, gfp_t 
flags, p[i] = cache_alloc_debugcheck_after(s, flags, p[i], caller); } -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { struct obj_cgroup *objcg = NULL; unsigned long irqflags; @@ -3508,7 +3510,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, kmem_cache_free_bulk(s, i, p); return 0; } -EXPORT_SYMBOL(kmem_cache_alloc_bulk); +EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); /** * kmem_cache_alloc_node - Allocate an object on the specified node @@ -3523,7 +3525,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); * * Return: pointer to the new object or %NULL in case of error */ -void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) +void *kmem_cache_alloc_node_noprof(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, NULL, flags, nodeid, cachep->object_size, _RET_IP_); @@ -3531,7 +3533,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node); +EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); void *__kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, size_t orig_size, diff --git a/mm/slab.h b/mm/slab.h index f01ac256a..bc2d3429d 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -57,8 +57,8 @@ struct slab { #endif atomic_t __page_refcount; -#ifdef CONFIG_MEMCG - unsigned long memcg_data; +#ifdef CONFIG_SLAB_OBJ_EXT + unsigned long obj_exts; #endif }; @@ -67,8 +67,8 @@ struct slab { SLAB_MATCH(flags, __page_flags); SLAB_MATCH(compound_head, slab_cache); /* Ensure bit 0 is clear */ SLAB_MATCH(_refcount, __page_refcount); -#ifdef CONFIG_MEMCG -SLAB_MATCH(memcg_data, memcg_data); +#ifdef CONFIG_SLAB_OBJ_EXT +SLAB_MATCH(memcg_data, obj_exts); #endif #undef SLAB_MATCH static_assert(sizeof(struct slab) <= sizeof(struct page)); @@ -390,36 +390,198 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla return false; } -#ifdef CONFIG_MEMCG_KMEM +#ifdef CONFIG_SLAB_OBJ_EXT + /* - * slab_objcgs - get the object cgroups vector associated with a slab + * slab_obj_exts - get the pointer to the slab object extension vector + * associated with a slab. * @slab: a pointer to the slab struct * - * Returns a pointer to the object cgroups vector associated with the slab, + * Returns a pointer to the object extension vector associated with the slab, * or NULL if no such vector has been associated yet. 
*/ -static inline struct obj_cgroup **slab_objcgs(struct slab *slab) +static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { - unsigned long memcg_data = READ_ONCE(slab->memcg_data); + unsigned long obj_exts = READ_ONCE(slab->obj_exts); - VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS), +#ifdef CONFIG_MEMCG + VM_BUG_ON_PAGE(obj_exts && !(obj_exts & MEMCG_DATA_OBJEXTS), slab_page(slab)); - VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab)); + VM_BUG_ON_PAGE(obj_exts & MEMCG_DATA_KMEM, slab_page(slab)); - return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK); +#endif + return (struct slabobj_ext *)(obj_exts & ~OBJEXTS_FLAGS_MASK); } -int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s, - gfp_t gfp, bool new_slab); -void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, - enum node_stat_item idx, int nr); +int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab); + + +#ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG + +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) +{ + struct slabobj_ext *slab_exts; + struct slab *obj_exts_slab; + + obj_exts_slab = virt_to_slab(obj_exts); + slab_exts = slab_obj_exts(obj_exts_slab); + if (slab_exts) { + unsigned int offs = obj_to_index(obj_exts_slab->slab_cache, + obj_exts_slab, obj_exts); + /* codetag should be NULL */ + WARN_ON(slab_exts[offs].ref.ct); + set_codetag_empty(&slab_exts[offs].ref); + } +} + +static inline void mark_failed_objexts_alloc(struct slab *slab) +{ + slab->obj_exts = OBJEXTS_ALLOC_FAIL; +} + +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) +{ + /* + * If vector previously failed to allocate then we have live + * objects with no tag reference. Mark all references in this + * vector as empty to avoid warnings later on. + */ + if (obj_exts & OBJEXTS_ALLOC_FAIL) { + unsigned int i; + + for (i = 0; i < objects; i++) + set_codetag_empty(&vec[i].ref); + } +} + + +#else /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + +static inline void mark_objexts_empty(struct slabobj_ext *obj_exts) {} +static inline void mark_failed_objexts_alloc(struct slab *slab) {} +static inline void handle_failed_objexts_alloc(unsigned long obj_exts, + struct slabobj_ext *vec, unsigned int objects) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING_DEBUG */ + +static inline bool need_slab_obj_ext(void) +{ +#ifdef CONFIG_MEM_ALLOC_PROFILING + if (mem_alloc_profiling_enabled()) + return true; +#endif + /* + * CONFIG_MEMCG_KMEM creates vector of obj_cgroup objects conditionally + * inside memcg_slab_post_alloc_hook. No other users for now. + */ + return false; +} + +static inline void free_slab_obj_exts(struct slab *slab) +{ + struct slabobj_ext *obj_exts; + + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; + + /* + * obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its + * corresponding extension will be NULL. alloc_tag_sub() will throw a + * warning if slab has extensions but the extension of an object is + * NULL, therefore replace NULL with CODETAG_EMPTY to indicate that + * the extension for obj_exts is expected to be NULL. 
+ */ + mark_objexts_empty(obj_exts); + kfree(obj_exts); + slab->obj_exts = 0; +} + +static inline struct slabobj_ext * +prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) +{ + struct slab *slab; + + if (!p) + return NULL; + + if (!need_slab_obj_ext()) + return NULL; + + if (s->flags & SLAB_NO_OBJ_EXT) + return NULL; -static inline void memcg_free_slab_cgroups(struct slab *slab) + if (flags & __GFP_NO_OBJ_EXT) + return NULL; + + slab = virt_to_slab(p); + if (!slab_obj_exts(slab) && + WARN(alloc_slab_obj_exts(slab, s, flags, false), + "%s, %s: Failed to create slab extension vector!\n", + __func__, s->name)) + return NULL; + + return slab_obj_exts(slab) + obj_to_index(s, slab, p); +} + +#else /* CONFIG_SLAB_OBJ_EXT */ + +static inline struct slabobj_ext *slab_obj_exts(struct slab *slab) { - kfree(slab_objcgs(slab)); - slab->memcg_data = 0; + return NULL; +} + +static inline int alloc_slab_obj_exts(struct slab *slab, + struct kmem_cache *s, gfp_t gfp, + bool new_slab) +{ + return 0; +} + +static inline void free_slab_obj_exts(struct slab *slab) +{ +} + +static inline struct slabobj_ext * +prepare_slab_obj_exts_hook(struct kmem_cache *s, gfp_t flags, void *p) +{ + return NULL; +} + +#endif /* CONFIG_SLAB_OBJ_EXT */ + +#ifdef CONFIG_MEM_ALLOC_PROFILING + +static inline void alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) +{ + struct slabobj_ext *obj_exts; + int i; + + obj_exts = slab_obj_exts(slab); + if (!obj_exts) + return; + + for (i = 0; i < objects; i++) { + unsigned int off = obj_to_index(s, slab, p[i]); + + alloc_tag_sub(&obj_exts[off].ref, s->size); + } } +#else + +static inline void alloc_tagging_slab_free_hook(struct kmem_cache *s, struct slab *slab, + void **p, int objects) {} + +#endif /* CONFIG_MEM_ALLOC_PROFILING */ + +#ifdef CONFIG_MEMCG_KMEM +void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr); + static inline size_t obj_full_size(struct kmem_cache *s) { /* @@ -487,16 +649,15 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, if (likely(p[i])) { slab = virt_to_slab(p[i]); - if (!slab_objcgs(slab) && - memcg_alloc_slab_cgroups(slab, s, flags, - false)) { + if (!slab_obj_exts(slab) && + alloc_slab_obj_exts(slab, s, flags, false)) { obj_cgroup_uncharge(objcg, obj_full_size(s)); continue; } off = obj_to_index(s, slab, p[i]); obj_cgroup_get(objcg); - slab_objcgs(slab)[off] = objcg; + slab_obj_exts(slab)[off].objcg = objcg; mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), obj_full_size(s)); } else { @@ -509,14 +670,14 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s, static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects) { - struct obj_cgroup **objcgs; + struct slabobj_ext *obj_exts; int i; if (!memcg_kmem_online()) return; - objcgs = slab_objcgs(slab); - if (!objcgs) + obj_exts = slab_obj_exts(slab); + if (!obj_exts) return; for (i = 0; i < objects; i++) { @@ -524,11 +685,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, unsigned int off; off = obj_to_index(s, slab, p[i]); - objcg = objcgs[off]; + objcg = obj_exts[off].objcg; if (!objcg) continue; - objcgs[off] = NULL; + obj_exts[off].objcg = NULL; obj_cgroup_uncharge(objcg, obj_full_size(s)); mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s), -obj_full_size(s)); @@ -537,27 +698,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, 
} #else /* CONFIG_MEMCG_KMEM */ -static inline struct obj_cgroup **slab_objcgs(struct slab *slab) -{ - return NULL; -} - static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr) { return NULL; } -static inline int memcg_alloc_slab_cgroups(struct slab *slab, - struct kmem_cache *s, gfp_t gfp, - bool new_slab) -{ - return 0; -} - -static inline void memcg_free_slab_cgroups(struct slab *slab) -{ -} - static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s, struct list_lru *lru, struct obj_cgroup **objcgp, @@ -594,7 +739,7 @@ static __always_inline void account_slab(struct slab *slab, int order, struct kmem_cache *s, gfp_t gfp) { if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT)) - memcg_alloc_slab_cgroups(slab, s, gfp, true); + alloc_slab_obj_exts(slab, s, gfp, true); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), PAGE_SIZE << order); @@ -603,8 +748,7 @@ static __always_inline void account_slab(struct slab *slab, int order, static __always_inline void unaccount_slab(struct slab *slab, int order, struct kmem_cache *s) { - if (memcg_kmem_online()) - memcg_free_slab_cgroups(slab); + free_slab_obj_exts(slab); mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s), -(PAGE_SIZE << order)); @@ -684,6 +828,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, unsigned int orig_size) { unsigned int zero_size = s->object_size; + struct slabobj_ext *obj_exts; size_t i; flags &= gfp_allowed_mask; @@ -714,6 +859,13 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); kmsan_slab_alloc(s, p[i], flags); + obj_exts = prepare_slab_obj_exts_hook(s, flags, p[i]); + +#ifdef CONFIG_MEM_ALLOC_PROFILING + /* obj_exts can be allocated for other reasons */ + if (likely(obj_exts) && mem_alloc_profiling_enabled()) + alloc_tag_add(&obj_exts->ref, current->alloc_tag, s->size); +#endif } memcg_slab_post_alloc_hook(s, objcg, flags, size, p); @@ -766,10 +918,12 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) if ((__n = get_node(__s, __node))) +struct seq_buf; + #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) -void dump_unreclaimable_slab(void); +void dump_unreclaimable_slab(struct seq_buf *); #else -static inline void dump_unreclaimable_slab(void) +static inline void dump_unreclaimable_slab(struct seq_buf *out) { } #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index 607249785..5b204e16f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include "internal.h" @@ -204,6 +205,64 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align, return NULL; } +#ifdef CONFIG_SLAB_OBJ_EXT +/* + * The allocated objcg pointers array is not accounted directly. + * Moreover, it should not come from DMA buffer and is not readily + * reclaimable. So those GFP bits should be masked off. 
+ */ +#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) + +int alloc_slab_obj_exts(struct slab *slab, struct kmem_cache *s, + gfp_t gfp, bool new_slab) +{ + unsigned int objects = objs_per_slab(s, slab); + unsigned long new_exts; + unsigned long old_exts; + struct slabobj_ext *vec; + + gfp &= ~OBJCGS_CLEAR_MASK; + /* Prevent recursive extension vector allocation */ + gfp |= __GFP_NO_OBJ_EXT; + vec = kcalloc_node(objects, sizeof(struct slabobj_ext), gfp, + slab_nid(slab)); + if (!vec) { + /* Mark vectors which failed to allocate */ + if (new_slab) + mark_failed_objexts_alloc(slab); + + return -ENOMEM; + } + + new_exts = (unsigned long)vec; +#ifdef CONFIG_MEMCG + new_exts |= MEMCG_DATA_OBJEXTS; +#endif + old_exts = slab->obj_exts; + handle_failed_objexts_alloc(old_exts, vec, objects); + if (new_slab) { + /* + * If the slab is brand new and nobody can yet access its + * obj_exts, no synchronization is required and obj_exts can + * be simply assigned. + */ + slab->obj_exts = new_exts; + } else if (cmpxchg(&slab->obj_exts, old_exts, new_exts) != old_exts) { + /* + * If the slab is already in use, somebody can allocate and + * assign slabobj_exts in parallel. In this case the existing + * objcg vector should be reused. + */ + mark_objexts_empty(vec); + kfree(vec); + return 0; + } + + kmemleak_not_leak(vec); + return 0; +} +#endif /* CONFIG_SLAB_OBJ_EXT */ + static struct kmem_cache *create_cache(const char *name, unsigned int object_size, unsigned int align, slab_flags_t flags, unsigned int useroffset, @@ -968,24 +1027,24 @@ void *__do_kmalloc_node(size_t size, gfp_t flags, int node, unsigned long caller return ret; } -void *__kmalloc_node(size_t size, gfp_t flags, int node) +void *__kmalloc_node_noprof(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, _RET_IP_); } -EXPORT_SYMBOL(__kmalloc_node); +EXPORT_SYMBOL(__kmalloc_node_noprof); -void *__kmalloc(size_t size, gfp_t flags) +void *__kmalloc_noprof(size_t size, gfp_t flags) { return __do_kmalloc_node(size, flags, NUMA_NO_NODE, _RET_IP_); } -EXPORT_SYMBOL(__kmalloc); +EXPORT_SYMBOL(__kmalloc_noprof); -void *__kmalloc_node_track_caller(size_t size, gfp_t flags, - int node, unsigned long caller) +void *kmalloc_node_track_caller_noprof(size_t size, gfp_t flags, + int node, unsigned long caller) { return __do_kmalloc_node(size, flags, node, caller); } -EXPORT_SYMBOL(__kmalloc_node_track_caller); +EXPORT_SYMBOL(kmalloc_node_track_caller_noprof); /** * kfree - free previously allocated memory @@ -1052,7 +1111,7 @@ size_t __ksize(const void *object) return slab_ksize(folio_slab(folio)->slab_cache); } -void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +void *kmalloc_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = __kmem_cache_alloc_node(s, gfpflags, NUMA_NO_NODE, size, _RET_IP_); @@ -1062,9 +1121,9 @@ void *kmalloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_trace); +EXPORT_SYMBOL(kmalloc_trace_noprof); -void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, +void *kmalloc_node_trace_noprof(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { void *ret = __kmem_cache_alloc_node(s, gfpflags, node, size, _RET_IP_); @@ -1074,7 +1133,7 @@ void *kmalloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } -EXPORT_SYMBOL(kmalloc_node_trace); +EXPORT_SYMBOL(kmalloc_node_trace_noprof); 
gfp_t kmalloc_fix_flags(gfp_t flags) { @@ -1104,7 +1163,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) flags = kmalloc_fix_flags(flags); flags |= __GFP_COMP; - page = alloc_pages_node(node, flags, order); + page = alloc_pages_node_noprof(node, flags, order); if (page) { ptr = page_address(page); mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE_B, @@ -1119,7 +1178,7 @@ static void *__kmalloc_large_node(size_t size, gfp_t flags, int node) return ptr; } -void *kmalloc_large(size_t size, gfp_t flags) +void *kmalloc_large_noprof(size_t size, gfp_t flags) { void *ret = __kmalloc_large_node(size, flags, NUMA_NO_NODE); @@ -1127,9 +1186,9 @@ void *kmalloc_large(size_t size, gfp_t flags) flags, NUMA_NO_NODE); return ret; } -EXPORT_SYMBOL(kmalloc_large); +EXPORT_SYMBOL(kmalloc_large_noprof); -void *kmalloc_large_node(size_t size, gfp_t flags, int node) +void *kmalloc_large_node_noprof(size_t size, gfp_t flags, int node) { void *ret = __kmalloc_large_node(size, flags, node); @@ -1137,7 +1196,7 @@ void *kmalloc_large_node(size_t size, gfp_t flags, int node) flags, node); return ret; } -EXPORT_SYMBOL(kmalloc_large_node); +EXPORT_SYMBOL(kmalloc_large_node_noprof); #ifdef CONFIG_SLAB_FREELIST_RANDOM /* Randomize a generic freelist */ @@ -1259,10 +1318,15 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -void dump_unreclaimable_slab(void) +void dump_unreclaimable_slab(struct seq_buf *out) { struct kmem_cache *s; struct slabinfo sinfo; + struct slab_by_mem { + struct kmem_cache *s; + size_t total, active; + } slabs_by_mem[10], n; + int i, nr = 0; /* * Here acquiring slab_mutex is risky since we don't prefer to get @@ -1272,24 +1336,52 @@ void dump_unreclaimable_slab(void) * without acquiring the mutex. */ if (!mutex_trylock(&slab_mutex)) { - pr_warn("excessive unreclaimable slab but cannot dump stats\n"); + seq_buf_puts(out, "excessive unreclaimable slab but cannot dump stats\n"); return; } - pr_info("Unreclaimable slab info:\n"); - pr_info("Name Used Total\n"); - list_for_each_entry(s, &slab_caches, list) { if (s->flags & SLAB_RECLAIM_ACCOUNT) continue; get_slabinfo(s, &sinfo); - if (sinfo.num_objs > 0) - pr_info("%-17s %10luKB %10luKB\n", s->name, - (sinfo.active_objs * s->size) / 1024, - (sinfo.num_objs * s->size) / 1024); + if (!sinfo.num_objs) + continue; + + n.s = s; + n.total = sinfo.num_objs * s->size; + n.active = sinfo.active_objs * s->size; + + for (i = 0; i < nr; i++) + if (n.total < slabs_by_mem[i].total) + break; + + if (nr < ARRAY_SIZE(slabs_by_mem)) { + memmove(&slabs_by_mem[i + 1], + &slabs_by_mem[i], + sizeof(slabs_by_mem[0]) * (nr - i)); + nr++; + } else if (i) { + i--; + memmove(&slabs_by_mem[0], + &slabs_by_mem[1], + sizeof(slabs_by_mem[0]) * i); + } else { + continue; + } + + slabs_by_mem[i] = n; } + + for (i = nr - 1; i >= 0; --i) { + seq_buf_printf(out, "%-17s total: ", slabs_by_mem[i].s->name); + seq_buf_human_readable_u64(out, slabs_by_mem[i].total); + seq_buf_printf(out, " active: "); + seq_buf_human_readable_u64(out, slabs_by_mem[i].active); + seq_buf_putc(out, '\n'); + } + mutex_unlock(&slab_mutex); } @@ -1356,7 +1448,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) return (void *)p; } - ret = kmalloc_track_caller(new_size, flags); + ret = kmalloc_node_track_caller_noprof(new_size, flags, NUMA_NO_NODE, _RET_IP_); if (ret && p) { /* Disable KASAN checks as the object's redzone is accessed. 
*/ kasan_disable_current(); @@ -1380,7 +1472,7 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags) * * Return: pointer to the allocated memory or %NULL in case of error */ -void *krealloc(const void *p, size_t new_size, gfp_t flags) +void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags) { void *ret; @@ -1395,7 +1487,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags) return ret; } -EXPORT_SYMBOL(krealloc); +EXPORT_SYMBOL(krealloc_noprof); /** * kfree_sensitive - Clear sensitive information in memory before freeing diff --git a/mm/slub.c b/mm/slub.c index c87628cd8..768b0e292 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1781,7 +1781,7 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, return kasan_slab_free(s, x, init); } -static inline bool slab_free_freelist_hook(struct kmem_cache *s, +static __always_inline bool slab_free_freelist_hook(struct kmem_cache *s, void **head, void **tail, int *cnt) { @@ -3470,18 +3470,18 @@ void *__kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, return ret; } -void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) +void *kmem_cache_alloc_noprof(struct kmem_cache *s, gfp_t gfpflags) { return __kmem_cache_alloc_lru(s, NULL, gfpflags); } -EXPORT_SYMBOL(kmem_cache_alloc); +EXPORT_SYMBOL(kmem_cache_alloc_noprof); -void *kmem_cache_alloc_lru(struct kmem_cache *s, struct list_lru *lru, +void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru, gfp_t gfpflags) { return __kmem_cache_alloc_lru(s, lru, gfpflags); } -EXPORT_SYMBOL(kmem_cache_alloc_lru); +EXPORT_SYMBOL(kmem_cache_alloc_lru_noprof); void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, size_t orig_size, @@ -3491,7 +3491,7 @@ void *__kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, caller, orig_size); } -void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) +void *kmem_cache_alloc_node_noprof(struct kmem_cache *s, gfp_t gfpflags, int node) { void *ret = slab_alloc_node(s, NULL, gfpflags, node, _RET_IP_, s->object_size); @@ -3499,7 +3499,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node); +EXPORT_SYMBOL(kmem_cache_alloc_node_noprof); static noinline void free_to_partial_list( struct kmem_cache *s, struct slab *slab, @@ -3779,6 +3779,7 @@ static __fastpath_inline void slab_free(struct kmem_cache *s, struct slab *slab, unsigned long addr) { memcg_slab_free_hook(s, slab, p, cnt); + alloc_tagging_slab_free_hook(s, slab, p, cnt); /* * With KASAN enabled slab_free_freelist_hook modifies the freelist * to remove objects, whose reuse must be delayed. @@ -4009,8 +4010,8 @@ static int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, #endif /* CONFIG_SLUB_TINY */ /* Note that interrupts must be enabled when calling this function. 
*/ -int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, - void **p) +int kmem_cache_alloc_bulk_noprof(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) { int i; struct obj_cgroup *objcg = NULL; @@ -4034,7 +4035,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, slab_want_init_on_alloc(flags, s), s->object_size); return i; } -EXPORT_SYMBOL(kmem_cache_alloc_bulk); +EXPORT_SYMBOL(kmem_cache_alloc_bulk_noprof); /* @@ -5020,7 +5021,8 @@ void __init kmem_cache_init(void) node_set(node, slab_nodes); create_boot_cache(kmem_cache_node, "kmem_cache_node", - sizeof(struct kmem_cache_node), SLAB_HWCACHE_ALIGN, 0, 0); + sizeof(struct kmem_cache_node), + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); @@ -5030,7 +5032,7 @@ void __init kmem_cache_init(void) create_boot_cache(kmem_cache, "kmem_cache", offsetof(struct kmem_cache, node) + nr_node_ids * sizeof(struct kmem_cache_node *), - SLAB_HWCACHE_ALIGN, 0, 0); + SLAB_HWCACHE_ALIGN | SLAB_NO_OBJ_EXT, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); kmem_cache_node = bootstrap(&boot_kmem_cache_node); diff --git a/mm/util.c b/mm/util.c index dd12b9531..9d24b8870 100644 --- a/mm/util.c +++ b/mm/util.c @@ -115,7 +115,7 @@ char *kstrndup(const char *s, size_t max, gfp_t gfp) EXPORT_SYMBOL(kstrndup); /** - * kmemdup - duplicate region of memory + * kmemdup_noprof - duplicate region of memory * * @src: memory region to duplicate * @len: memory region length @@ -124,16 +124,16 @@ EXPORT_SYMBOL(kstrndup); * Return: newly allocated copy of @src or %NULL in case of error, * result is physically contiguous. Use kfree() to free. */ -void *kmemdup(const void *src, size_t len, gfp_t gfp) +void *kmemdup_noprof(const void *src, size_t len, gfp_t gfp) { void *p; - p = kmalloc_track_caller(len, gfp); + p = kmalloc_node_track_caller_noprof(len, gfp, NUMA_NO_NODE, _RET_IP_); if (p) memcpy(p, src, len); return p; } -EXPORT_SYMBOL(kmemdup); +EXPORT_SYMBOL(kmemdup_noprof); /** * kvmemdup - duplicate region of memory @@ -564,7 +564,7 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, EXPORT_SYMBOL(vm_mmap); /** - * kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * kvmalloc_node_noprof - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. * @size: size of the request. * @flags: gfp mask for the allocation - must be compatible (superset) with GFP_KERNEL. @@ -579,7 +579,7 @@ EXPORT_SYMBOL(vm_mmap); * * Return: pointer to the allocated memory of %NULL in case of failure */ -void *kvmalloc_node(size_t size, gfp_t flags, int node) +void *kvmalloc_node_noprof(size_t size, gfp_t flags, int node) { gfp_t kmalloc_flags = flags; void *ret; @@ -601,7 +601,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) kmalloc_flags &= ~__GFP_NOFAIL; } - ret = kmalloc_node(size, kmalloc_flags, node); + ret = kmalloc_node_noprof(size, kmalloc_flags, node); /* * It doesn't really make sense to fallback to vmalloc for sub page @@ -626,11 +626,11 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) * about the resulting pointer, and cannot play * protection games. 
*/ - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, flags, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(kvmalloc_node); +EXPORT_SYMBOL(kvmalloc_node_noprof); /** * kvfree() - Free memory. @@ -669,7 +669,7 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); -void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) { void *newp; @@ -682,15 +682,15 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags) kvfree(p); return newp; } -EXPORT_SYMBOL(kvrealloc); +EXPORT_SYMBOL(kvrealloc_noprof); /** - * __vmalloc_array - allocate memory for a virtually contiguous array. + * __vmalloc_array_noprof - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -void *__vmalloc_array(size_t n, size_t size, gfp_t flags) +void *__vmalloc_array_noprof(size_t n, size_t size, gfp_t flags) { size_t bytes; @@ -698,18 +698,18 @@ void *__vmalloc_array(size_t n, size_t size, gfp_t flags) return NULL; return __vmalloc(bytes, flags); } -EXPORT_SYMBOL(__vmalloc_array); +EXPORT_SYMBOL(__vmalloc_array_noprof); /** - * vmalloc_array - allocate memory for a virtually contiguous array. + * vmalloc_array_noprof - allocate memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ -void *vmalloc_array(size_t n, size_t size) +void *vmalloc_array_noprof(size_t n, size_t size) { return __vmalloc_array(n, size, GFP_KERNEL); } -EXPORT_SYMBOL(vmalloc_array); +EXPORT_SYMBOL(vmalloc_array_noprof); /** * __vcalloc - allocate and zero memory for a virtually contiguous array. @@ -717,22 +717,22 @@ EXPORT_SYMBOL(vmalloc_array); * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ -void *__vcalloc(size_t n, size_t size, gfp_t flags) +void *__vcalloc_noprof(size_t n, size_t size, gfp_t flags) { return __vmalloc_array(n, size, flags | __GFP_ZERO); } -EXPORT_SYMBOL(__vcalloc); +EXPORT_SYMBOL(__vcalloc_noprof); /** - * vcalloc - allocate and zero memory for a virtually contiguous array. + * vcalloc_noprof - allocate and zero memory for a virtually contiguous array. * @n: number of elements. * @size: element size. */ -void *vcalloc(size_t n, size_t size) +void *vcalloc_noprof(size_t n, size_t size) { return __vmalloc_array(n, size, GFP_KERNEL | __GFP_ZERO); } -EXPORT_SYMBOL(vcalloc); +EXPORT_SYMBOL(vcalloc_noprof); /* Neutral page->mapping pointer to address_space or anon_vma or other */ void *page_rmapping(struct page *page) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1d13d7168..4c199cf9b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2971,12 +2971,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolicy wants to alloc memory by interleaving. 
*/ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, + nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node(bulk_gfp, nid, + nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); @@ -3006,9 +3006,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid, break; if (nid == NUMA_NO_NODE) - page = alloc_pages(alloc_gfp, order); + page = alloc_pages_noprof(alloc_gfp, order); else - page = alloc_pages_node(nid, alloc_gfp, order); + page = alloc_pages_node_noprof(nid, alloc_gfp, order); if (unlikely(!page)) { if (!nofail) break; @@ -3065,10 +3065,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - area->pages = __vmalloc_node(array_size, 1, nested_gfp, node, + area->pages = __vmalloc_node_noprof(array_size, 1, nested_gfp, node, area->caller); } else { - area->pages = kmalloc_node(array_size, nested_gfp, node); + area->pages = kmalloc_node_noprof(array_size, nested_gfp, node); } if (!area->pages) { @@ -3151,7 +3151,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } /** - * __vmalloc_node_range - allocate virtually contiguous memory + * __vmalloc_node_range_noprof - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @start: vm area range start @@ -3178,7 +3178,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * * Return: the address of the area or %NULL on failure */ -void *__vmalloc_node_range(unsigned long size, unsigned long align, +void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller) @@ -3307,7 +3307,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, } /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc_node_noprof - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment * @gfp_mask: flags for the page level allocator @@ -3325,10 +3325,10 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, * * Return: pointer to the allocated memory or %NULL on error */ -void *__vmalloc_node(unsigned long size, unsigned long align, +void *__vmalloc_node_noprof(unsigned long size, unsigned long align, gfp_t gfp_mask, int node, const void *caller) { - return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, align, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, 0, node, caller); } /* @@ -3337,15 +3337,15 @@ void *__vmalloc_node(unsigned long size, unsigned long align, * than that. 
*/ #ifdef CONFIG_TEST_VMALLOC_MODULE -EXPORT_SYMBOL_GPL(__vmalloc_node); +EXPORT_SYMBOL_GPL(__vmalloc_node_noprof); #endif -void *__vmalloc(unsigned long size, gfp_t gfp_mask) +void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, gfp_mask, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(__vmalloc); +EXPORT_SYMBOL(__vmalloc_noprof); /** * vmalloc - allocate virtually contiguous memory @@ -3359,12 +3359,12 @@ EXPORT_SYMBOL(__vmalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc(unsigned long size) +void *vmalloc_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc); +EXPORT_SYMBOL(vmalloc_noprof); /** * vmalloc_huge - allocate virtually contiguous memory, allow huge pages @@ -3378,16 +3378,16 @@ EXPORT_SYMBOL(vmalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_huge(unsigned long size, gfp_t gfp_mask) +void *vmalloc_huge_noprof(unsigned long size, gfp_t gfp_mask) { - return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, 1, VMALLOC_START, VMALLOC_END, gfp_mask, PAGE_KERNEL, VM_ALLOW_HUGE_VMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL_GPL(vmalloc_huge); +EXPORT_SYMBOL_GPL(vmalloc_huge_noprof); /** - * vzalloc - allocate virtually contiguous memory with zero fill + * vzalloc_noprof - allocate virtually contiguous memory with zero fill * @size: allocation size * * Allocate enough pages to cover @size from the page level @@ -3399,12 +3399,12 @@ EXPORT_SYMBOL_GPL(vmalloc_huge); * * Return: pointer to the allocated memory or %NULL on error */ -void *vzalloc(unsigned long size) +void *vzalloc_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vzalloc); +EXPORT_SYMBOL(vzalloc_noprof); /** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace @@ -3415,17 +3415,17 @@ EXPORT_SYMBOL(vzalloc); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_user(unsigned long size) +void *vmalloc_user_noprof(unsigned long size) { - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_user); +EXPORT_SYMBOL(vmalloc_user_noprof); /** - * vmalloc_node - allocate memory on a specific node + * vmalloc_node_noprof - allocate memory on a specific node * @size: allocation size * @node: numa node * @@ -3437,15 +3437,15 @@ EXPORT_SYMBOL(vmalloc_user); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_node(unsigned long size, int node) +void *vmalloc_node_noprof(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL, node, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_node); +EXPORT_SYMBOL(vmalloc_node_noprof); /** - * vzalloc_node - allocate memory on a specific node with zero fill + * vzalloc_node_noprof - allocate memory on a specific node with zero fill * @size: 
allocation size * @node: numa node * @@ -3455,12 +3455,12 @@ EXPORT_SYMBOL(vmalloc_node); * * Return: pointer to the allocated memory or %NULL on error */ -void *vzalloc_node(unsigned long size, int node) +void *vzalloc_node_noprof(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node, + return __vmalloc_node_noprof(size, 1, GFP_KERNEL | __GFP_ZERO, node, __builtin_return_address(0)); } -EXPORT_SYMBOL(vzalloc_node); +EXPORT_SYMBOL(vzalloc_node_noprof); #if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) #define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL) @@ -3475,7 +3475,7 @@ EXPORT_SYMBOL(vzalloc_node); #endif /** - * vmalloc_32 - allocate virtually contiguous memory (32bit addressable) + * vmalloc_32_noprof - allocate virtually contiguous memory (32bit addressable) * @size: allocation size * * Allocate enough 32bit PA addressable pages to cover @size from the @@ -3483,15 +3483,15 @@ EXPORT_SYMBOL(vzalloc_node); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_32(unsigned long size) +void *vmalloc_32_noprof(unsigned long size) { - return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, + return __vmalloc_node_noprof(size, 1, GFP_VMALLOC32, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_32); +EXPORT_SYMBOL(vmalloc_32_noprof); /** - * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory + * vmalloc_32_user_noprof - allocate zeroed virtually contiguous 32bit memory * @size: allocation size * * The resulting memory area is 32bit addressable and zeroed so it can be @@ -3499,14 +3499,14 @@ EXPORT_SYMBOL(vmalloc_32); * * Return: pointer to the allocated memory or %NULL on error */ -void *vmalloc_32_user(unsigned long size) +void *vmalloc_32_user_noprof(unsigned long size) { - return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END, + return __vmalloc_node_range_noprof(size, SHMLBA, VMALLOC_START, VMALLOC_END, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, VM_USERMAP, NUMA_NO_NODE, __builtin_return_address(0)); } -EXPORT_SYMBOL(vmalloc_32_user); +EXPORT_SYMBOL(vmalloc_32_user_noprof); /* * Atomically zero bytes in the iterator. diff --git a/mm/vmscan.c b/mm/vmscan.c index d6802821d..a22f36ec7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -698,7 +699,6 @@ static int __prealloc_shrinker(struct shrinker *shrinker) return 0; } -#ifdef CONFIG_SHRINKER_DEBUG int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) { va_list ap; @@ -718,19 +718,12 @@ int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) return err; } -#else -int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...) -{ - return __prealloc_shrinker(shrinker); -} -#endif void free_prealloced_shrinker(struct shrinker *shrinker) { -#ifdef CONFIG_SHRINKER_DEBUG kfree_const(shrinker->name); shrinker->name = NULL; -#endif + if (shrinker->flags & SHRINKER_MEMCG_AWARE) { down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); @@ -761,7 +754,6 @@ static int __register_shrinker(struct shrinker *shrinker) return 0; } -#ifdef CONFIG_SHRINKER_DEBUG int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) { va_list ap; @@ -780,12 +772,6 @@ int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) } return err; } -#else -int register_shrinker(struct shrinker *shrinker, const char *fmt, ...) 
-{ - return __register_shrinker(shrinker); -} -#endif EXPORT_SYMBOL(register_shrinker); /* @@ -811,6 +797,9 @@ void unregister_shrinker(struct shrinker *shrinker) kfree(shrinker->nr_deferred); shrinker->nr_deferred = NULL; + + kfree_const(shrinker->name); + shrinker->name = NULL; } EXPORT_SYMBOL(unregister_shrinker); @@ -829,6 +818,80 @@ void synchronize_shrinkers(void) } EXPORT_SYMBOL(synchronize_shrinkers); +void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker) +{ + struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; + + seq_buf_puts(out, shrinker->name); + seq_buf_putc(out, '\n'); + + seq_buf_printf(out, "objects: %lu\n", shrinker->count_objects(shrinker, &sc)); + seq_buf_printf(out, "requested to free: %lu\n", atomic_long_read(&shrinker->objects_requested_to_free)); + seq_buf_printf(out, "objects freed: %lu\n", atomic_long_read(&shrinker->objects_freed)); + + if (shrinker->to_text) { + shrinker->to_text(out, shrinker); + seq_buf_puts(out, "\n"); + } +} + +/** + * shrinkers_to_text - Report on shrinkers with highest usage + * + * This reports on the top 10 shrinkers, by object counts, in sorted order: + * intended to be used for OOM reporting. + */ +void shrinkers_to_text(struct seq_buf *out) +{ + struct shrinker *shrinker; + struct shrinker_by_mem { + struct shrinker *shrinker; + unsigned long mem; + } shrinkers_by_mem[10]; + int i, nr = 0; + + if (!down_read_trylock(&shrinker_rwsem)) { + seq_buf_puts(out, "(couldn't take shrinker lock)"); + return; + } + + list_for_each_entry(shrinker, &shrinker_list, list) { + struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; + unsigned long mem = shrinker->count_objects(shrinker, &sc); + + if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY) + continue; + + for (i = 0; i < nr; i++) + if (mem < shrinkers_by_mem[i].mem) + break; + + if (nr < ARRAY_SIZE(shrinkers_by_mem)) { + memmove(&shrinkers_by_mem[i + 1], + &shrinkers_by_mem[i], + sizeof(shrinkers_by_mem[0]) * (nr - i)); + nr++; + } else if (i) { + i--; + memmove(&shrinkers_by_mem[0], + &shrinkers_by_mem[1], + sizeof(shrinkers_by_mem[0]) * i); + } else { + continue; + } + + shrinkers_by_mem[i] = (struct shrinker_by_mem) { + .shrinker = shrinker, + .mem = mem, + }; + } + + for (i = nr - 1; i >= 0; --i) + shrinker_to_text(out, shrinkers_by_mem[i].shrinker); + + up_read(&shrinker_rwsem); +} + #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, @@ -895,12 +958,16 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); + atomic_long_add(nr_to_scan, &shrinker->objects_requested_to_free); + shrinkctl->nr_to_scan = nr_to_scan; shrinkctl->nr_scanned = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; + freed += ret; + atomic_long_add(ret, &shrinker->objects_freed); count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned); total_scan -= shrinkctl->nr_scanned; diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index 7778cc97a..5341736f2 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -277,3 +277,13 @@ ifneq ($(and $(filter notintermediate, $(.FEATURES)),$(filter-out 4.4,$(MAKE_VER else .SECONDARY: endif + + # expand_parents(a/b/c) = a/b/c a/b a +expand_parents2 = $(if $(subst .,,$(1)),$(call expand_parents,$(1)),) +expand_parents = $(1) $(call expand_parents2,$(patsubst %/,%,$(dir $(1)))) + +# flatten_dirs(a/b/c) = a_b_c a_b a +flatten_dirs = $(subst /,_,$(call 
expand_parents,$(1))) + +# eval_vars(X_,a/b/c) = $(X_a_b_c) $(X_a_b) $(X_a) +eval_vars = $(foreach var,$(call flatten_dirs,$(2)),$($(1)$(var))) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 100a386fc..1f106c71e 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -148,7 +148,7 @@ _cpp_flags = $(KBUILD_CPPFLAGS) $(cppflags-y) $(CPPFLAGS_$(target-stem).lds) # ifeq ($(CONFIG_GCOV_KERNEL),y) _c_flags += $(if $(patsubst n%,, \ - $(GCOV_PROFILE_$(basetarget).o)$(GCOV_PROFILE)$(CONFIG_GCOV_PROFILE_ALL)), \ + $(GCOV_PROFILE_$(basetarget).o)$(call eval_vars,GCOV_PROFILE_,$(src))$(GCOV_PROFILE)$(CONFIG_GCOV_PROFILE_ALL)), \ $(CFLAGS_GCOV)) endif diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index 0d2db4117..7b7dbeb5b 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -203,6 +203,11 @@ static int symbol_in_range(const struct sym_entry *s, return 0; } +static bool string_starts_with(const char *s, const char *prefix) +{ + return strncmp(s, prefix, strlen(prefix)) == 0; +} + static int symbol_valid(const struct sym_entry *s) { const char *name = sym_name(s); @@ -210,6 +215,14 @@ static int symbol_valid(const struct sym_entry *s) /* if --all-symbols is not specified, then symbols outside the text * and inittext sections are discarded */ if (!all_symbols) { + /* + * Symbols starting with __start and __stop are used to denote + * section boundaries, and should always be included: + */ + if (string_starts_with(name, "__start_") || + string_starts_with(name, "__stop_")) + return 1; + if (symbol_in_range(s, text_ranges, ARRAY_SIZE(text_ranges)) == 0) return 0; diff --git a/scripts/module.lds.S b/scripts/module.lds.S index bf5bcf283..45c67a099 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -9,6 +9,8 @@ #define DISCARD_EH_FRAME *(.eh_frame) #endif +#include + SECTIONS { /DISCARD/ : { *(.discard) @@ -47,12 +49,17 @@ SECTIONS { .data : { *(.data .data.[0-9a-zA-Z_]*) *(.data..L*) + CODETAG_SECTIONS() } .rodata : { *(.rodata .rodata.[0-9a-zA-Z_]*) *(.rodata..L*) } +#else + .data : { + CODETAG_SECTIONS() + } #endif } -- 2.41.0.159.g0bfa463d37
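
Aside on the last two hunks above: kallsyms now always keeps __start_*/__stop_* symbols, and module.lds.S pulls CODETAG_SECTIONS() into module .data, because statically declared tags are found at runtime by walking a named section between its linker-provided start/stop symbols. The following is a minimal user-space sketch of that mechanism in GNU C; the section name "demo_tags", struct demo_tag, and DEFINE_DEMO_TAG() are made up for illustration and do not reflect the kernel's actual codetag section layout or helpers.

	#include <stdio.h>

	struct demo_tag {
		const char *file;
		int line;
	} __attribute__((aligned(8)));

	/*
	 * Drop one tag into a named section. "used" keeps the otherwise
	 * unreferenced object in the output, "unused" silences the warning,
	 * and the struct alignment keeps entries packed like an array.
	 */
	#define DEFINE_DEMO_TAG(name)						\
		static const struct demo_tag name				\
		__attribute__((used, unused, section("demo_tags"))) =		\
			{ __FILE__, __LINE__ }

	DEFINE_DEMO_TAG(tag_one);
	DEFINE_DEMO_TAG(tag_two);

	/*
	 * GNU ld (and lld) define __start_<sec>/__stop_<sec> for any section
	 * whose name is a valid C identifier; those are the symbols the
	 * kallsyms hunk makes sure never get filtered out.
	 */
	extern const struct demo_tag __start_demo_tags[];
	extern const struct demo_tag __stop_demo_tags[];

	int main(void)
	{
		const struct demo_tag *t;

		for (t = __start_demo_tags; t < __stop_demo_tags; t++)
			printf("tag declared at %s:%d\n", t->file, t->line);
		return 0;
	}
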