diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 52c984d..dda497e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -35,12 +35,3 @@ jobs: - name: Release Kernel run: ./release.sh - - - name: Purge cache - uses: strrife/cloudflare-chunked-purge-action@master - env: - # Zone is required by both authentication methods - CLOUDFLARE_ZONE: ${{ secrets.CLOUDFLARE_ZONE }} - - CLOUDFLARE_TOKEN: ${{ secrets.CLOUDFLARE_TOKEN }} - PURGE_URLS: ${{ vars.PURGE_URLS }} diff --git a/VERSION b/VERSION index 021c940..5a33ecb 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -6.8.3 +6.10 diff --git a/config b/config index b089cde..7780f29 100644 --- a/config +++ b/config @@ -1,10 +1,10 @@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 6.8.3 Kernel Configuration +# Linux/x86 6.10.0 Kernel Configuration # -CONFIG_CC_VERSION_TEXT="gcc (GCC) 13.2.1 20230801" +CONFIG_CC_VERSION_TEXT="gcc (GCC) 14.1.1 20240522" CONFIG_CC_IS_GCC=y -CONFIG_GCC_VERSION=130201 +CONFIG_GCC_VERSION=140101 CONFIG_CLANG_VERSION=0 CONFIG_AS_IS_GNU=y CONFIG_AS_VERSION=24200 @@ -16,11 +16,10 @@ CONFIG_CC_CAN_LINK=y CONFIG_CC_CAN_LINK_STATIC=y CONFIG_CC_HAS_ASM_GOTO_OUTPUT=y CONFIG_CC_HAS_ASM_GOTO_TIED_OUTPUT=y -CONFIG_GCC_ASM_GOTO_OUTPUT_WORKAROUND=y CONFIG_TOOLS_SUPPORT_RELR=y CONFIG_CC_HAS_ASM_INLINE=y CONFIG_CC_HAS_NO_PROFILE_FN_ATTR=y -CONFIG_PAHOLE_VERSION=126 +CONFIG_PAHOLE_VERSION=127 CONFIG_IRQ_WORK=y CONFIG_BUILDTIME_TABLE_SORT=y CONFIG_THREAD_INFO_IN_TASK=y @@ -82,6 +81,7 @@ CONFIG_GENERIC_MSI_IRQ=y CONFIG_IRQ_MSI_IOMMU=y CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_GENERIC_IRQ_STAT_SNAPSHOT=y CONFIG_IRQ_FORCED_THREADING=y CONFIG_SPARSE_IRQ=y # CONFIG_GENERIC_IRQ_DEBUGFS is not set @@ -93,6 +93,7 @@ CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y CONFIG_GENERIC_TIME_VSYSCALL=y CONFIG_GENERIC_CLOCKEVENTS=y CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST_IDLE=y CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y CONFIG_GENERIC_CMOS_UPDATE=y CONFIG_HAVE_POSIX_CPU_TIMERS_TASK_WORK=y @@ -169,6 +170,7 @@ CONFIG_RCU_EXPERT=y CONFIG_TREE_SRCU=y CONFIG_TASKS_RCU_GENERIC=y # CONFIG_FORCE_TASKS_RCU is not set +CONFIG_NEED_TASKS_RCU=y CONFIG_TASKS_RCU=y # CONFIG_FORCE_TASKS_RUDE_RCU is not set CONFIG_TASKS_RUDE_RCU=y @@ -182,10 +184,11 @@ CONFIG_RCU_BOOST=y CONFIG_RCU_BOOST_DELAY=500 # CONFIG_RCU_EXP_KTHREAD is not set CONFIG_RCU_NOCB_CPU=y -CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y +# CONFIG_RCU_NOCB_CPU_DEFAULT_ALL is not set # CONFIG_RCU_NOCB_CPU_CB_BOOST is not set # CONFIG_TASKS_TRACE_RCU_READ_MB is not set CONFIG_RCU_LAZY=y +CONFIG_RCU_LAZY_DEFAULT_OFF=y CONFIG_RCU_DOUBLE_CHECK_CB_TIME=y # end of RCU Subsystem @@ -215,6 +218,7 @@ CONFIG_CC_NO_STRINGOP_OVERFLOW=y CONFIG_ARCH_SUPPORTS_INT128=y CONFIG_NUMA_BALANCING=y CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y +CONFIG_SLAB_OBJ_EXT=y CONFIG_CGROUPS=y CONFIG_PAGE_COUNTER=y # CONFIG_CGROUP_FAVOR_DYNMODS is not set @@ -287,7 +291,6 @@ CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y -CONFIG_BASE_FULL=y CONFIG_FUTEX=y CONFIG_FUTEX_PI=y CONFIG_EPOLL=y @@ -325,7 +328,8 @@ CONFIG_TRACEPOINTS=y # # Kexec and crash features # -CONFIG_CRASH_CORE=y +CONFIG_CRASH_RESERVE=y +CONFIG_VMCORE_INFO=y CONFIG_KEXEC_CORE=y CONFIG_KEXEC=y CONFIG_KEXEC_FILE=y @@ -373,9 +377,10 @@ CONFIG_CC_HAS_SANE_STACKPROTECTOR=y # CONFIG_SMP=y CONFIG_X86_X2APIC=y +CONFIG_X86_POSTED_MSI=y CONFIG_X86_MPPARSE=y -# CONFIG_GOLDFISH is not set CONFIG_X86_CPU_RESCTRL=y +CONFIG_X86_FRED=y # 
CONFIG_X86_EXTENDED_PLATFORM is not set CONFIG_X86_INTEL_LPSS=y CONFIG_X86_AMD_PLATFORM_DEVICE=y @@ -424,6 +429,7 @@ CONFIG_INTEL_TDX_GUEST=y # CONFIG_MZEN2 is not set # CONFIG_MZEN3 is not set # CONFIG_MZEN4 is not set +# CONFIG_MZEN5 is not set # CONFIG_MPSC is not set # CONFIG_MCORE2 is not set # CONFIG_MATOM is not set @@ -540,9 +546,9 @@ CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=0 CONFIG_X86_PAT=y CONFIG_ARCH_USES_PG_UNCACHED=y CONFIG_X86_UMIP=y -CONFIG_CC_HAS_IBT=n +CONFIG_CC_HAS_IBT=y CONFIG_X86_CET=y -CONFIG_X86_KERNEL_IBT=n +CONFIG_X86_KERNEL_IBT=y CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y # CONFIG_X86_INTEL_TSX_MODE_OFF is not set # CONFIG_X86_INTEL_TSX_MODE_ON is not set @@ -564,6 +570,7 @@ CONFIG_HZ_300=y # CONFIG_HZ_625 is not set # CONFIG_HZ_1000 is not set CONFIG_HZ=300 +CONFIG_MIN_BASE_SLICE_NS=1000000 CONFIG_SCHED_HRTICK=y CONFIG_ARCH_SUPPORTS_KEXEC=y CONFIG_ARCH_SUPPORTS_KEXEC_FILE=y @@ -596,6 +603,9 @@ CONFIG_HAVE_LIVEPATCH=y # CONFIG_LIVEPATCH is not set # end of Processor type and features +CONFIG_CC_HAS_NAMED_AS=y +CONFIG_CC_HAS_NAMED_AS_FIXED_SANITIZERS=y +CONFIG_USE_X86_SEG_SUPPORT=y CONFIG_CC_HAS_SLS=y CONFIG_CC_HAS_RETURN_THUNK=y CONFIG_CC_HAS_ENTRY_PADDING=y @@ -605,19 +615,20 @@ CONFIG_CALL_PADDING=y CONFIG_HAVE_CALL_THUNKS=y CONFIG_CALL_THUNKS=y CONFIG_PREFIX_SYMBOLS=y -CONFIG_SPECULATION_MITIGATIONS=y -CONFIG_PAGE_TABLE_ISOLATION=y -CONFIG_RETPOLINE=y -CONFIG_RETHUNK=y -CONFIG_CPU_UNRET_ENTRY=y -CONFIG_CALL_DEPTH_TRACKING=y +CONFIG_CPU_MITIGATIONS=y +CONFIG_MITIGATION_PAGE_TABLE_ISOLATION=y +CONFIG_MITIGATION_RETPOLINE=y +CONFIG_MITIGATION_RETHUNK=y +CONFIG_MITIGATION_UNRET_ENTRY=y +CONFIG_MITIGATION_CALL_DEPTH_TRACKING=y # CONFIG_CALL_THUNKS_DEBUG is not set -CONFIG_CPU_IBPB_ENTRY=y -CONFIG_CPU_IBRS_ENTRY=y -CONFIG_CPU_SRSO=y -CONFIG_SLS=y -# CONFIG_GDS_FORCE_MITIGATION is not set +CONFIG_MITIGATION_IBPB_ENTRY=y +CONFIG_MITIGATION_IBRS_ENTRY=y +CONFIG_MITIGATION_SRSO=y +CONFIG_MITIGATION_SLS=y +# CONFIG_MITIGATION_GDS_FORCE is not set CONFIG_MITIGATION_RFDS=y +CONFIG_MITIGATION_SPECTRE_BHI=y CONFIG_ARCH_HAS_ADD_PAGES=y # @@ -629,6 +640,9 @@ CONFIG_SUSPEND_FREEZER=y CONFIG_HIBERNATE_CALLBACKS=y CONFIG_HIBERNATION=y CONFIG_HIBERNATION_SNAPSHOT_DEV=y +CONFIG_HIBERNATION_COMP_LZO=y +# CONFIG_HIBERNATION_COMP_LZ4 is not set +CONFIG_HIBERNATION_DEF_COMP="lzo" CONFIG_PM_STD_PARTITION="" CONFIG_PM_SLEEP=y CONFIG_PM_SLEEP_SMP=y @@ -687,8 +701,8 @@ CONFIG_ACPI_HOTPLUG_MEMORY=y CONFIG_ACPI_HOTPLUG_IOAPIC=y CONFIG_ACPI_SBS=m CONFIG_ACPI_HED=y -CONFIG_ACPI_CUSTOM_METHOD=m CONFIG_ACPI_BGRT=y +CONFIG_ACPI_NHLT=y CONFIG_ACPI_NFIT=m # CONFIG_NFIT_SECURITY_DEBUG is not set CONFIG_ACPI_NUMA=y @@ -700,6 +714,7 @@ CONFIG_ACPI_APEI_GHES=y CONFIG_ACPI_APEI_PCIEAER=y CONFIG_ACPI_APEI_MEMORY_FAILURE=y CONFIG_ACPI_APEI_EINJ=m +CONFIG_ACPI_APEI_EINJ_CXL=y CONFIG_ACPI_APEI_ERST_DEBUG=m CONFIG_ACPI_DPTF=y CONFIG_DPTF_POWER=m @@ -794,13 +809,12 @@ CONFIG_AMD_NB=y # CONFIG_IA32_EMULATION=y # CONFIG_IA32_EMULATION_DEFAULT_DISABLED is not set -# CONFIG_X86_X32_ABI is not set +CONFIG_X86_X32_ABI=y CONFIG_COMPAT_32=y CONFIG_COMPAT=y CONFIG_COMPAT_FOR_U64_ALIGNMENT=y # end of Binary Emulations -CONFIG_HAVE_KVM=y CONFIG_KVM_COMMON=y CONFIG_HAVE_KVM_PFNCACHE=y CONFIG_HAVE_KVM_IRQCHIP=y @@ -811,6 +825,7 @@ CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL=y CONFIG_KVM_MMIO=y CONFIG_KVM_ASYNC_PF=y CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_READONLY_MEM=y CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y CONFIG_KVM_VFIO=y CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y @@ -840,6 +855,7 @@ CONFIG_AS_GFNI=y CONFIG_AS_VAES=y 
CONFIG_AS_VPCLMULQDQ=y CONFIG_AS_WRUSS=y +CONFIG_ARCH_CONFIGURES_CPU_MITIGATIONS=y # # General architecture-dependent options @@ -956,8 +972,11 @@ CONFIG_ARCH_MMAP_RND_BITS=32 CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_PAGE_SIZE_4KB=y +CONFIG_PAGE_SIZE_4KB=y CONFIG_PAGE_SIZE_LESS_THAN_64KB=y CONFIG_PAGE_SIZE_LESS_THAN_256KB=y +CONFIG_PAGE_SHIFT=12 CONFIG_HAVE_OBJTOOL=y CONFIG_HAVE_JUMP_LABEL_HACK=y CONFIG_HAVE_NOINSTR_HACK=y @@ -996,6 +1015,7 @@ CONFIG_DYNAMIC_SIGFRAME=y CONFIG_HAVE_ARCH_NODE_DEV_GROUP=y CONFIG_ARCH_HAS_HW_PTE_YOUNG=y CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG=y +CONFIG_ARCH_HAS_KERNEL_FPU_SUPPORT=y # # GCOV-based kernel profiling @@ -1010,10 +1030,11 @@ CONFIG_GCC_PLUGINS=y CONFIG_FUNCTION_ALIGNMENT_4B=y CONFIG_FUNCTION_ALIGNMENT_16B=y CONFIG_FUNCTION_ALIGNMENT=16 +CONFIG_CC_HAS_MIN_FUNCTION_ALIGNMENT=y +CONFIG_CC_HAS_SANE_FUNCTION_ALIGNMENT=y # end of General architecture-dependent options CONFIG_RT_MUTEXES=y -CONFIG_BASE_SMALL=0 CONFIG_MODULE_SIG_FORMAT=y CONFIG_MODULES=y CONFIG_MODULE_DEBUGFS=y @@ -1042,6 +1063,7 @@ CONFIG_MODULE_COMPRESS_ZSTD=y CONFIG_MODULE_DECOMPRESS=y CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS=y CONFIG_MODPROBE_PATH="/sbin/modprobe" +# CONFIG_TRIM_UNUSED_KSYMS is not set CONFIG_MODULES_TREE_LOOKUP=y CONFIG_BLOCK=y CONFIG_BLOCK_LEGACY_AUTOLOAD=y @@ -1056,7 +1078,6 @@ CONFIG_BLK_DEV_INTEGRITY_T10=y CONFIG_BLK_DEV_WRITE_MOUNTED=y CONFIG_BLK_DEV_ZONED=y CONFIG_BLK_DEV_THROTTLING=y -CONFIG_BLK_DEV_THROTTLING_LOW=y CONFIG_BLK_WBT=y CONFIG_BLK_WBT_MQ=y CONFIG_BLK_CGROUP_IOLATENCY=y @@ -1064,7 +1085,6 @@ CONFIG_BLK_CGROUP_FC_APPID=y CONFIG_BLK_CGROUP_IOCOST=y CONFIG_BLK_CGROUP_IOPRIO=y CONFIG_BLK_DEBUG_FS=y -CONFIG_BLK_DEBUG_FS_ZONED=y CONFIG_BLK_SED_OPAL=y CONFIG_BLK_INLINE_ENCRYPTION=y CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK=y @@ -1147,7 +1167,6 @@ CONFIG_ZPOOL=y CONFIG_SWAP=y CONFIG_ZSWAP=y CONFIG_ZSWAP_DEFAULT_ON=y -# CONFIG_ZSWAP_EXCLUSIVE_LOADS_DEFAULT_ON is not set CONFIG_ZSWAP_SHRINKER_DEFAULT_ON=y # CONFIG_ZSWAP_COMPRESSOR_DEFAULT_DEFLATE is not set # CONFIG_ZSWAP_COMPRESSOR_DEFAULT_LZO is not set @@ -1186,7 +1205,7 @@ CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y CONFIG_SPARSEMEM_VMEMMAP=y CONFIG_ARCH_WANT_OPTIMIZE_DAX_VMEMMAP=y CONFIG_ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP=y -CONFIG_HAVE_FAST_GUP=y +CONFIG_HAVE_GUP_FAST=y CONFIG_NUMA_KEEP_MEMINFO=y CONFIG_MEMORY_ISOLATION=y CONFIG_EXCLUSIVE_SYSTEM_RAM=y @@ -1226,12 +1245,12 @@ CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y # CONFIG_TRANSPARENT_HUGEPAGE_NEVER is not set CONFIG_THP_SWAP=y CONFIG_READ_ONLY_THP_FOR_FS=y +CONFIG_PGTABLE_HAS_HUGE_LEAVES=y CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y CONFIG_USE_PERCPU_NUMA_NODE_ID=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y CONFIG_CMA=y -# CONFIG_CMA_DEBUG is not set CONFIG_CMA_DEBUGFS=y CONFIG_CMA_SYSFS=y CONFIG_CMA_AREAS=7 @@ -1273,6 +1292,7 @@ CONFIG_ARCH_SUPPORTS_PER_VMA_LOCK=y CONFIG_PER_VMA_LOCK=y CONFIG_LOCK_MM_AND_FIND_VMA=y CONFIG_IOMMU_MM_DATA=y +CONFIG_EXECMEM=y # # Data Access Monitoring @@ -1287,6 +1307,7 @@ CONFIG_NET_INGRESS=y CONFIG_NET_EGRESS=y CONFIG_NET_XGRESS=y CONFIG_NET_REDIRECT=y +CONFIG_SKB_DECRYPTED=y CONFIG_SKB_EXTENSIONS=y # @@ -1295,7 +1316,6 @@ CONFIG_SKB_EXTENSIONS=y CONFIG_PACKET=y CONFIG_PACKET_DIAG=m CONFIG_UNIX=y -CONFIG_UNIX_SCM=y CONFIG_AF_UNIX_OOB=y CONFIG_UNIX_DIAG=m CONFIG_TLS=m @@ -1318,6 +1338,7 @@ CONFIG_NET_KEY_MIGRATE=y CONFIG_XFRM_ESPINTCP=y CONFIG_SMC=m CONFIG_SMC_DIAG=m +CONFIG_SMC_LO=y CONFIG_XDP_SOCKETS=y CONFIG_XDP_SOCKETS_DIAG=m 
CONFIG_NET_HANDSHAKE=y @@ -1677,6 +1698,7 @@ CONFIG_IP_VS_PE_SIP=m # IP: Netfilter Configuration # CONFIG_NF_DEFRAG_IPV4=m +CONFIG_IP_NF_IPTABLES_LEGACY=m CONFIG_NF_SOCKET_IPV4=m CONFIG_NF_TPROXY_IPV4=m CONFIG_NF_TABLES_IPV4=y @@ -1709,6 +1731,7 @@ CONFIG_IP_NF_TARGET_TTL=m CONFIG_IP_NF_RAW=m CONFIG_IP_NF_SECURITY=m CONFIG_IP_NF_ARPTABLES=m +CONFIG_NFT_COMPAT_ARP=m CONFIG_IP_NF_ARPFILTER=m CONFIG_IP_NF_ARP_MANGLE=m # end of IP: Netfilter Configuration @@ -1716,6 +1739,7 @@ CONFIG_IP_NF_ARP_MANGLE=m # # IPv6: Netfilter Configuration # +CONFIG_IP6_NF_IPTABLES_LEGACY=m CONFIG_NF_SOCKET_IPV6=m CONFIG_NF_TPROXY_IPV6=m CONFIG_NF_TABLES_IPV6=y @@ -1753,6 +1777,7 @@ CONFIG_NF_TABLES_BRIDGE=m CONFIG_NFT_BRIDGE_META=m CONFIG_NFT_BRIDGE_REJECT=m CONFIG_NF_CONNTRACK_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES_LEGACY=m CONFIG_BRIDGE_NF_EBTABLES=m CONFIG_BRIDGE_EBT_BROUTE=m CONFIG_BRIDGE_EBT_T_FILTER=m @@ -1941,7 +1966,6 @@ CONFIG_NET_ACT_GACT=m CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=m CONFIG_NET_ACT_SAMPLE=m -CONFIG_NET_ACT_IPT=m CONFIG_NET_ACT_NAT=m CONFIG_NET_ACT_PEDIT=m CONFIG_NET_ACT_SIMP=m @@ -2111,6 +2135,7 @@ CONFIG_BT_MTKUART=m CONFIG_BT_HCIRSI=m CONFIG_BT_VIRTIO=m CONFIG_BT_NXPUART=m +CONFIG_BT_INTEL_PCIE=m # end of Bluetooth device drivers CONFIG_AF_RXRPC=m @@ -2224,6 +2249,7 @@ CONFIG_LWTUNNEL_BPF=y CONFIG_DST_CACHE=y CONFIG_GRO_CELLS=y CONFIG_SOCK_VALIDATE_XMIT=y +CONFIG_NET_IEEE8021Q_HELPERS=y CONFIG_NET_SELFTESTS=m CONFIG_NET_SOCK_MSG=y CONFIG_NET_DEVLINK=y @@ -2238,6 +2264,7 @@ CONFIG_ETHTOOL_NETLINK=y CONFIG_HAVE_EISA=y # CONFIG_EISA is not set CONFIG_HAVE_PCI=y +CONFIG_GENERIC_PCI_IOMAP=y CONFIG_PCI=y CONFIG_PCI_DOMAINS=y CONFIG_PCIEPORTBUS=y @@ -2330,7 +2357,6 @@ CONFIG_CXL_PORT=m CONFIG_CXL_SUSPEND=y CONFIG_CXL_REGION=y # CONFIG_CXL_REGION_INVALIDATION_TEST is not set -CONFIG_CXL_PMU=m CONFIG_PCCARD=m CONFIG_PCMCIA=m CONFIG_PCMCIA_LOAD_CIS=y @@ -2551,7 +2577,6 @@ CONFIG_MTD_ROM=m # # CONFIG_MTD_COMPLEX_MAPPINGS is not set # CONFIG_MTD_PHYSMAP is not set -# CONFIG_MTD_INTEL_VR_NOR is not set # CONFIG_MTD_PLATRAM is not set # end of Mapping drivers for chip access @@ -2629,6 +2654,7 @@ CONFIG_MTD_UBI_BEB_LIMIT=20 # CONFIG_MTD_UBI_FASTMAP is not set # CONFIG_MTD_UBI_GLUEBI is not set # CONFIG_MTD_UBI_BLOCK is not set +CONFIG_MTD_UBI_NVMEM=m # CONFIG_MTD_HYPERBUS is not set # CONFIG_OF is not set CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y @@ -2746,6 +2772,7 @@ CONFIG_DW_XDATA_PCIE=m CONFIG_PCI_ENDPOINT_TEST=m CONFIG_XILINX_SDFEC=m CONFIG_MISC_RTSX=m +CONFIG_NTSYNC=y CONFIG_TPS6594_ESM=m CONFIG_TPS6594_PFSM=m CONFIG_NSM=m @@ -3120,6 +3147,7 @@ CONFIG_DM_LOG_WRITES=m CONFIG_DM_INTEGRITY=m CONFIG_DM_ZONED=m CONFIG_DM_AUDIT=y +CONFIG_DM_VDO=m CONFIG_TARGET_CORE=m CONFIG_TCM_IBLOCK=m CONFIG_TCM_FILEIO=m @@ -3177,6 +3205,7 @@ CONFIG_VXLAN=m CONFIG_GENEVE=m CONFIG_BAREUDP=m CONFIG_GTP=m +CONFIG_PFCP=m CONFIG_AMT=m CONFIG_MACSEC=m CONFIG_NETCONSOLE=m @@ -3259,10 +3288,6 @@ CONFIG_NET_DSA_XRS700X=m CONFIG_NET_DSA_XRS700X_I2C=m CONFIG_NET_DSA_XRS700X_MDIO=m CONFIG_NET_DSA_REALTEK=m -# CONFIG_NET_DSA_REALTEK_MDIO is not set -# CONFIG_NET_DSA_REALTEK_SMI is not set -CONFIG_NET_DSA_REALTEK_RTL8365MB=m -CONFIG_NET_DSA_REALTEK_RTL8366RB=m CONFIG_NET_DSA_SMSC_LAN9303=m CONFIG_NET_DSA_SMSC_LAN9303_I2C=m CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m @@ -3399,6 +3424,8 @@ CONFIG_NET_VENDOR_HUAWEI=y CONFIG_HINIC=m CONFIG_NET_VENDOR_I825XX=y CONFIG_NET_VENDOR_INTEL=y +CONFIG_LIBETH=m +CONFIG_LIBIE=m CONFIG_E100=m CONFIG_E1000=m CONFIG_E1000E=m @@ -3424,6 +3451,7 @@ CONFIG_ICE_SWITCHDEV=y CONFIG_ICE_HWTS=y CONFIG_FM10K=m 
CONFIG_IGC=m +CONFIG_IGC_LEDS=y CONFIG_IDPF=m CONFIG_JME=m CONFIG_NET_VENDOR_ADI=y @@ -3437,6 +3465,7 @@ CONFIG_SKGE_GENESIS=y CONFIG_SKY2=m # CONFIG_SKY2_DEBUG is not set CONFIG_OCTEON_EP=m +CONFIG_OCTEON_EP_VF=m CONFIG_PRESTERA=m CONFIG_PRESTERA_PCI=m CONFIG_NET_VENDOR_MELLANOX=y @@ -3628,7 +3657,6 @@ CONFIG_FDDI=m CONFIG_DEFXX=m CONFIG_SKFP=m # CONFIG_HIPPI is not set -CONFIG_NET_SB1000=m CONFIG_PHYLINK=m CONFIG_PHYLIB=m CONFIG_SWPHY=y @@ -3639,6 +3667,7 @@ CONFIG_SFP=m # # MII PHY device drivers # +CONFIG_AIR_EN8811H_PHY=m CONFIG_AMD_PHY=m CONFIG_ADIN_PHY=m CONFIG_ADIN1100_PHY=m @@ -3676,7 +3705,10 @@ CONFIG_NXP_CBTX_PHY=m CONFIG_NXP_C45_TJA11XX_PHY=m CONFIG_NXP_TJA11XX_PHY=m CONFIG_NCN26000_PHY=m +CONFIG_QCOM_NET_PHYLIB=m CONFIG_AT803X_PHY=m +CONFIG_QCA83XX_PHY=m +CONFIG_QCA808X_PHY=m CONFIG_QSEMI_PHY=m CONFIG_REALTEK_PHY=m CONFIG_RENESAS_PHY=m @@ -3696,6 +3728,8 @@ CONFIG_XILINX_GMII2RGMII=m CONFIG_MICREL_KS8995MA=m CONFIG_PSE_CONTROLLER=y CONFIG_PSE_REGULATOR=m +CONFIG_PSE_PD692X0=m +CONFIG_PSE_TPS23881=m CONFIG_CAN_DEV=m CONFIG_CAN_VCAN=m CONFIG_CAN_VXCAN=m @@ -3710,10 +3744,10 @@ CONFIG_CAN_C_CAN=m CONFIG_CAN_C_CAN_PLATFORM=m CONFIG_CAN_C_CAN_PCI=m CONFIG_CAN_CC770=m -# CONFIG_CAN_CC770_ISA is not set CONFIG_CAN_CC770_PLATFORM=m CONFIG_CAN_CTUCANFD=m CONFIG_CAN_CTUCANFD_PCI=m +CONFIG_CAN_ESD_402_PCI=m CONFIG_CAN_IFI_CANFD=m CONFIG_CAN_M_CAN=m CONFIG_CAN_M_CAN_PCI=m @@ -3729,7 +3763,6 @@ CONFIG_CAN_PEAK_PCI=m CONFIG_CAN_PEAK_PCIEC=y CONFIG_CAN_PEAK_PCMCIA=m CONFIG_CAN_PLX_PCI=m -# CONFIG_CAN_SJA1000_ISA is not set CONFIG_CAN_SJA1000_PLATFORM=m CONFIG_CAN_SOFTING=m CONFIG_CAN_SOFTING_CS=m @@ -3927,6 +3960,7 @@ CONFIG_ATH11K_DEBUGFS=y CONFIG_ATH11K_SPECTRAL=y CONFIG_ATH12K=m CONFIG_ATH12K_DEBUG=y +CONFIG_ATH12K_DEBUGFS=y CONFIG_ATH12K_TRACING=y CONFIG_WLAN_VENDOR_ATMEL=y CONFIG_AT76C50X_USB=m @@ -4118,6 +4152,7 @@ CONFIG_RTLWIFI_PCI=m CONFIG_RTLWIFI_USB=m CONFIG_RTLWIFI_DEBUG=y CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8192D_COMMON=m CONFIG_RTL8723_COMMON=m CONFIG_RTLBTCOEXIST=m CONFIG_RTL8XXXU=m @@ -4129,6 +4164,8 @@ CONFIG_RTW88_SDIO=m CONFIG_RTW88_USB=m CONFIG_RTW88_8822B=m CONFIG_RTW88_8822C=m +CONFIG_RTW88_8723X=m +CONFIG_RTW88_8703B=m CONFIG_RTW88_8723D=m CONFIG_RTW88_8821C=m CONFIG_RTW88_8822BE=m @@ -4139,6 +4176,7 @@ CONFIG_RTW88_8822CS=m CONFIG_RTW88_8822CU=m CONFIG_RTW88_8723DE=m CONFIG_RTW88_8723DS=m +CONFIG_RTW88_8723CS=m CONFIG_RTW88_8723DU=m CONFIG_RTW88_8821CE=m CONFIG_RTW88_8821CS=m @@ -4152,10 +4190,12 @@ CONFIG_RTW89_8851B=m CONFIG_RTW89_8852A=m CONFIG_RTW89_8852B=m CONFIG_RTW89_8852C=m +CONFIG_RTW89_8922A=m CONFIG_RTW89_8851BE=m CONFIG_RTW89_8852AE=m CONFIG_RTW89_8852BE=m CONFIG_RTW89_8852CE=m +CONFIG_RTW89_8922AE=m CONFIG_RTW89_DEBUG=y CONFIG_RTW89_DEBUGMSG=y CONFIG_RTW89_DEBUGFS=y @@ -4409,6 +4449,9 @@ CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m CONFIG_TOUCHSCREEN_EXC3000=m CONFIG_TOUCHSCREEN_FUJITSU=m CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_GOODIX_BERLIN_CORE=m +CONFIG_TOUCHSCREEN_GOODIX_BERLIN_I2C=m +CONFIG_TOUCHSCREEN_GOODIX_BERLIN_SPI=m CONFIG_TOUCHSCREEN_HIDEEP=m CONFIG_TOUCHSCREEN_HYCON_HY46XX=m CONFIG_TOUCHSCREEN_HYNITRON_CSTXXX=m @@ -4597,7 +4640,6 @@ CONFIG_VT=y CONFIG_CONSOLE_TRANSLATIONS=y CONFIG_VT_CONSOLE=y CONFIG_VT_CONSOLE_SLEEP=y -CONFIG_HW_CONSOLE=y CONFIG_VT_HW_CONSOLE_BINDING=y CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set @@ -4648,10 +4690,9 @@ CONFIG_SERIAL_CORE_CONSOLE=y CONFIG_SERIAL_JSM=m # CONFIG_SERIAL_LANTIQ is not set CONFIG_SERIAL_SCCNXP=m -CONFIG_SERIAL_SC16IS7XX_CORE=m CONFIG_SERIAL_SC16IS7XX=m -CONFIG_SERIAL_SC16IS7XX_I2C=y 
-CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_SC16IS7XX_I2C=m +CONFIG_SERIAL_SC16IS7XX_SPI=m CONFIG_SERIAL_ALTERA_JTAGUART=m CONFIG_SERIAL_ALTERA_UART=m CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 @@ -4672,7 +4713,7 @@ CONFIG_MOXA_INTELLIO=m CONFIG_MOXA_SMARTIO=m CONFIG_N_HDLC=m CONFIG_IPWIRELESS=m -CONFIG_N_GSM=m +# CONFIG_N_GSM is not set CONFIG_NOZOMI=m CONFIG_NULL_TTY=m CONFIG_HVC_DRIVER=y @@ -4715,6 +4756,7 @@ CONFIG_HPET=y # CONFIG_HPET_MMAP is not set CONFIG_HANGCHECK_TIMER=m CONFIG_TCG_TPM=y +CONFIG_TCG_TPM2_HMAC=y CONFIG_HW_RANDOM_TPM=y CONFIG_TCG_TIS_CORE=y CONFIG_TCG_TIS=y @@ -4783,6 +4825,7 @@ CONFIG_I2C_AMD756_S4882=m CONFIG_I2C_AMD8111=m CONFIG_I2C_AMD_MP2=m CONFIG_I2C_I801=m +CONFIG_I2C_I801_MUX=y CONFIG_I2C_ISCH=m CONFIG_I2C_ISMT=m CONFIG_I2C_PIIX4=m @@ -4796,6 +4839,7 @@ CONFIG_I2C_SIS630=m CONFIG_I2C_SIS96X=m CONFIG_I2C_VIA=m CONFIG_I2C_VIAPRO=m +CONFIG_I2C_ZHAOXIN=m # # ACPI drivers @@ -4936,6 +4980,7 @@ CONFIG_PTP_1588_CLOCK_INES=m CONFIG_PTP_1588_CLOCK_KVM=m CONFIG_PTP_1588_CLOCK_IDT82P33=m CONFIG_PTP_1588_CLOCK_IDTCM=m +CONFIG_PTP_1588_CLOCK_FC3W=m CONFIG_PTP_1588_CLOCK_MOCK=m CONFIG_PTP_1588_CLOCK_VMW=m CONFIG_PTP_1588_CLOCK_OCP=m @@ -5002,6 +5047,7 @@ CONFIG_GPIO_CDEV=y CONFIG_GPIO_CDEV_V1=y CONFIG_GPIO_GENERIC=m CONFIG_GPIO_REGMAP=m +CONFIG_GPIO_SWNODE_UNDEFINED=y CONFIG_GPIO_MAX730X=m CONFIG_GPIO_IDIO_16=m @@ -5012,6 +5058,7 @@ CONFIG_GPIO_AMDPT=m CONFIG_GPIO_DWAPB=m CONFIG_GPIO_EXAR=m CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_GRANITERAPIDS=m CONFIG_GPIO_ICH=m CONFIG_GPIO_MB86S7X=m CONFIG_GPIO_MENZ127=m @@ -5052,6 +5099,7 @@ CONFIG_GPIO_TPIC2810=m CONFIG_GPIO_ADP5520=m CONFIG_GPIO_ARIZONA=m CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_CROS_EC=m CONFIG_GPIO_CRYSTAL_COVE=m CONFIG_GPIO_DA9052=m CONFIG_GPIO_DA9055=m @@ -5127,6 +5175,7 @@ CONFIG_W1_MASTER_DS2490=m CONFIG_W1_MASTER_DS2482=m CONFIG_W1_MASTER_GPIO=m CONFIG_W1_MASTER_SGI=m +CONFIG_W1_MASTER_UART=m # end of 1-wire Bus Masters # @@ -5271,6 +5320,7 @@ CONFIG_SENSORS_AHT10=m CONFIG_SENSORS_AQUACOMPUTER_D5NEXT=m CONFIG_SENSORS_AS370=m CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_ASUS_ROG_RYUJIN=m CONFIG_SENSORS_AXI_FAN_CONTROL=m CONFIG_SENSORS_K8TEMP=m CONFIG_SENSORS_K10TEMP=m @@ -5278,6 +5328,7 @@ CONFIG_SENSORS_FAM15H_POWER=m CONFIG_SENSORS_APPLESMC=m CONFIG_SENSORS_ASB100=m CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_CHIPCAP2=m CONFIG_SENSORS_CORSAIR_CPRO=m CONFIG_SENSORS_CORSAIR_PSU=m CONFIG_SENSORS_DRIVETEMP=m @@ -5310,6 +5361,7 @@ CONFIG_SENSORS_IT87=m CONFIG_SENSORS_JC42=m CONFIG_SENSORS_POWERZ=m CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LENOVO_EC=m CONFIG_SENSORS_LINEAGE=m CONFIG_SENSORS_LTC2945=m CONFIG_SENSORS_LTC2947=m @@ -5324,6 +5376,7 @@ CONFIG_SENSORS_LTC4222=m CONFIG_SENSORS_LTC4245=m CONFIG_SENSORS_LTC4260=m CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_LTC4282=m CONFIG_SENSORS_MAX1111=m CONFIG_SENSORS_MAX127=m CONFIG_SENSORS_MAX16065=m @@ -5375,6 +5428,7 @@ CONFIG_SENSORS_NCT7802=m CONFIG_SENSORS_NCT7904=m CONFIG_SENSORS_NPCM7XX=m CONFIG_SENSORS_NZXT_KRAKEN2=m +CONFIG_SENSORS_NZXT_KRAKEN3=m CONFIG_SENSORS_NZXT_SMART2=m CONFIG_SENSORS_OCC_P8_I2C=m CONFIG_SENSORS_OCC=m @@ -5385,6 +5439,7 @@ CONFIG_SENSORS_PMBUS=m CONFIG_SENSORS_ACBEL_FSG032=m CONFIG_SENSORS_ADM1266=m CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_ADP1050=m CONFIG_SENSORS_BEL_PFE=m CONFIG_SENSORS_BPA_RS600=m CONFIG_SENSORS_DELTA_AHE50DC_FAN=m @@ -5421,6 +5476,7 @@ CONFIG_SENSORS_MP5023=m CONFIG_SENSORS_MP5990=m CONFIG_SENSORS_MPQ7932_REGULATOR=y CONFIG_SENSORS_MPQ7932=m +CONFIG_SENSORS_MPQ8785=m CONFIG_SENSORS_PIM4328=m CONFIG_SENSORS_PLI1209BC=m 
CONFIG_SENSORS_PLI1209BC_REGULATOR=y @@ -5435,10 +5491,13 @@ CONFIG_SENSORS_TPS53679=m CONFIG_SENSORS_TPS546D24=m CONFIG_SENSORS_UCD9000=m CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_XDP710=m CONFIG_SENSORS_XDPE152=m CONFIG_SENSORS_XDPE122=m CONFIG_SENSORS_XDPE122_REGULATOR=y CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_PT5161L=m +CONFIG_SENSORS_PWM_FAN=m CONFIG_SENSORS_SBTSI=m CONFIG_SENSORS_SBRMI=m CONFIG_SENSORS_SHT15=m @@ -5460,6 +5519,7 @@ CONFIG_SENSORS_SCH56XX_COMMON=m CONFIG_SENSORS_SCH5627=m CONFIG_SENSORS_SCH5636=m CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SURFACE_FAN=m CONFIG_SENSORS_ADC128D818=m CONFIG_SENSORS_ADS7828=m CONFIG_SENSORS_ADS7871=m @@ -5511,7 +5571,6 @@ CONFIG_THERMAL_NETLINK=y # CONFIG_THERMAL_DEBUGFS is not set CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=100 CONFIG_THERMAL_HWMON=y -CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y # CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set # CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set @@ -5574,10 +5633,12 @@ CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y # CONFIG_SOFT_WATCHDOG=m # CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_CROS_EC_WATCHDOG=m CONFIG_DA9052_WATCHDOG=m CONFIG_DA9055_WATCHDOG=m CONFIG_DA9063_WATCHDOG=m CONFIG_DA9062_WATCHDOG=m +CONFIG_LENOVO_SE10_WDT=m CONFIG_MENF21BMC_WATCHDOG=m CONFIG_MENZ069_WATCHDOG=m CONFIG_WDAT_WDT=m @@ -6004,6 +6065,7 @@ CONFIG_VIDEO_V4L2_SUBDEV_API=y # CONFIG_VIDEO_ADV_DEBUG is not set # CONFIG_VIDEO_FIXED_MINOR_RANGES is not set CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_LOOPBACK=m CONFIG_V4L2_MEM2MEM_DEV=m CONFIG_V4L2_FLASH_LED_CLASS=m CONFIG_V4L2_FWNODE=m @@ -6258,6 +6320,7 @@ CONFIG_DVB_BUDGET=m CONFIG_DVB_BUDGET_CI=m CONFIG_DVB_BUDGET_AV=m CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_VIDEO_INTEL_IPU6=m CONFIG_INTEL_VSC=m CONFIG_IPU_BRIDGE=m CONFIG_RADIO_ADAPTERS=m @@ -6800,8 +6863,8 @@ CONFIG_DVB_DUMMY_FE=m # Graphics support # CONFIG_APERTURE_HELPERS=y -CONFIG_VIDEO_CMDLINE=y -CONFIG_VIDEO_NOMODESET=y +CONFIG_SCREEN_INFO=y +CONFIG_VIDEO=y # CONFIG_AUXDISPLAY is not set # CONFIG_PANEL is not set CONFIG_AGP=y @@ -6820,11 +6883,12 @@ CONFIG_DRM_FBDEV_EMULATION=y CONFIG_DRM_FBDEV_OVERALLOC=100 CONFIG_DRM_LOAD_EDID_FIRMWARE=y CONFIG_DRM_DISPLAY_HELPER=m +CONFIG_DRM_DISPLAY_DP_AUX_CEC=y +CONFIG_DRM_DISPLAY_DP_AUX_CHARDEV=y CONFIG_DRM_DISPLAY_DP_HELPER=y +CONFIG_DRM_DISPLAY_DP_TUNNEL=y CONFIG_DRM_DISPLAY_HDCP_HELPER=y CONFIG_DRM_DISPLAY_HDMI_HELPER=y -CONFIG_DRM_DP_AUX_CHARDEV=y -CONFIG_DRM_DP_CEC=y CONFIG_DRM_TTM=m CONFIG_DRM_EXEC=m CONFIG_DRM_GPUVM=m @@ -6890,6 +6954,7 @@ CONFIG_DRM_I915_COMPRESS_ERROR=y CONFIG_DRM_I915_USERPTR=y CONFIG_DRM_I915_GVT_KVMGT=m CONFIG_DRM_I915_PXP=y +CONFIG_DRM_I915_DP_TUNNEL=y CONFIG_DRM_I915_REQUEST_TIMEOUT=20000 CONFIG_DRM_I915_FENCE_TIMEOUT=10000 CONFIG_DRM_I915_USERFAULT_AUTOSUSPEND=250 @@ -6928,6 +6993,7 @@ CONFIG_DRM_PANEL=y # Display Panels # CONFIG_DRM_PANEL_AUO_A030JTN01=m +CONFIG_DRM_PANEL_ILITEK_ILI9341=m CONFIG_DRM_PANEL_ORISETECH_OTA5601A=m CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m CONFIG_DRM_PANEL_WIDECHIPS_WS2401=m @@ -6944,6 +7010,7 @@ CONFIG_DRM_ANALOGIX_DP=m # end of Display Interface Bridges # CONFIG_DRM_ETNAVIV is not set +CONFIG_DRM_APPLETBDRM=m CONFIG_DRM_BOCHS=m CONFIG_DRM_CIRRUS_QEMU=m CONFIG_DRM_GM12U320=m @@ -6966,8 +7033,8 @@ CONFIG_DRM_SSD130X=m CONFIG_DRM_SSD130X_I2C=m CONFIG_DRM_SSD130X_SPI=m CONFIG_DRM_HYPERV=m -CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y CONFIG_DRM_PRIVACY_SCREEN=y +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y # # Frame buffer Devices @@ -7058,6 +7125,7 @@ CONFIG_LCD_HX8357=m CONFIG_LCD_OTM3225A=m 
CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_BACKLIGHT_KTD253=m +CONFIG_BACKLIGHT_KTD2801=m CONFIG_BACKLIGHT_KTZ8866=m CONFIG_BACKLIGHT_LM3533=m CONFIG_BACKLIGHT_PWM=m @@ -7271,6 +7339,7 @@ CONFIG_SND_HDA_PATCH_LOADER=y CONFIG_SND_HDA_CIRRUS_SCODEC=m CONFIG_SND_HDA_SCODEC_CS35L41=m CONFIG_SND_HDA_CS_DSP_CONTROLS=m +CONFIG_SND_HDA_SCODEC_COMPONENT=m CONFIG_SND_HDA_SCODEC_CS35L41_I2C=m CONFIG_SND_HDA_SCODEC_CS35L41_SPI=m CONFIG_SND_HDA_SCODEC_CS35L56=m @@ -7373,7 +7442,11 @@ CONFIG_SND_AMD_ASOC_ACP70=m CONFIG_SND_SOC_AMD_MACH_COMMON=m CONFIG_SND_SOC_AMD_LEGACY_MACH=m CONFIG_SND_SOC_AMD_SOF_MACH=m +CONFIG_SND_AMD_SOUNDWIRE_ACPI=m CONFIG_SND_SOC_AMD_RPL_ACP6x=m +CONFIG_SND_SOC_AMD_ACP63_TOPLEVEL=m +CONFIG_SND_SOC_AMD_SOUNDWIRE_LINK_BASELINE=m +CONFIG_SND_SOC_AMD_SOUNDWIRE=m CONFIG_SND_SOC_AMD_PS=m CONFIG_SND_SOC_AMD_PS_MACH=m CONFIG_SND_ATMEL_SOC=m @@ -7463,7 +7536,6 @@ CONFIG_SND_SOC_INTEL_SOF_MAXIM_COMMON=m CONFIG_SND_SOC_INTEL_SOF_REALTEK_COMMON=m CONFIG_SND_SOC_INTEL_SOF_CIRRUS_COMMON=m CONFIG_SND_SOC_INTEL_SOF_NUVOTON_COMMON=m -CONFIG_SND_SOC_INTEL_SOF_SSP_COMMON=m CONFIG_SND_SOC_INTEL_SOF_BOARD_HELPERS=m CONFIG_SND_SOC_INTEL_HASWELL_MACH=m CONFIG_SND_SOC_INTEL_BDW_RT5650_MACH=m @@ -7484,7 +7556,6 @@ CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m CONFIG_SND_SOC_INTEL_DA7219_MAX98357A_GENERIC=m -CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_COMMON=m CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m CONFIG_SND_SOC_INTEL_SOF_WM8804_MACH=m @@ -7525,6 +7596,8 @@ CONFIG_SND_SOC_SOF_AMD_RENOIR=m CONFIG_SND_SOC_SOF_AMD_VANGOGH=m CONFIG_SND_SOC_SOF_AMD_REMBRANDT=m CONFIG_SND_SOC_SOF_ACP_PROBES=m +CONFIG_SND_SOC_SOF_AMD_SOUNDWIRE_LINK_BASELINE=m +CONFIG_SND_SOC_SOF_AMD_SOUNDWIRE=m CONFIG_SND_SOC_SOF_AMD_ACP63=m CONFIG_SND_SOC_SOF_INTEL_TOPLEVEL=y CONFIG_SND_SOC_SOF_INTEL_HIFI_EP_IPC=m @@ -7554,6 +7627,7 @@ CONFIG_SND_SOC_SOF_METEORLAKE=m CONFIG_SND_SOC_SOF_INTEL_LNL=m CONFIG_SND_SOC_SOF_LUNARLAKE=m CONFIG_SND_SOC_SOF_HDA_COMMON=m +CONFIG_SND_SOC_SOF_HDA_GENERIC=m CONFIG_SND_SOC_SOF_HDA_MLINK=m CONFIG_SND_SOC_SOF_HDA_LINK=y CONFIG_SND_SOC_SOF_HDA_AUDIO_CODEC=y @@ -7615,6 +7689,7 @@ CONFIG_SND_SOC_BD28623=m # CONFIG_SND_SOC_BT_SCO is not set CONFIG_SND_SOC_CHV3_CODEC=m CONFIG_SND_SOC_CROS_EC_CODEC=m +CONFIG_SND_SOC_CS_AMP_LIB=m CONFIG_SND_SOC_CS35L32=m CONFIG_SND_SOC_CS35L33=m CONFIG_SND_SOC_CS35L34=m @@ -7674,7 +7749,6 @@ CONFIG_SND_SOC_HDAC_HDA=m CONFIG_SND_SOC_HDA=m CONFIG_SND_SOC_ICS43432=m CONFIG_SND_SOC_IDT821034=m -CONFIG_SND_SOC_INNO_RK3036=m CONFIG_SND_SOC_MAX98088=m CONFIG_SND_SOC_MAX98090=m CONFIG_SND_SOC_MAX98357A=m @@ -7710,8 +7784,8 @@ CONFIG_SND_SOC_PCM5102A=m CONFIG_SND_SOC_PCM512x=m CONFIG_SND_SOC_PCM512x_I2C=m CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_PCM6240=m CONFIG_SND_SOC_PEB2466=m -CONFIG_SND_SOC_RK3328=m CONFIG_SND_SOC_RL6231=m CONFIG_SND_SOC_RL6347A=m CONFIG_SND_SOC_RT274=m @@ -7815,6 +7889,8 @@ CONFIG_SND_SOC_WCD_MBHC=m CONFIG_SND_SOC_WCD934X=m CONFIG_SND_SOC_WCD938X=m CONFIG_SND_SOC_WCD938X_SDW=m +CONFIG_SND_SOC_WCD939X=m +CONFIG_SND_SOC_WCD939X_SDW=m CONFIG_SND_SOC_WM5102=m CONFIG_SND_SOC_WM8510=m CONFIG_SND_SOC_WM8523=m @@ -7891,6 +7967,9 @@ CONFIG_HID_ACRUX=m CONFIG_HID_ACRUX_FF=y CONFIG_HID_APPLE=m CONFIG_HID_APPLEIR=m +CONFIG_HID_APPLETB_BL=m +CONFIG_HID_APPLETB_KBD=m +CONFIG_HID_APPLE_MAGIC_BACKLIGHT=m CONFIG_HID_ASUS=m CONFIG_HID_AUREAL=m CONFIG_HID_BELKIN=m @@ -8009,6 +8088,7 @@ CONFIG_HID_UDRAW_PS3=m CONFIG_HID_U2FZERO=m CONFIG_HID_WACOM=m 
CONFIG_HID_WIIMOTE=m +CONFIG_HID_WINWING=m CONFIG_HID_XINMO=m CONFIG_HID_ZEROPLUS=m CONFIG_ZEROPLUS_FF=y @@ -8082,6 +8162,7 @@ CONFIG_USB_DEFAULT_PERSIST=y # CONFIG_USB_OTG_PRODUCTLIST is not set CONFIG_USB_LEDS_TRIGGER_USBPORT=m CONFIG_USB_AUTOSUSPEND_DELAY=2 +CONFIG_USB_DEFAULT_AUTHORIZATION_MODE=1 CONFIG_USB_MON=m # @@ -8089,7 +8170,7 @@ CONFIG_USB_MON=m # CONFIG_USB_C67X00_HCD=m CONFIG_USB_XHCI_HCD=y -# CONFIG_USB_XHCI_DBGCAP is not set +CONFIG_USB_XHCI_DBGCAP=y CONFIG_USB_XHCI_PCI=m CONFIG_USB_XHCI_PCI_RENESAS=m CONFIG_USB_XHCI_PLATFORM=m @@ -8469,6 +8550,7 @@ CONFIG_TYPEC_MUX_FSA4480=m CONFIG_TYPEC_MUX_GPIO_SBU=m CONFIG_TYPEC_MUX_PI3USB30532=m CONFIG_TYPEC_MUX_INTEL_PMC=m +CONFIG_TYPEC_MUX_IT5205=m CONFIG_TYPEC_MUX_NB7VPQ904M=m CONFIG_TYPEC_MUX_PTN36502=m CONFIG_TYPEC_MUX_WCD939X_USBSS=m @@ -8544,6 +8626,7 @@ CONFIG_MEMSTICK_JMICRON_38X=m CONFIG_MEMSTICK_R592=m CONFIG_MEMSTICK_REALTEK_PCI=m CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_LEDS_EXPRESSWIRE=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_CLASS_FLASH=m @@ -8643,9 +8726,7 @@ CONFIG_LEDS_TRIGGER_CAMERA=m CONFIG_LEDS_TRIGGER_PANIC=y CONFIG_LEDS_TRIGGER_NETDEV=m CONFIG_LEDS_TRIGGER_PATTERN=m -CONFIG_LEDS_TRIGGER_AUDIO=m CONFIG_LEDS_TRIGGER_TTY=m -CONFIG_LEDS_TRIGGER_BLKDEV=m # # Simple LED drivers @@ -8798,6 +8879,7 @@ CONFIG_RTC_DRV_RC5T583=m CONFIG_RTC_DRV_S35390A=m CONFIG_RTC_DRV_FM3130=m CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8111=m CONFIG_RTC_DRV_RX8581=m CONFIG_RTC_DRV_RX8025=m CONFIG_RTC_DRV_EM3027=m @@ -8941,7 +9023,6 @@ CONFIG_UIO_AEC=m CONFIG_UIO_SERCOS3=m CONFIG_UIO_PCI_GENERIC=m CONFIG_UIO_NETX=m -CONFIG_UIO_PRUSS=m CONFIG_UIO_MF624=m CONFIG_UIO_HV_GENERIC=m CONFIG_UIO_DFL=m @@ -8966,10 +9047,11 @@ CONFIG_VFIO_PCI_IGD=y CONFIG_MLX5_VFIO_PCI=m CONFIG_PDS_VFIO_PCI=m CONFIG_VIRTIO_VFIO_PCI=m +CONFIG_QAT_VFIO_PCI=m # end of VFIO support for PCI devices CONFIG_VFIO_MDEV=m -CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_IRQ_BYPASS_MANAGER=y CONFIG_VIRT_DRIVERS=y CONFIG_VMGENID=y CONFIG_VBOXGUEST=m @@ -8981,10 +9063,10 @@ CONFIG_SEV_GUEST=m CONFIG_TDX_GUEST_DRIVER=m CONFIG_VIRTIO_ANCHOR=y CONFIG_VIRTIO=y -CONFIG_VIRTIO_PCI_LIB=m -CONFIG_VIRTIO_PCI_LIB_LEGACY=m +CONFIG_VIRTIO_PCI_LIB=y +CONFIG_VIRTIO_PCI_LIB_LEGACY=y CONFIG_VIRTIO_MENU=y -CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_PCI_ADMIN_LEGACY=y CONFIG_VIRTIO_PCI_LEGACY=y CONFIG_VIRTIO_VDPA=m @@ -8995,6 +9077,7 @@ CONFIG_VIRTIO_INPUT=m CONFIG_VIRTIO_MMIO=m CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y CONFIG_VIRTIO_DMA_SHARED_BUFFER=m +CONFIG_VIRTIO_DEBUG=y CONFIG_VDPA=m CONFIG_VDPA_SIM=m CONFIG_VDPA_SIM_NET=m @@ -9072,7 +9155,6 @@ CONFIG_XEN_VIRTIO=y # CONFIG_GREYBUS is not set # CONFIG_COMEDI is not set CONFIG_STAGING=y -CONFIG_PRISM2_USB=m CONFIG_RTLLIB=m CONFIG_RTLLIB_CRYPTO_CCMP=m CONFIG_RTLLIB_CRYPTO_TKIP=m @@ -9144,9 +9226,10 @@ CONFIG_MOST_NET=m CONFIG_MOST_VIDEO=m CONFIG_MOST_I2C=m CONFIG_KS7010=m -CONFIG_PI433=m CONFIG_FIELDBUS_DEV=m # CONFIG_VME_BUS is not set +CONFIG_APPLE_BCE=m +# CONFIG_GOLDFISH is not set CONFIG_CHROME_PLATFORMS=y CONFIG_CHROMEOS_ACPI=m CONFIG_CHROMEOS_LAPTOP=m @@ -9205,13 +9288,14 @@ CONFIG_NVIDIA_WMI_EC_BACKLIGHT=m CONFIG_XIAOMI_WMI=m CONFIG_GIGABYTE_WMI=m CONFIG_YOGABOOK=m -CONFIG_LEGION_LAPTOP=m +CONFIG_YT2_1380=m CONFIG_ACERHDF=m CONFIG_ACER_WIRELESS=m CONFIG_ACER_WMI=m CONFIG_AMD_PMF=m # CONFIG_AMD_PMF_DEBUG is not set CONFIG_AMD_PMC=m +CONFIG_AMD_MP2_STB=y CONFIG_AMD_HSMP=m CONFIG_AMD_WBRF=y CONFIG_ADV_SWBUTTON=m @@ -9234,6 +9318,7 @@ CONFIG_DELL_SMBIOS=m CONFIG_DELL_SMBIOS_WMI=y CONFIG_DELL_SMBIOS_SMM=y CONFIG_DELL_SMO8800=m 
+CONFIG_DELL_UART_BACKLIGHT=m CONFIG_DELL_WMI=m CONFIG_DELL_WMI_PRIVACY=y CONFIG_DELL_WMI_AIO=m @@ -9262,7 +9347,6 @@ CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y CONFIG_THINKPAD_ACPI_VIDEO=y CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y CONFIG_THINKPAD_LMI=m -CONFIG_LEGION_LAPTOP=m CONFIG_INTEL_ATOMISP2_PDX86=y CONFIG_INTEL_ATOMISP2_LED=m CONFIG_INTEL_ATOMISP2_PM=m @@ -9310,9 +9394,12 @@ CONFIG_INTEL_SMARTCONNECT=m CONFIG_INTEL_TPMI=m CONFIG_INTEL_TURBO_MAX_3=y CONFIG_INTEL_VSEC=m +CONFIG_ACPI_QUICKSTART=m +CONFIG_MEEGOPAD_ANX7428=m CONFIG_MSI_EC=m CONFIG_MSI_LAPTOP=m CONFIG_MSI_WMI=m +CONFIG_MSI_WMI_PLATFORM=m CONFIG_PCENGINES_APU2=m CONFIG_BARCO_P50_GPIO=m CONFIG_SAMSUNG_LAPTOP=m @@ -9333,6 +9420,7 @@ CONFIG_SERIAL_MULTI_INSTANTIATE=m CONFIG_MLX_PLATFORM=m CONFIG_TOUCHSCREEN_DMI=y CONFIG_INSPUR_PLATFORM_PROFILE=m +CONFIG_LENOVO_WMI_CAMERA=m CONFIG_X86_ANDROID_TABLETS=m CONFIG_FW_ATTR_CLASS=m CONFIG_INTEL_IPS=m @@ -9349,7 +9437,6 @@ CONFIG_SIEMENS_SIMATIC_IPC_BATT_F7188X=m CONFIG_SILICOM_PLATFORM=m CONFIG_WINMATE_FM07_KEYS=m CONFIG_SEL3350_PLATFORM=m -CONFIG_STEAMDECK=m CONFIG_P2SB=y CONFIG_HAVE_CLK=y CONFIG_HAVE_CLK_PREPARE=y @@ -9398,6 +9485,7 @@ CONFIG_IOMMU_DEFAULT_DMA_LAZY=y # CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set CONFIG_IOMMU_DMA=y CONFIG_IOMMU_SVA=y +CONFIG_IOMMU_IOPF=y CONFIG_AMD_IOMMU=y CONFIG_DMAR_TABLE=y CONFIG_INTEL_IOMMU=y @@ -9573,6 +9661,7 @@ CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 CONFIG_IIO_SW_DEVICE=m CONFIG_IIO_SW_TRIGGER=m CONFIG_IIO_TRIGGERED_EVENT=m +CONFIG_IIO_BACKEND=m # # Accelerometers @@ -9599,6 +9688,7 @@ CONFIG_BMC150_ACCEL=m CONFIG_BMC150_ACCEL_I2C=m CONFIG_BMC150_ACCEL_SPI=m CONFIG_BMI088_ACCEL=m +CONFIG_BMI088_ACCEL_I2C=m CONFIG_BMI088_ACCEL_SPI=m CONFIG_DA280=m CONFIG_DA311=m @@ -9647,6 +9737,7 @@ CONFIG_AD7091R=m CONFIG_AD7091R5=m CONFIG_AD7091R8=m CONFIG_AD7124=m +CONFIG_AD7173=m CONFIG_AD7192=m CONFIG_AD7266=m CONFIG_AD7280=m @@ -9664,8 +9755,11 @@ CONFIG_AD7791=m CONFIG_AD7793=m CONFIG_AD7887=m CONFIG_AD7923=m +CONFIG_AD7944=m CONFIG_AD7949=m CONFIG_AD799X=m +CONFIG_AD9467=m +CONFIG_ADI_AXI_ADC=m CONFIG_AXP20X_ADC=m CONFIG_AXP288_ADC=m CONFIG_CC10001_ADC=m @@ -9701,6 +9795,7 @@ CONFIG_MEDIATEK_MT6370_ADC=m CONFIG_MEN_Z188_ADC=m CONFIG_MP2629_ADC=m CONFIG_NAU7802=m +CONFIG_PAC1934=m CONFIG_PALMAS_GPADC=m CONFIG_RICHTEK_RTQ6056=m CONFIG_SD_ADC_MODULATOR=m @@ -9714,6 +9809,7 @@ CONFIG_TI_ADC161S626=m CONFIG_TI_ADS1015=m CONFIG_TI_ADS7924=m CONFIG_TI_ADS1100=m +CONFIG_TI_ADS1298=m CONFIG_TI_ADS7950=m CONFIG_TI_ADS8344=m CONFIG_TI_ADS8688=m @@ -9827,6 +9923,8 @@ CONFIG_AD5592R=m CONFIG_AD5593R=m CONFIG_AD5504=m CONFIG_AD5624R_SPI=m +CONFIG_AD9739A=m +CONFIG_ADI_AXI_DAC=m CONFIG_LTC2688=m CONFIG_AD5686=m CONFIG_AD5686_SPI=m @@ -9888,6 +9986,7 @@ CONFIG_AD9523=m CONFIG_ADF4350=m CONFIG_ADF4371=m CONFIG_ADF4377=m +CONFIG_ADMFM2000=m CONFIG_ADMV1013=m CONFIG_ADMV1014=m CONFIG_ADMV4420=m @@ -9996,6 +10095,7 @@ CONFIG_ADUX1020=m CONFIG_AL3010=m CONFIG_AL3320A=m CONFIG_APDS9300=m +CONFIG_APDS9306=m CONFIG_APDS9960=m CONFIG_AS73211=m CONFIG_BH1750=m @@ -10152,6 +10252,8 @@ CONFIG_MPL115_I2C=m CONFIG_MPL115_SPI=m CONFIG_MPL3115=m CONFIG_MPRLS0025PA=m +CONFIG_MPRLS0025PA_I2C=m +CONFIG_MPRLS0025PA_SPI=m CONFIG_MS5611=m CONFIG_MS5611_I2C=m CONFIG_MS5611_SPI=m @@ -10235,7 +10337,6 @@ CONFIG_NTB_SWITCHTEC=m # CONFIG_NTB_MSI_TEST is not set CONFIG_NTB_TRANSPORT=m CONFIG_PWM=y -CONFIG_PWM_SYSFS=y # CONFIG_PWM_DEBUG is not set CONFIG_PWM_CLK=m CONFIG_PWM_CRC=m @@ -10261,6 +10362,7 @@ CONFIG_IPACK_BUS=m CONFIG_BOARD_TPCI200=m CONFIG_SERIAL_IPOCTAL=m CONFIG_RESET_CONTROLLER=y 
+CONFIG_RESET_GPIO=m CONFIG_RESET_TI_SYSCON=m CONFIG_RESET_TI_TPS380X=m @@ -10301,11 +10403,14 @@ CONFIG_MCB_LPC=m # Performance monitor support # CONFIG_DWC_PCIE_PMU=m +CONFIG_CXL_PMU=m # end of Performance monitor support CONFIG_RAS=y CONFIG_RAS_CEC=y # CONFIG_RAS_CEC_DEBUG is not set +CONFIG_AMD_ATL=m +CONFIG_RAS_FMPM=m CONFIG_USB4=m # CONFIG_USB4_DEBUGFS_WRITE is not set # CONFIG_USB4_DMA_TEST is not set @@ -10366,6 +10471,8 @@ CONFIG_FPGA=m CONFIG_ALTERA_PR_IP_CORE=m CONFIG_FPGA_MGR_ALTERA_PS_SPI=m CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_CORE=m +CONFIG_FPGA_MGR_XILINX_SELECTMAP=m CONFIG_FPGA_MGR_XILINX_SPI=m CONFIG_FPGA_MGR_MACHXO2_SPI=m CONFIG_FPGA_BRIDGE=m @@ -10451,6 +10558,9 @@ CONFIG_XFS_QUOTA=y CONFIG_XFS_POSIX_ACL=y CONFIG_XFS_RT=y CONFIG_XFS_DRAIN_INTENTS=y +CONFIG_XFS_LIVE_HOOKS=y +CONFIG_XFS_MEMORY_BUFS=y +CONFIG_XFS_BTREE_IN_MEM=y CONFIG_XFS_ONLINE_SCRUB=y # CONFIG_XFS_ONLINE_SCRUB_STATS is not set CONFIG_XFS_ONLINE_REPAIR=y @@ -10520,10 +10630,11 @@ CONFIG_QFMT_V1=m CONFIG_QFMT_V2=m CONFIG_QUOTACTL=y CONFIG_AUTOFS_FS=y -CONFIG_FUSE_FS=m +CONFIG_FUSE_FS=y CONFIG_CUSE=m -CONFIG_VIRTIO_FS=m +CONFIG_VIRTIO_FS=y CONFIG_FUSE_DAX=y +CONFIG_FUSE_PASSTHROUGH=y CONFIG_OVERLAY_FS=m CONFIG_OVERLAY_FS_REDIRECT_DIR=y # CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW is not set @@ -10566,11 +10677,11 @@ CONFIG_FAT_DEFAULT_IOCHARSET="ascii" CONFIG_FAT_DEFAULT_UTF8=y CONFIG_EXFAT_FS=m CONFIG_EXFAT_DEFAULT_IOCHARSET="utf8" -# CONFIG_NTFS_FS is not set CONFIG_NTFS3_FS=m # CONFIG_NTFS3_64BIT_CLUSTER is not set CONFIG_NTFS3_LZX_XPRESS=y CONFIG_NTFS3_FS_POSIX_ACL=y +# CONFIG_NTFS_FS is not set # end of DOS/FAT/EXFAT/NT Filesystems # @@ -10688,6 +10799,7 @@ CONFIG_EROFS_FS_SECURITY=y CONFIG_EROFS_FS_ZIP=y CONFIG_EROFS_FS_ZIP_LZMA=y CONFIG_EROFS_FS_ZIP_DEFLATE=y +CONFIG_EROFS_FS_ZIP_ZSTD=y CONFIG_EROFS_FS_ONDEMAND=y CONFIG_EROFS_FS_PCPU_KTHREAD=y CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI=y @@ -10837,6 +10949,7 @@ CONFIG_KEYS=y CONFIG_KEYS_REQUEST_CACHE=y CONFIG_PERSISTENT_KEYRINGS=y CONFIG_TRUSTED_KEYS=m +CONFIG_HAVE_TRUSTED_KEYS=y CONFIG_TRUSTED_KEYS_TPM=y CONFIG_TRUSTED_KEYS_TEE=y CONFIG_ENCRYPTED_KEYS=m @@ -10925,6 +11038,7 @@ CONFIG_INIT_STACK_ALL_ZERO=y # CONFIG_GCC_PLUGIN_STACKLEAK is not set CONFIG_INIT_ON_ALLOC_DEFAULT_ON=y # CONFIG_INIT_ON_FREE_DEFAULT_ON is not set +# CONFIG_INIT_MLOCKED_ON_FREE_DEFAULT_ON is not set CONFIG_CC_HAS_ZERO_CALL_USED_REGS=y # CONFIG_ZERO_CALL_USED_REGS is not set # end of Memory initialization @@ -10949,8 +11063,7 @@ CONFIG_ASYNC_XOR=m CONFIG_ASYNC_PQ=m CONFIG_ASYNC_RAID6_RECOV=m CONFIG_CRYPTO=y -CONFIG_NTSYNC=m -CONFIG_ACPI_CALL=m + # # Crypto core or helper # @@ -10958,6 +11071,7 @@ CONFIG_CRYPTO_ALGAPI=y CONFIG_CRYPTO_ALGAPI2=y CONFIG_CRYPTO_AEAD=m CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_SIG=y CONFIG_CRYPTO_SIG2=y CONFIG_CRYPTO_SKCIPHER=y CONFIG_CRYPTO_SKCIPHER2=y @@ -10992,7 +11106,7 @@ CONFIG_CRYPTO_RSA=y CONFIG_CRYPTO_DH=y CONFIG_CRYPTO_DH_RFC7919_GROUPS=y CONFIG_CRYPTO_ECC=y -CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_ECDH=y CONFIG_CRYPTO_ECDSA=y CONFIG_CRYPTO_ECRDSA=m CONFIG_CRYPTO_SM2=m @@ -11090,7 +11204,7 @@ CONFIG_CRYPTO_CRC64_ROCKSOFT=y # Compression # CONFIG_CRYPTO_DEFLATE=m -CONFIG_CRYPTO_LZO=m +CONFIG_CRYPTO_LZO=y CONFIG_CRYPTO_842=m CONFIG_CRYPTO_LZ4=m CONFIG_CRYPTO_LZ4HC=m @@ -11123,7 +11237,6 @@ CONFIG_CRYPTO_USER_API_RNG=m # CONFIG_CRYPTO_USER_API_RNG_CAVP is not set CONFIG_CRYPTO_USER_API_AEAD=m # CONFIG_CRYPTO_USER_API_ENABLE_OBSOLETE is not set -CONFIG_CRYPTO_STATS=y # end of Userspace interface CONFIG_CRYPTO_HASH_INFO=y @@ -11192,6 +11305,7 @@ 
CONFIG_CRYPTO_DEV_QAT_420XX=m CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m CONFIG_CRYPTO_DEV_QAT_C62XVF=m +# CONFIG_CRYPTO_DEV_QAT_ERROR_INJECTION is not set CONFIG_CRYPTO_DEV_IAA_CRYPTO=m # CONFIG_CRYPTO_DEV_IAA_CRYPTO_STATS is not set CONFIG_CRYPTO_DEV_CHELSIO=m @@ -11242,7 +11356,6 @@ CONFIG_GENERIC_NET_UTILS=y CONFIG_CORDIC=m # CONFIG_PRIME_NUMBERS is not set CONFIG_RATIONAL=y -CONFIG_GENERIC_PCI_IOMAP=y CONFIG_GENERIC_IOMAP=y CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y CONFIG_ARCH_HAS_FAST_MULTIPLIER=y @@ -11253,6 +11366,7 @@ CONFIG_ARCH_USE_SYM_ANNOTATIONS=y # CONFIG_CRYPTO_LIB_UTILS=y CONFIG_CRYPTO_LIB_AES=y +CONFIG_CRYPTO_LIB_AESCFB=y CONFIG_CRYPTO_LIB_ARC4=m CONFIG_CRYPTO_LIB_GF128MUL=m CONFIG_CRYPTO_ARCH_HAVE_LIB_BLAKE2S=y @@ -11346,6 +11460,7 @@ CONFIG_ARCH_DMA_ADDR_T_64BIT=y CONFIG_ARCH_HAS_FORCE_DMA_UNENCRYPTED=y CONFIG_SWIOTLB=y # CONFIG_SWIOTLB_DYNAMIC is not set +CONFIG_DMA_NEED_SYNC=y CONFIG_DMA_COHERENT_POOL=y CONFIG_DMA_CMA=y # CONFIG_DMA_NUMA_CMA is not set @@ -11373,15 +11488,16 @@ CONFIG_CLZ_TAB=y CONFIG_IRQ_POLL=y CONFIG_MPILIB=y CONFIG_SIGNATURE=y -CONFIG_DIMLIB=y +CONFIG_DIMLIB=m CONFIG_OID_REGISTRY=y CONFIG_UCS2_STRING=y CONFIG_HAVE_GENERIC_VDSO=y CONFIG_GENERIC_GETTIMEOFDAY=y CONFIG_GENERIC_VDSO_TIME_NS=y +CONFIG_GENERIC_VDSO_OVERFLOW_PROTECT=y CONFIG_FONT_SUPPORT=y CONFIG_FONTS=y -# CONFIG_FONT_8x8 is not set +CONFIG_FONT_8x8=y CONFIG_FONT_8x16=y # CONFIG_FONT_6x11 is not set # CONFIG_FONT_7x14 is not set @@ -11480,7 +11596,7 @@ CONFIG_DEBUG_FS_ALLOW_ALL=y # CONFIG_DEBUG_FS_ALLOW_NONE is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set -CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +CONFIG_ARCH_HAS_UBSAN=y # CONFIG_UBSAN is not set CONFIG_HAVE_ARCH_KCSAN=y CONFIG_HAVE_KCSAN_COMPILER=y @@ -11528,9 +11644,11 @@ CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_DEBUG_PER_CPU_MAPS is not set CONFIG_ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP=y # CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP is not set +# CONFIG_MEM_ALLOC_PROFILING is not set CONFIG_HAVE_ARCH_KASAN=y CONFIG_HAVE_ARCH_KASAN_VMALLOC=y CONFIG_CC_HAS_KASAN_GENERIC=y +CONFIG_CC_HAS_KASAN_SW_TAGS=y CONFIG_CC_HAS_WORKING_NOSANITIZE_ADDRESS=y # CONFIG_KASAN is not set CONFIG_HAVE_ARCH_KFENCE=y @@ -11554,6 +11672,7 @@ CONFIG_PANIC_ON_OOPS_VALUE=0 CONFIG_PANIC_TIMEOUT=0 CONFIG_LOCKUP_DETECTOR=y CONFIG_SOFTLOCKUP_DETECTOR=y +CONFIG_SOFTLOCKUP_DETECTOR_INTR_STORM=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_HAVE_HARDLOCKUP_DETECTOR_BUDDY=y CONFIG_HARDLOCKUP_DETECTOR=y @@ -11711,6 +11830,7 @@ CONFIG_HIST_TRIGGERS=y # CONFIG_RING_BUFFER_BENCHMARK is not set # CONFIG_TRACE_EVAL_MAP_FILE is not set # CONFIG_FTRACE_RECORD_RECURSION is not set +# CONFIG_FTRACE_VALIDATE_RCU_IS_WATCHING is not set # CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_FTRACE_SORT_STARTUP_TEST is not set # CONFIG_RING_BUFFER_STARTUP_TEST is not set @@ -11779,8 +11899,6 @@ CONFIG_RUNTIME_TESTING_MENU=y # CONFIG_ATOMIC64_SELFTEST is not set CONFIG_ASYNC_RAID6_TEST=m # CONFIG_TEST_HEXDUMP is not set -# CONFIG_STRING_SELFTEST is not set -# CONFIG_TEST_STRING_HELPERS is not set # CONFIG_TEST_KSTRTOX is not set # CONFIG_TEST_PRINTF is not set # CONFIG_TEST_SCANF is not set @@ -11814,7 +11932,6 @@ CONFIG_ASYNC_RAID6_TEST=m # CONFIG_TEST_OBJPOOL is not set CONFIG_ARCH_USE_MEMTEST=y CONFIG_MEMTEST=y - # CONFIG_HYPERV_TESTING is not set # end of Kernel Testing and Coverage @@ -11822,4 +11939,4 @@ CONFIG_MEMTEST=y # Rust hacking # # end of Rust hacking -# end of Kernel hacking +# end of Kernel hacking \ No newline at end of file diff --git 
a/patches/cachyos/0001-bore-cachy.patch b/patches/cachyos/0001-bore-cachy.patch index cf2571b..a49989b 100644 --- a/patches/cachyos/0001-bore-cachy.patch +++ b/patches/cachyos/0001-bore-cachy.patch @@ -1,21 +1,22 @@ -From 37fd243d8f075b558f54a36fc85887269310709c Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Tue, 26 Mar 2024 08:11:18 +0100 +From fea4a499d6783faff756fe852c645f90aa73ccf7 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Mon, 15 Jul 2024 13:57:19 +0200 Subject: [PATCH] bore-cachy -Signed-off-by: Piotr Gorski +Signed-off-by: Peter Jung --- include/linux/sched.h | 10 ++ init/Kconfig | 17 +++ - kernel/sched/core.c | 144 +++++++++++++++++++++++++ - kernel/sched/debug.c | 60 ++++++++++- - kernel/sched/fair.c | 231 +++++++++++++++++++++++++++++++++++++--- - kernel/sched/features.h | 4 + - kernel/sched/sched.h | 7 ++ - 7 files changed, 457 insertions(+), 16 deletions(-) + kernel/Kconfig.hz | 16 +++ + kernel/sched/core.c | 143 ++++++++++++++++++ + kernel/sched/debug.c | 60 +++++++- + kernel/sched/fair.c | 310 ++++++++++++++++++++++++++++++++++++---- + kernel/sched/features.h | 22 ++- + kernel/sched/sched.h | 7 + + 8 files changed, 555 insertions(+), 30 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h -index ffe8f618a..0ab0b0424 100644 +index a5f4b48fca18..df62c56b13ae 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -547,6 +547,16 @@ struct sched_entity { @@ -36,10 +37,10 @@ index ffe8f618a..0ab0b0424 100644 u64 slice; diff --git a/init/Kconfig b/init/Kconfig -index 9ea39297f..f9bb5401f 100644 +index 3ba6142f2f42..2966dec64df7 100644 --- a/init/Kconfig +++ b/init/Kconfig -@@ -1299,6 +1299,23 @@ config CHECKPOINT_RESTORE +@@ -1303,6 +1303,23 @@ config CHECKPOINT_RESTORE If unsure, say N here. @@ -63,16 +64,41 @@ index 9ea39297f..f9bb5401f 100644 config SCHED_AUTOGROUP bool "Automatic process group scheduling" select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 0f78364efd4f..b50189ee5b93 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -79,5 +79,21 @@ config HZ + default 750 if HZ_750 + default 1000 if HZ_1000 + ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = max(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. 
++ + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index 9116bcc90..fc3d7b48e 100644 +index 59ce0841eb1f..c5d10b464779 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c -@@ -4507,6 +4507,139 @@ int wake_up_state(struct task_struct *p, unsigned int state) +@@ -4515,6 +4515,138 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +#ifdef CONFIG_SCHED_BORE -+extern bool sched_bore; +extern u8 sched_burst_fork_atavistic; +extern uint sched_burst_cache_lifetime; + @@ -85,7 +111,7 @@ index 9116bcc90..fc3d7b48e 100644 + init_task.se.child_burst_last_cached = 0; +} + -+void inline sched_fork_bore(struct task_struct *p) { ++inline void sched_fork_bore(struct task_struct *p) { + p->se.burst_time = 0; + p->se.curr_burst_penalty = 0; + p->se.burst_score = 0; @@ -207,7 +233,7 @@ index 9116bcc90..fc3d7b48e 100644 /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. -@@ -4523,6 +4656,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) +@@ -4531,6 +4663,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; @@ -217,7 +243,7 @@ index 9116bcc90..fc3d7b48e 100644 p->se.vlag = 0; p->se.slice = sysctl_sched_base_slice; INIT_LIST_HEAD(&p->se.group_node); -@@ -4839,6 +4975,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) +@@ -4846,6 +4981,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) void sched_post_fork(struct task_struct *p) { @@ -227,20 +253,20 @@ index 9116bcc90..fc3d7b48e 100644 uclamp_post_fork(p); } -@@ -9910,6 +10049,11 @@ void __init sched_init(void) +@@ -9933,6 +10071,11 @@ void __init sched_init(void) BUG_ON(&dl_sched_class != &stop_sched_class + 1); #endif +#ifdef CONFIG_SCHED_BORE + sched_init_bore(); -+ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.0.3 by Masahito Suzuki"); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.2.5 by Masahito Suzuki"); +#endif // CONFIG_SCHED_BORE + wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 8d5d98a58..b17861261 100644 +index c1eb9a1afd13..e2da8d773877 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { @@ -326,7 +352,7 @@ index 8d5d98a58..b17861261 100644 debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); -@@ -595,6 +647,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +@@ -596,6 +648,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); @@ -336,7 +362,7 @@ index 8d5d98a58..b17861261 100644 #ifdef CONFIG_NUMA_BALANCING SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); #endif -@@ -1068,6 +1123,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, +@@ -1069,6 +1124,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.load.weight); #ifdef CONFIG_SMP @@ -347,7 +373,7 @@ index 8d5d98a58..b17861261 100644 P(se.avg.runnable_sum); 
P(se.avg.util_sum); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index fc0a9de42..ae55f46a8 100644 +index c2bb8eb1d6ba..9e8b220f27e6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -19,6 +19,9 @@ @@ -360,7 +386,7 @@ index fc0a9de42..ae55f46a8 100644 */ #include #include -@@ -64,28 +67,125 @@ +@@ -64,28 +67,126 @@ * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus * @@ -388,7 +414,7 @@ index fc0a9de42..ae55f46a8 100644 +#ifdef CONFIG_SCHED_BORE +unsigned int sysctl_sched_base_slice = 1000000000ULL / HZ; +static unsigned int configured_sched_base_slice = 1000000000ULL / HZ; -+unsigned int sysctl_sched_min_base_slice = 2000000ULL; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; +#else // !CONFIG_SCHED_BORE unsigned int sysctl_sched_base_slice = 750000ULL; static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; @@ -409,7 +435,8 @@ index fc0a9de42..ae55f46a8 100644 +u8 __read_mostly sched_burst_penalty_offset = 22; +uint __read_mostly sched_burst_penalty_scale = 1280; +uint __read_mostly sched_burst_cache_lifetime = 60000000; -+static int __maybe_unused thirty_two = 32; ++uint __read_mostly sched_deadline_boost_mask = 0x81; // ENQUEUE_INITIAL | ENQUEUE_WAKEUP ++uint __read_mostly sched_deadline_preserve_mask = 0x42; // ENQUEUE_RESTORE | ENQUEUE_MIGRATED +static int __maybe_unused sixty_four = 64; +static int __maybe_unused maxval_12_bits = 4095; + @@ -495,9 +522,9 @@ index fc0a9de42..ae55f46a8 100644 +} +#endif // CONFIG_SCHED_BORE - int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) -@@ -136,12 +236,8 @@ int __weak arch_asym_cpu_priority(int cpu) + { +@@ -130,12 +231,8 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ @@ -510,7 +537,7 @@ index fc0a9de42..ae55f46a8 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ -@@ -150,6 +246,69 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; +@@ -144,6 +241,83 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { @@ -576,11 +603,25 @@ index fc0a9de42..ae55f46a8 100644 + .mode = 0644, + .proc_handler = proc_douintvec, + }, ++ { ++ .procname = "sched_deadline_boost_mask", ++ .data = &sched_deadline_boost_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_deadline_preserve_mask", ++ .data = &sched_deadline_preserve_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, +#endif // CONFIG_SCHED_BORE #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", -@@ -208,6 +367,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) +@@ -201,6 +375,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) * * This idea comes from the SD scheduler of Con Kolivas: */ @@ -594,7 +635,7 @@ index fc0a9de42..ae55f46a8 100644 static unsigned int get_update_sysctl_factor(void) { unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); -@@ -238,6 +404,7 @@ static void update_sysctl(void) +@@ -231,6 +412,7 @@ static void update_sysctl(void) SET_SYSCTL(sched_base_slice); #undef SET_SYSCTL } @@ -602,17 +643,93 @@ index fc0a9de42..ae55f46a8 100644 void __init sched_init_granularity(void) { -@@ -717,6 +884,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) - lag = avg_vruntime(cfs_rq) - se->vruntime; +@@ -708,6 +890,9 @@ static s64 entity_lag(u64 avruntime, struct sched_entity *se) + vlag = avruntime - se->vruntime; limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); +#ifdef CONFIG_SCHED_BORE + limit >>= 1; +#endif // CONFIG_SCHED_BORE - se->vlag = clamp(lag, -limit, limit); + + return clamp(vlag, -limit, limit); + } +@@ -868,6 +1053,39 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) + return __node_2_se(left); } -@@ -968,6 +1138,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) ++static inline bool pick_curr(struct cfs_rq *cfs_rq, ++ struct sched_entity *curr, struct sched_entity *wakee) ++{ ++ /* ++ * Nothing to preserve... ++ */ ++ if (!curr || !sched_feat(RESPECT_SLICE)) ++ return false; ++ ++ /* ++ * Allow preemption at the 0-lag point -- even if not all of the slice ++ * is consumed. Note: placement of positive lag can push V left and render ++ * @curr instantly ineligible irrespective the time on-cpu. ++ */ ++ if (sched_feat(RUN_TO_PARITY) && !entity_eligible(cfs_rq, curr)) ++ return false; ++ ++ /* ++ * Don't preserve @curr when the @wakee has a shorter slice and earlier ++ * deadline. IOW, explicitly allow preemption. ++ */ ++ if (sched_feat(PREEMPT_SHORT) && wakee && ++ wakee->slice < curr->slice && ++ (s64)(wakee->deadline - curr->deadline) < 0) ++ return false; ++ ++ /* ++ * Preserve @curr to allow it to finish its first slice. ++ * See the HACK in set_next_entity(). ++ */ ++ return curr->vlag == curr->deadline; ++} ++ + /* + * Earliest Eligible Virtual Deadline First + * +@@ -887,28 +1105,27 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) + * + * Which allows tree pruning through eligibility. 
+ */ +-static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) ++static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *wakee) + { + struct rb_node *node = cfs_rq->tasks_timeline.rb_root.rb_node; + struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *curr = cfs_rq->curr; + struct sched_entity *best = NULL; + ++ if (curr && !curr->on_rq) ++ curr = NULL; ++ + /* + * We can safely skip eligibility check if there is only one entity + * in this cfs_rq, saving some cycles. + */ + if (cfs_rq->nr_running == 1) +- return curr && curr->on_rq ? curr : se; +- +- if (curr && (!curr->on_rq || !entity_eligible(cfs_rq, curr))) +- curr = NULL; ++ return curr ?: se; + + /* +- * Once selected, run a task until it either becomes non-eligible or +- * until it gets a new slice. See the HACK in set_next_entity(). ++ * Preserve @curr to let it finish its slice. + */ +- if (sched_feat(RUN_TO_PARITY) && curr && curr->vlag == curr->deadline) ++ if (pick_curr(cfs_rq, curr, wakee)) + return curr; + + /* Pick the leftmost entity if it's eligible */ +@@ -967,6 +1184,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) * Scheduling class statistics methods: */ #ifdef CONFIG_SMP @@ -620,7 +737,7 @@ index fc0a9de42..ae55f46a8 100644 int sched_update_scaling(void) { unsigned int factor = get_update_sysctl_factor(); -@@ -979,6 +1150,7 @@ int sched_update_scaling(void) +@@ -978,6 +1196,7 @@ int sched_update_scaling(void) return 0; } @@ -628,7 +745,7 @@ index fc0a9de42..ae55f46a8 100644 #endif #endif -@@ -1178,7 +1350,13 @@ static void update_curr(struct cfs_rq *cfs_rq) +@@ -1178,7 +1397,13 @@ static void update_curr(struct cfs_rq *cfs_rq) if (unlikely(delta_exec <= 0)) return; @@ -642,7 +759,19 @@ index fc0a9de42..ae55f46a8 100644 update_deadline(cfs_rq, curr); update_min_vruntime(cfs_rq); -@@ -5184,6 +5362,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5193,6 +5418,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + s64 lag = 0; + + se->slice = sysctl_sched_base_slice; ++#ifdef CONFIG_SCHED_BORE ++ if (flags & ~sched_deadline_boost_mask & sched_deadline_preserve_mask) ++ vslice = se->deadline - se->vruntime; ++ else ++#endif // CONFIG_SCHED_BORE + vslice = calc_delta_fair(se->slice, se); + + /* +@@ -5203,6 +5433,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * * EEVDF: placement strategy #1 / #2 */ @@ -652,7 +781,28 @@ index fc0a9de42..ae55f46a8 100644 if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { struct sched_entity *curr = cfs_rq->curr; unsigned long load; -@@ -6816,6 +6997,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +@@ -5278,7 +5511,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + * on average, halfway through their slice, as such start tasks + * off with half a slice to ease into the competition. 
+ */ ++#if !defined(CONFIG_SCHED_BORE) + if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) ++#else // CONFIG_SCHED_BORE ++ if (flags & sched_deadline_boost_mask) ++#endif // CONFIG_SCHED_BORE + vslice /= 2; + + /* +@@ -5492,7 +5729,7 @@ pick_next_entity(struct cfs_rq *cfs_rq) + cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) + return cfs_rq->next; + +- return pick_eevdf(cfs_rq); ++ return pick_eevdf(cfs_rq, NULL); + } + + static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -6860,6 +7097,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) bool was_sched_idle = sched_idle_rq(rq); util_est_dequeue(&rq->cfs, p); @@ -667,7 +817,19 @@ index fc0a9de42..ae55f46a8 100644 for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); -@@ -8565,16 +8754,25 @@ static void yield_task_fair(struct rq *rq) +@@ -8425,10 +8670,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + +- /* +- * XXX pick_eevdf(cfs_rq) != se ? +- */ +- if (pick_eevdf(cfs_rq) == pse) ++ if (pick_eevdf(cfs_rq, pse) == pse) + goto preempt; + + return; +@@ -8646,16 +8888,25 @@ static void yield_task_fair(struct rq *rq) /* * Are we the only task in the tree? */ @@ -693,7 +855,7 @@ index fc0a9de42..ae55f46a8 100644 /* * Tell update_rq_clock() that we've just updated, * so we don't do microscopic update in schedule() -@@ -12664,6 +12862,9 @@ static void task_fork_fair(struct task_struct *p) +@@ -12723,6 +12974,9 @@ static void task_fork_fair(struct task_struct *p) curr = cfs_rq->curr; if (curr) update_curr(cfs_rq); @@ -704,26 +866,44 @@ index fc0a9de42..ae55f46a8 100644 rq_unlock(rq, &rf); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index 143f55df8..3f0fe409f 100644 +index 143f55df890b..3aad8900c35e 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h -@@ -6,7 +6,11 @@ +@@ -5,8 +5,28 @@ + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. */ SCHED_FEAT(PLACE_LAG, true) ++/* ++ * Give new tasks half a slice to ease into the competition. ++ */ ++#if !defined(CONFIG_SCHED_BORE) SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) -+#ifdef CONFIG_SCHED_BORE -+SCHED_FEAT(RUN_TO_PARITY, false) -+#else // !CONFIG_SCHED_BORE - SCHED_FEAT(RUN_TO_PARITY, true) +-SCHED_FEAT(RUN_TO_PARITY, true) +#endif // CONFIG_SCHED_BORE ++/* ++ * Inhibit (wakeup) preemption until the current task has exhausted its slice. ++ */ ++#ifdef CONFIG_SCHED_BORE ++SCHED_FEAT(RESPECT_SLICE, false) ++#else // !CONFIG_SCHED_BORE ++SCHED_FEAT(RESPECT_SLICE, true) ++#endif // CONFIG_SCHED_BORE ++/* ++ * Relax RESPECT_SLICE to allow preemption once current has reached 0-lag. 
++ */ ++SCHED_FEAT(RUN_TO_PARITY, false) ++/* ++ * Allow tasks with a shorter slice to disregard RESPECT_SLICE ++ */ ++SCHED_FEAT(PREEMPT_SHORT, true) /* * Prefer to schedule the task we woke last (assuming it failed diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index ed5c758c7..9d62372ae 100644 +index 10c1caff5e06..5d845dbd0cf9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -1965,7 +1965,11 @@ static inline void dirty_sched_domain_sysctl(int cpu) +@@ -1969,7 +1969,11 @@ static inline void dirty_sched_domain_sysctl(int cpu) } #endif @@ -735,7 +915,7 @@ index ed5c758c7..9d62372ae 100644 static inline const struct cpumask *task_user_cpus(struct task_struct *p) { -@@ -2552,6 +2556,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; +@@ -2554,6 +2558,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; extern const_debug unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_base_slice; @@ -746,4 +926,4 @@ index ed5c758c7..9d62372ae 100644 #ifdef CONFIG_SCHED_DEBUG extern int sysctl_resched_latency_warn_ms; -- -2.43.0.232.ge79552d197 +2.46.0.rc0 diff --git a/patches/cachyos/0001-cachyos-base-all.patch b/patches/cachyos/0001-cachyos-base-all.patch index 84d0f67..b5b57c9 100644 --- a/patches/cachyos/0001-cachyos-base-all.patch +++ b/patches/cachyos/0001-cachyos-base-all.patch @@ -1,1353 +1,75 @@ -From 2b7dc22b0a950292985c4d5118c5eeaa51ea2918 Mon Sep 17 00:00:00 2001 +From 35b09dfe053ff6308ab58d44175727d0d20f4ce0 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 3 Apr 2024 17:06:09 +0200 -Subject: [PATCH 1/8] aex-xts +Date: Mon, 15 Jul 2024 13:23:07 +0200 +Subject: [PATCH 01/11] amd-pstate Signed-off-by: Peter Jung --- - arch/x86/Kconfig.assembler | 10 + - arch/x86/crypto/Makefile | 3 +- - arch/x86/crypto/aes-xts-avx-x86_64.S | 838 +++++++++++++++++++++++++++ - arch/x86/crypto/aesni-intel_glue.c | 270 ++++++++- - 4 files changed, 1118 insertions(+), 3 deletions(-) - create mode 100644 arch/x86/crypto/aes-xts-avx-x86_64.S + Documentation/admin-guide/pm/amd-pstate.rst | 18 +- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/msr-index.h | 2 + + arch/x86/kernel/cpu/scattered.c | 1 + + drivers/cpufreq/Kconfig.x86 | 1 + + drivers/cpufreq/acpi-cpufreq.c | 3 +- + drivers/cpufreq/amd-pstate-ut.c | 12 +- + drivers/cpufreq/amd-pstate.c | 350 ++++++++++++++------ + drivers/cpufreq/amd-pstate.h | 2 + + drivers/cpufreq/cpufreq.c | 11 +- + 10 files changed, 281 insertions(+), 120 deletions(-) -diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler -index 8ad41da301e5..59aedf32c4ea 100644 ---- a/arch/x86/Kconfig.assembler -+++ b/arch/x86/Kconfig.assembler -@@ -25,6 +25,16 @@ config AS_GFNI - help - Supported by binutils >= 2.30 and LLVM integrated assembler - -+config AS_VAES -+ def_bool $(as-instr,vaesenc %ymm0$(comma)%ymm1$(comma)%ymm2) -+ help -+ Supported by binutils >= 2.30 and LLVM integrated assembler -+ -+config AS_VPCLMULQDQ -+ def_bool $(as-instr,vpclmulqdq \$0x10$(comma)%ymm0$(comma)%ymm1$(comma)%ymm2) -+ help -+ Supported by binutils >= 2.30 and LLVM integrated assembler -+ - config AS_WRUSS - def_bool $(as-instr,wrussq %rax$(comma)(%rbx)) - help -diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile -index 9aa46093c91b..9c5ce5613738 100644 ---- a/arch/x86/crypto/Makefile -+++ b/arch/x86/crypto/Makefile -@@ -48,7 +48,8 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o - - obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o - aesni-intel-y := aesni-intel_asm.o 
aesni-intel_glue.o --aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o aes_ctrby8_avx-x86_64.o -+aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \ -+ aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o - - obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o - sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o -diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S -new file mode 100644 -index 000000000000..b8005d0205f8 ---- /dev/null -+++ b/arch/x86/crypto/aes-xts-avx-x86_64.S -@@ -0,0 +1,838 @@ -+/* SPDX-License-Identifier: GPL-2.0-or-later */ -+/* -+ * AES-XTS for modern x86_64 CPUs -+ * -+ * Copyright 2024 Google LLC -+ * -+ * Author: Eric Biggers -+ */ -+ -+/* -+ * This file implements AES-XTS for modern x86_64 CPUs. To handle the -+ * complexities of coding for x86 SIMD, e.g. where every vector length needs -+ * different code, it uses a macro to generate several implementations that -+ * share similar source code but are targeted at different CPUs, listed below: -+ * -+ * AES-NI + AVX -+ * - 128-bit vectors (1 AES block per vector) -+ * - VEX-coded instructions -+ * - xmm0-xmm15 -+ * - This is for older CPUs that lack VAES but do have AVX. -+ * -+ * VAES + VPCLMULQDQ + AVX2 -+ * - 256-bit vectors (2 AES blocks per vector) -+ * - VEX-coded instructions -+ * - ymm0-ymm15 -+ * - This is for CPUs that have VAES but lack AVX512 or AVX10, -+ * e.g. Intel's Alder Lake and AMD's Zen 3. -+ * -+ * VAES + VPCLMULQDQ + AVX10/256 + BMI2 -+ * - 256-bit vectors (2 AES blocks per vector) -+ * - EVEX-coded instructions -+ * - ymm0-ymm31 -+ * - This is for CPUs that have AVX512 but where using zmm registers causes -+ * downclocking, and for CPUs that have AVX10/256 but not AVX10/512. -+ * - By "AVX10/256" we really mean (AVX512BW + AVX512VL) || AVX10/256. -+ * To avoid confusion with 512-bit, we just write AVX10/256. -+ * -+ * VAES + VPCLMULQDQ + AVX10/512 + BMI2 -+ * - Same as the previous one, but upgrades to 512-bit vectors -+ * (4 AES blocks per vector) in zmm0-zmm31. -+ * - This is for CPUs that have good AVX512 or AVX10/512 support. -+ * -+ * This file doesn't have an implementation for AES-NI alone (without AVX), as -+ * the lack of VEX would make all the assembly code different. -+ * -+ * When we use VAES, we also use VPCLMULQDQ to parallelize the computation of -+ * the XTS tweaks. This avoids a bottleneck. Currently there don't seem to be -+ * any CPUs that support VAES but not VPCLMULQDQ. If that changes, we might -+ * need to start also providing an implementation using VAES alone. -+ * -+ * The AES-XTS implementations in this file support everything required by the -+ * crypto API, including support for arbitrary input lengths and multi-part -+ * processing. However, they are most heavily optimized for the common case of -+ * power-of-2 length inputs that are processed in a single part (disk sectors). -+ */ -+ -+#include -+#include -+ -+.section .rodata -+.p2align 4 -+.Lgf_poly: -+ // The low 64 bits of this value represent the polynomial x^7 + x^2 + x -+ // + 1. It is the value that must be XOR'd into the low 64 bits of the -+ // tweak each time a 1 is carried out of the high 64 bits. -+ // -+ // The high 64 bits of this value is just the internal carry bit that -+ // exists when there's a carry out of the low 64 bits of the tweak. 
-+ .quad 0x87, 1 -+ -+ // This table contains constants for vpshufb and vpblendvb, used to -+ // handle variable byte shifts and blending during ciphertext stealing -+ // on CPUs that don't support AVX10-style masking. -+.Lcts_permute_table: -+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 -+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 -+ .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 -+ .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f -+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 -+ .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 -+.text -+ -+// Function parameters -+.set KEY, %rdi // Initially points to crypto_aes_ctx, then is -+ // advanced to point directly to the round keys -+.set SRC, %rsi // Pointer to next source data -+.set DST, %rdx // Pointer to next destination data -+.set LEN, %rcx // Remaining length in bytes -+.set TWEAK, %r8 // Pointer to next tweak -+ -+// %r9d holds the AES key length in bytes. -+.set KEYLEN, %r9d -+ -+// %rax and %r10-r11 are available as temporaries. -+ -+.macro _define_Vi i -+.if VL == 16 -+ .set V\i, %xmm\i -+.elseif VL == 32 -+ .set V\i, %ymm\i -+.elseif VL == 64 -+ .set V\i, %zmm\i -+.else -+ .error "Unsupported Vector Length (VL)" -+.endif -+.endm -+ -+.macro _define_aliases -+ // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers -+ // are available, that map to the xmm, ymm, or zmm registers according -+ // to the selected Vector Length (VL). -+ _define_Vi 0 -+ _define_Vi 1 -+ _define_Vi 2 -+ _define_Vi 3 -+ _define_Vi 4 -+ _define_Vi 5 -+ _define_Vi 6 -+ _define_Vi 7 -+ _define_Vi 8 -+ _define_Vi 9 -+ _define_Vi 10 -+ _define_Vi 11 -+ _define_Vi 12 -+ _define_Vi 13 -+ _define_Vi 14 -+ _define_Vi 15 -+.if USE_AVX10 -+ _define_Vi 16 -+ _define_Vi 17 -+ _define_Vi 18 -+ _define_Vi 19 -+ _define_Vi 20 -+ _define_Vi 21 -+ _define_Vi 22 -+ _define_Vi 23 -+ _define_Vi 24 -+ _define_Vi 25 -+ _define_Vi 26 -+ _define_Vi 27 -+ _define_Vi 28 -+ _define_Vi 29 -+ _define_Vi 30 -+ _define_Vi 31 -+.endif -+ -+ // V0-V3 hold the data blocks during the main loop, or temporary values -+ // otherwise. V4-V5 hold temporary values. -+ -+ // V6-V9 hold XTS tweaks. Each 128-bit lane holds one tweak. -+ .set TWEAK0_XMM, %xmm6 -+ .set TWEAK0, V6 -+ .set TWEAK1_XMM, %xmm7 -+ .set TWEAK1, V7 -+ .set TWEAK2, V8 -+ .set TWEAK3, V9 -+ -+ // V10-V13 are used for computing the next values of TWEAK[0-3]. -+ .set NEXT_TWEAK0, V10 -+ .set NEXT_TWEAK1, V11 -+ .set NEXT_TWEAK2, V12 -+ .set NEXT_TWEAK3, V13 -+ -+ // V14 holds the constant from .Lgf_poly, copied to all 128-bit lanes. -+ .set GF_POLY_XMM, %xmm14 -+ .set GF_POLY, V14 -+ -+ // V15 holds the first AES round key, copied to all 128-bit lanes. -+ .set KEY0_XMM, %xmm15 -+ .set KEY0, V15 -+ -+ // If 32 SIMD registers are available, then V16-V29 hold the remaining -+ // AES round keys, copied to all 128-bit lanes. 
-+.if USE_AVX10 -+ .set KEY1_XMM, %xmm16 -+ .set KEY1, V16 -+ .set KEY2_XMM, %xmm17 -+ .set KEY2, V17 -+ .set KEY3_XMM, %xmm18 -+ .set KEY3, V18 -+ .set KEY4_XMM, %xmm19 -+ .set KEY4, V19 -+ .set KEY5_XMM, %xmm20 -+ .set KEY5, V20 -+ .set KEY6_XMM, %xmm21 -+ .set KEY6, V21 -+ .set KEY7_XMM, %xmm22 -+ .set KEY7, V22 -+ .set KEY8_XMM, %xmm23 -+ .set KEY8, V23 -+ .set KEY9_XMM, %xmm24 -+ .set KEY9, V24 -+ .set KEY10_XMM, %xmm25 -+ .set KEY10, V25 -+ .set KEY11_XMM, %xmm26 -+ .set KEY11, V26 -+ .set KEY12_XMM, %xmm27 -+ .set KEY12, V27 -+ .set KEY13_XMM, %xmm28 -+ .set KEY13, V28 -+ .set KEY14_XMM, %xmm29 -+ .set KEY14, V29 -+.endif -+ // V30-V31 are currently unused. -+.endm -+ -+// Move a vector between memory and a register. -+.macro _vmovdqu src, dst -+.if VL < 64 -+ vmovdqu \src, \dst -+.else -+ vmovdqu8 \src, \dst -+.endif -+.endm -+ -+// Broadcast a 128-bit value into a vector. -+.macro _vbroadcast128 src, dst -+.if VL == 16 && !USE_AVX10 -+ vmovdqu \src, \dst -+.elseif VL == 32 && !USE_AVX10 -+ vbroadcasti128 \src, \dst -+.else -+ vbroadcasti32x4 \src, \dst -+.endif -+.endm -+ -+// XOR two vectors together. -+.macro _vpxor src1, src2, dst -+.if USE_AVX10 -+ vpxord \src1, \src2, \dst -+.else -+ vpxor \src1, \src2, \dst -+.endif -+.endm -+ -+// XOR three vectors together. -+.macro _xor3 src1, src2, src3_and_dst -+.if USE_AVX10 -+ // vpternlogd with immediate 0x96 is a three-argument XOR. -+ vpternlogd $0x96, \src1, \src2, \src3_and_dst -+.else -+ vpxor \src1, \src3_and_dst, \src3_and_dst -+ vpxor \src2, \src3_and_dst, \src3_and_dst -+.endif -+.endm -+ -+// Given a 128-bit XTS tweak in the xmm register \src, compute the next tweak -+// (by multiplying by the polynomial 'x') and write it to \dst. -+.macro _next_tweak src, tmp, dst -+ vpshufd $0x13, \src, \tmp -+ vpaddq \src, \src, \dst -+ vpsrad $31, \tmp, \tmp -+ vpand GF_POLY_XMM, \tmp, \tmp -+ vpxor \tmp, \dst, \dst -+.endm -+ -+// Given the XTS tweak(s) in the vector \src, compute the next vector of -+// tweak(s) (by multiplying by the polynomial 'x^(VL/16)') and write it to \dst. -+// -+// If VL > 16, then there are multiple tweaks, and we use vpclmulqdq to compute -+// all tweaks in the vector in parallel. If VL=16, we just do the regular -+// computation without vpclmulqdq, as it's the faster method for a single tweak. -+.macro _next_tweakvec src, tmp1, tmp2, dst -+.if VL == 16 -+ _next_tweak \src, \tmp1, \dst -+.else -+ vpsrlq $64 - VL/16, \src, \tmp1 -+ vpclmulqdq $0x01, GF_POLY, \tmp1, \tmp2 -+ vpslldq $8, \tmp1, \tmp1 -+ vpsllq $VL/16, \src, \dst -+ _xor3 \tmp1, \tmp2, \dst -+.endif -+.endm -+ -+// Given the first XTS tweak at (TWEAK), compute the first set of tweaks and -+// store them in the vector registers TWEAK0-TWEAK3. Clobbers V0-V5. -+.macro _compute_first_set_of_tweaks -+ vmovdqu (TWEAK), TWEAK0_XMM -+ _vbroadcast128 .Lgf_poly(%rip), GF_POLY -+.if VL == 16 -+ // With VL=16, multiplying by x serially is fastest. -+ _next_tweak TWEAK0, %xmm0, TWEAK1 -+ _next_tweak TWEAK1, %xmm0, TWEAK2 -+ _next_tweak TWEAK2, %xmm0, TWEAK3 -+.else -+.if VL == 32 -+ // Compute the second block of TWEAK0. -+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1 -+ vinserti128 $1, %xmm1, TWEAK0, TWEAK0 -+.elseif VL == 64 -+ // Compute the remaining blocks of TWEAK0. -+ _next_tweak TWEAK0_XMM, %xmm0, %xmm1 -+ _next_tweak %xmm1, %xmm0, %xmm2 -+ _next_tweak %xmm2, %xmm0, %xmm3 -+ vinserti32x4 $1, %xmm1, TWEAK0, TWEAK0 -+ vinserti32x4 $2, %xmm2, TWEAK0, TWEAK0 -+ vinserti32x4 $3, %xmm3, TWEAK0, TWEAK0 -+.endif -+ // Compute TWEAK[1-3] from TWEAK0. 
-+ vpsrlq $64 - 1*VL/16, TWEAK0, V0 -+ vpsrlq $64 - 2*VL/16, TWEAK0, V2 -+ vpsrlq $64 - 3*VL/16, TWEAK0, V4 -+ vpclmulqdq $0x01, GF_POLY, V0, V1 -+ vpclmulqdq $0x01, GF_POLY, V2, V3 -+ vpclmulqdq $0x01, GF_POLY, V4, V5 -+ vpslldq $8, V0, V0 -+ vpslldq $8, V2, V2 -+ vpslldq $8, V4, V4 -+ vpsllq $1*VL/16, TWEAK0, TWEAK1 -+ vpsllq $2*VL/16, TWEAK0, TWEAK2 -+ vpsllq $3*VL/16, TWEAK0, TWEAK3 -+.if USE_AVX10 -+ vpternlogd $0x96, V0, V1, TWEAK1 -+ vpternlogd $0x96, V2, V3, TWEAK2 -+ vpternlogd $0x96, V4, V5, TWEAK3 -+.else -+ vpxor V0, TWEAK1, TWEAK1 -+ vpxor V2, TWEAK2, TWEAK2 -+ vpxor V4, TWEAK3, TWEAK3 -+ vpxor V1, TWEAK1, TWEAK1 -+ vpxor V3, TWEAK2, TWEAK2 -+ vpxor V5, TWEAK3, TWEAK3 -+.endif -+.endif -+.endm -+ -+// Do one step in computing the next set of tweaks using the method of just -+// multiplying by x repeatedly (the same method _next_tweak uses). -+.macro _tweak_step_mulx i -+.if \i == 0 -+ .set PREV_TWEAK, TWEAK3 -+ .set NEXT_TWEAK, NEXT_TWEAK0 -+.elseif \i == 5 -+ .set PREV_TWEAK, NEXT_TWEAK0 -+ .set NEXT_TWEAK, NEXT_TWEAK1 -+.elseif \i == 10 -+ .set PREV_TWEAK, NEXT_TWEAK1 -+ .set NEXT_TWEAK, NEXT_TWEAK2 -+.elseif \i == 15 -+ .set PREV_TWEAK, NEXT_TWEAK2 -+ .set NEXT_TWEAK, NEXT_TWEAK3 -+.endif -+.if \i < 20 && \i % 5 == 0 -+ vpshufd $0x13, PREV_TWEAK, V5 -+.elseif \i < 20 && \i % 5 == 1 -+ vpaddq PREV_TWEAK, PREV_TWEAK, NEXT_TWEAK -+.elseif \i < 20 && \i % 5 == 2 -+ vpsrad $31, V5, V5 -+.elseif \i < 20 && \i % 5 == 3 -+ vpand GF_POLY, V5, V5 -+.elseif \i < 20 && \i % 5 == 4 -+ vpxor V5, NEXT_TWEAK, NEXT_TWEAK -+.elseif \i == 1000 -+ vmovdqa NEXT_TWEAK0, TWEAK0 -+ vmovdqa NEXT_TWEAK1, TWEAK1 -+ vmovdqa NEXT_TWEAK2, TWEAK2 -+ vmovdqa NEXT_TWEAK3, TWEAK3 -+.endif -+.endm -+ -+// Do one step in computing the next set of tweaks using the VPCLMULQDQ method -+// (the same method _next_tweakvec uses for VL > 16). This means multiplying -+// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8 -+// when VL > 16 (which it is here), the needed shift amounts are byte-aligned, -+// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. -+.macro _tweak_step_pclmul i -+.if \i == 2 -+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 -+.elseif \i == 4 -+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK1, NEXT_TWEAK1 -+.elseif \i == 6 -+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK2, NEXT_TWEAK2 -+.elseif \i == 8 -+ vpsrldq $(128 - 4*VL/16) / 8, TWEAK3, NEXT_TWEAK3 -+.elseif \i == 10 -+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK0, NEXT_TWEAK0 -+.elseif \i == 12 -+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK1, NEXT_TWEAK1 -+.elseif \i == 14 -+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK2, NEXT_TWEAK2 -+.elseif \i == 16 -+ vpclmulqdq $0x00, GF_POLY, NEXT_TWEAK3, NEXT_TWEAK3 -+.elseif \i == 1000 -+ vpslldq $(4*VL/16) / 8, TWEAK0, TWEAK0 -+ vpslldq $(4*VL/16) / 8, TWEAK1, TWEAK1 -+ vpslldq $(4*VL/16) / 8, TWEAK2, TWEAK2 -+ vpslldq $(4*VL/16) / 8, TWEAK3, TWEAK3 -+ _vpxor NEXT_TWEAK0, TWEAK0, TWEAK0 -+ _vpxor NEXT_TWEAK1, TWEAK1, TWEAK1 -+ _vpxor NEXT_TWEAK2, TWEAK2, TWEAK2 -+ _vpxor NEXT_TWEAK3, TWEAK3, TWEAK3 -+.endif -+.endm -+ -+// _tweak_step does one step of the computation of the next set of tweaks from -+// TWEAK[0-3]. To complete all steps, this must be invoked with \i values 0 -+// through at least 19, then 1000 which signals the last step. -+// -+// This is used to interleave the computation of the next set of tweaks with the -+// AES en/decryptions, which increases performance in some cases. 
-+.macro _tweak_step i -+.if VL == 16 -+ _tweak_step_mulx \i -+.else -+ _tweak_step_pclmul \i -+.endif -+.endm -+ -+// Load the round keys: just the first one if !USE_AVX10, otherwise all of them. -+.macro _load_round_keys -+ _vbroadcast128 0*16(KEY), KEY0 -+.if USE_AVX10 -+ _vbroadcast128 1*16(KEY), KEY1 -+ _vbroadcast128 2*16(KEY), KEY2 -+ _vbroadcast128 3*16(KEY), KEY3 -+ _vbroadcast128 4*16(KEY), KEY4 -+ _vbroadcast128 5*16(KEY), KEY5 -+ _vbroadcast128 6*16(KEY), KEY6 -+ _vbroadcast128 7*16(KEY), KEY7 -+ _vbroadcast128 8*16(KEY), KEY8 -+ _vbroadcast128 9*16(KEY), KEY9 -+ _vbroadcast128 10*16(KEY), KEY10 -+ // Note: if it's AES-128 or AES-192, the last several round keys won't -+ // be used. We do the loads anyway to save a conditional jump. -+ _vbroadcast128 11*16(KEY), KEY11 -+ _vbroadcast128 12*16(KEY), KEY12 -+ _vbroadcast128 13*16(KEY), KEY13 -+ _vbroadcast128 14*16(KEY), KEY14 -+.endif -+.endm -+ -+// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0) -+// on the block(s) in \data using the round key(s) in \key. The register length -+// determines the number of AES blocks en/decrypted. -+.macro _vaes enc, last, key, data -+.if \enc -+.if \last -+ vaesenclast \key, \data, \data -+.else -+ vaesenc \key, \data, \data -+.endif -+.else -+.if \last -+ vaesdeclast \key, \data, \data -+.else -+ vaesdec \key, \data, \data -+.endif -+.endif -+.endm -+ -+// Do a single round of AES en/decryption on the block(s) in \data, using the -+// same key for all block(s). The round key is loaded from the appropriate -+// register or memory location for round \i. May clobber V4. -+.macro _vaes_1x enc, last, i, xmm_suffix, data -+.if USE_AVX10 -+ _vaes \enc, \last, KEY\i\xmm_suffix, \data -+.else -+.ifnb \xmm_suffix -+ _vaes \enc, \last, \i*16(KEY), \data -+.else -+ _vbroadcast128 \i*16(KEY), V4 -+ _vaes \enc, \last, V4, \data -+.endif -+.endif -+.endm -+ -+// Do a single round of AES en/decryption on the blocks in registers V0-V3, -+// using the same key for all blocks. The round key is loaded from the -+// appropriate register or memory location for round \i. In addition, does step -+// \i of the computation of the next set of tweaks. May clobber V4. -+.macro _vaes_4x enc, last, i -+.if USE_AVX10 -+ _tweak_step (2*(\i-1)) -+ _vaes \enc, \last, KEY\i, V0 -+ _vaes \enc, \last, KEY\i, V1 -+ _tweak_step (2*(\i-1) + 1) -+ _vaes \enc, \last, KEY\i, V2 -+ _vaes \enc, \last, KEY\i, V3 -+.else -+ _vbroadcast128 \i*16(KEY), V4 -+ _tweak_step (2*(\i-1)) -+ _vaes \enc, \last, V4, V0 -+ _vaes \enc, \last, V4, V1 -+ _tweak_step (2*(\i-1) + 1) -+ _vaes \enc, \last, V4, V2 -+ _vaes \enc, \last, V4, V3 -+.endif -+.endm -+ -+// Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, -+// then XOR with \tweak again) of the block(s) in \data. To process a single -+// block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of -+// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4. 
-+.macro _aes_crypt enc, xmm_suffix, tweak, data -+ _xor3 KEY0\xmm_suffix, \tweak, \data -+ _vaes_1x \enc, 0, 1, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 2, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 3, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 4, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 5, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 6, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 7, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 8, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 9, \xmm_suffix, \data -+ cmp $24, KEYLEN -+ jle .Laes_128_or_192\@ -+ _vaes_1x \enc, 0, 10, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 11, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 12, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 13, \xmm_suffix, \data -+ _vaes_1x \enc, 1, 14, \xmm_suffix, \data -+ jmp .Laes_done\@ -+.Laes_128_or_192\@: -+ je .Laes_192\@ -+ _vaes_1x \enc, 1, 10, \xmm_suffix, \data -+ jmp .Laes_done\@ -+.Laes_192\@: -+ _vaes_1x \enc, 0, 10, \xmm_suffix, \data -+ _vaes_1x \enc, 0, 11, \xmm_suffix, \data -+ _vaes_1x \enc, 1, 12, \xmm_suffix, \data -+.Laes_done\@: -+ _vpxor \tweak, \data, \data -+.endm -+ -+.macro _aes_xts_crypt enc -+ _define_aliases -+ -+ // Load the AES key length: 16 (AES-128), 24 (AES-192), or 32 (AES-256). -+ movl 480(KEY), KEYLEN -+ -+ // If decrypting, advance KEY to the decryption round keys. -+.if !\enc -+ add $240, KEY -+.endif -+ -+ // Check whether the data length is a multiple of the AES block length. -+ test $15, LEN -+ jnz .Lneed_cts\@ -+.Lxts_init\@: -+ -+ // Cache as many round keys as possible. -+ _load_round_keys -+ -+ // Compute the first set of tweaks TWEAK[0-3]. -+ _compute_first_set_of_tweaks -+ -+ sub $4*VL, LEN -+ jl .Lhandle_remainder\@ -+ -+.Lmain_loop\@: -+ // This is the main loop, en/decrypting 4*VL bytes per iteration. -+ -+ // XOR each source block with its tweak and the first round key. -+.if USE_AVX10 -+ vmovdqu8 0*VL(SRC), V0 -+ vmovdqu8 1*VL(SRC), V1 -+ vmovdqu8 2*VL(SRC), V2 -+ vmovdqu8 3*VL(SRC), V3 -+ vpternlogd $0x96, TWEAK0, KEY0, V0 -+ vpternlogd $0x96, TWEAK1, KEY0, V1 -+ vpternlogd $0x96, TWEAK2, KEY0, V2 -+ vpternlogd $0x96, TWEAK3, KEY0, V3 -+.else -+ vpxor 0*VL(SRC), KEY0, V0 -+ vpxor 1*VL(SRC), KEY0, V1 -+ vpxor 2*VL(SRC), KEY0, V2 -+ vpxor 3*VL(SRC), KEY0, V3 -+ vpxor TWEAK0, V0, V0 -+ vpxor TWEAK1, V1, V1 -+ vpxor TWEAK2, V2, V2 -+ vpxor TWEAK3, V3, V3 -+.endif -+ // Do all the AES rounds on the data blocks, interleaved with -+ // the computation of the next set of tweaks. -+ _vaes_4x \enc, 0, 1 -+ _vaes_4x \enc, 0, 2 -+ _vaes_4x \enc, 0, 3 -+ _vaes_4x \enc, 0, 4 -+ _vaes_4x \enc, 0, 5 -+ _vaes_4x \enc, 0, 6 -+ _vaes_4x \enc, 0, 7 -+ _vaes_4x \enc, 0, 8 -+ _vaes_4x \enc, 0, 9 -+ // Try to optimize for AES-256 by keeping the code for AES-128 and -+ // AES-192 out-of-line. -+ cmp $24, KEYLEN -+ jle .Lencrypt_4x_aes_128_or_192\@ -+ _vaes_4x \enc, 0, 10 -+ _vaes_4x \enc, 0, 11 -+ _vaes_4x \enc, 0, 12 -+ _vaes_4x \enc, 0, 13 -+ _vaes_4x \enc, 1, 14 -+.Lencrypt_4x_done\@: -+ -+ // XOR in the tweaks again. -+ _vpxor TWEAK0, V0, V0 -+ _vpxor TWEAK1, V1, V1 -+ _vpxor TWEAK2, V2, V2 -+ _vpxor TWEAK3, V3, V3 -+ -+ // Store the destination blocks. -+ _vmovdqu V0, 0*VL(DST) -+ _vmovdqu V1, 1*VL(DST) -+ _vmovdqu V2, 2*VL(DST) -+ _vmovdqu V3, 3*VL(DST) -+ -+ // Finish computing the next set of tweaks. -+ _tweak_step 1000 -+ -+ add $4*VL, SRC -+ add $4*VL, DST -+ sub $4*VL, LEN -+ jge .Lmain_loop\@ -+ -+ // Check for the uncommon case where the data length isn't a multiple of -+ // 4*VL. Handle it out-of-line in order to optimize for the common -+ // case. 
In the common case, just fall through to the ret. -+ test $4*VL-1, LEN -+ jnz .Lhandle_remainder\@ -+.Ldone\@: -+ // Store the next tweak back to *TWEAK to support continuation calls. -+ vmovdqu TWEAK0_XMM, (TWEAK) -+.if VL > 16 -+ vzeroupper -+.endif -+ RET -+ -+.Lhandle_remainder\@: -+ add $4*VL, LEN // Undo the extra sub from earlier. -+ -+ // En/decrypt any remaining full blocks, one vector at a time. -+.if VL > 16 -+ sub $VL, LEN -+ jl .Lvec_at_a_time_done\@ -+.Lvec_at_a_time\@: -+ _vmovdqu (SRC), V0 -+ _aes_crypt \enc, , TWEAK0, V0 -+ _vmovdqu V0, (DST) -+ _next_tweakvec TWEAK0, V0, V1, TWEAK0 -+ add $VL, SRC -+ add $VL, DST -+ sub $VL, LEN -+ jge .Lvec_at_a_time\@ -+.Lvec_at_a_time_done\@: -+ add $VL-16, LEN // Undo the extra sub from earlier. -+.else -+ sub $16, LEN -+.endif -+ -+ // En/decrypt any remaining full blocks, one at a time. -+ jl .Lblock_at_a_time_done\@ -+.Lblock_at_a_time\@: -+ vmovdqu (SRC), %xmm0 -+ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 -+ vmovdqu %xmm0, (DST) -+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM -+ add $16, SRC -+ add $16, DST -+ sub $16, LEN -+ jge .Lblock_at_a_time\@ -+.Lblock_at_a_time_done\@: -+ add $16, LEN // Undo the extra sub from earlier. -+ -+.Lfull_blocks_done\@: -+ // Now 0 <= LEN <= 15. If LEN is nonzero, do ciphertext stealing to -+ // process the last 16 + LEN bytes. If LEN is zero, we're done. -+ test LEN, LEN -+ jnz .Lcts\@ -+ jmp .Ldone\@ -+ -+ // Out-of-line handling of AES-128 and AES-192 -+.Lencrypt_4x_aes_128_or_192\@: -+ jz .Lencrypt_4x_aes_192\@ -+ _vaes_4x \enc, 1, 10 -+ jmp .Lencrypt_4x_done\@ -+.Lencrypt_4x_aes_192\@: -+ _vaes_4x \enc, 0, 10 -+ _vaes_4x \enc, 0, 11 -+ _vaes_4x \enc, 1, 12 -+ jmp .Lencrypt_4x_done\@ -+ -+.Lneed_cts\@: -+ // The data length isn't a multiple of the AES block length, so -+ // ciphertext stealing (CTS) will be needed. Subtract one block from -+ // LEN so that the main loop doesn't process the last full block. The -+ // CTS step will process it specially along with the partial block. -+ sub $16, LEN -+ jmp .Lxts_init\@ -+ -+.Lcts\@: -+ // Do ciphertext stealing (CTS) to en/decrypt the last full block and -+ // the partial block. CTS needs two tweaks. TWEAK0_XMM contains the -+ // next tweak; compute the one after that. Decryption uses these two -+ // tweaks in reverse order, so also define aliases to handle that. -+ _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM -+.if \enc -+ .set CTS_TWEAK0, TWEAK0_XMM -+ .set CTS_TWEAK1, TWEAK1_XMM -+.else -+ .set CTS_TWEAK0, TWEAK1_XMM -+ .set CTS_TWEAK1, TWEAK0_XMM -+.endif -+ -+ // En/decrypt the last full block. -+ vmovdqu (SRC), %xmm0 -+ _aes_crypt \enc, _XMM, CTS_TWEAK0, %xmm0 -+ -+.if USE_AVX10 -+ // Create a mask that has the first LEN bits set. -+ mov $-1, %rax -+ bzhi LEN, %rax, %rax -+ kmovq %rax, %k1 -+ -+ // Swap the first LEN bytes of the above result with the partial block. -+ // Note that to support in-place en/decryption, the load from the src -+ // partial block must happen before the store to the dst partial block. -+ vmovdqa %xmm0, %xmm1 -+ vmovdqu8 16(SRC), %xmm0{%k1} -+ vmovdqu8 %xmm1, 16(DST){%k1} -+.else -+ lea .Lcts_permute_table(%rip), %rax -+ -+ // Load the src partial block, left-aligned. Note that to support -+ // in-place en/decryption, this must happen before the store to the dst -+ // partial block. -+ vmovdqu (SRC, LEN, 1), %xmm1 -+ -+ // Shift the first LEN bytes of the en/decryption of the last full block -+ // to the end of a register, then store it to DST+LEN. This stores the -+ // dst partial block. 
It also writes to the second part of the dst last -+ // full block, but that part is overwritten later. -+ vpshufb (%rax, LEN, 1), %xmm0, %xmm2 -+ vmovdqu %xmm2, (DST, LEN, 1) -+ -+ // Make xmm3 contain [16-LEN,16-LEN+1,...,14,15,0x80,0x80,...]. -+ sub LEN, %rax -+ vmovdqu 32(%rax), %xmm3 -+ -+ // Shift the src partial block to the beginning of its register. -+ vpshufb %xmm3, %xmm1, %xmm1 -+ -+ // Do a blend to generate the src partial block followed by the second -+ // part of the en/decryption of the last full block. -+ vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 -+.endif -+ // En/decrypt again and store the last full block. -+ _aes_crypt \enc, _XMM, CTS_TWEAK1, %xmm0 -+ vmovdqu %xmm0, (DST) -+ jmp .Ldone\@ -+.endm -+ -+// void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, -+// u8 iv[AES_BLOCK_SIZE]); -+SYM_FUNC_START(aes_xts_encrypt_iv) -+ vmovdqu (%rsi), %xmm0 -+ vpxor 0*16(%rdi), %xmm0, %xmm0 -+ vaesenc 1*16(%rdi), %xmm0, %xmm0 -+ vaesenc 2*16(%rdi), %xmm0, %xmm0 -+ vaesenc 3*16(%rdi), %xmm0, %xmm0 -+ vaesenc 4*16(%rdi), %xmm0, %xmm0 -+ vaesenc 5*16(%rdi), %xmm0, %xmm0 -+ vaesenc 6*16(%rdi), %xmm0, %xmm0 -+ vaesenc 7*16(%rdi), %xmm0, %xmm0 -+ vaesenc 8*16(%rdi), %xmm0, %xmm0 -+ vaesenc 9*16(%rdi), %xmm0, %xmm0 -+ cmpl $24, 480(%rdi) -+ jle .Lencrypt_iv_aes_128_or_192 -+ vaesenc 10*16(%rdi), %xmm0, %xmm0 -+ vaesenc 11*16(%rdi), %xmm0, %xmm0 -+ vaesenc 12*16(%rdi), %xmm0, %xmm0 -+ vaesenc 13*16(%rdi), %xmm0, %xmm0 -+ vaesenclast 14*16(%rdi), %xmm0, %xmm0 -+.Lencrypt_iv_done: -+ vmovdqu %xmm0, (%rsi) -+ RET -+ -+ // Out-of-line handling of AES-128 and AES-192 -+.Lencrypt_iv_aes_128_or_192: -+ jz .Lencrypt_iv_aes_192 -+ vaesenclast 10*16(%rdi), %xmm0, %xmm0 -+ jmp .Lencrypt_iv_done -+.Lencrypt_iv_aes_192: -+ vaesenc 10*16(%rdi), %xmm0, %xmm0 -+ vaesenc 11*16(%rdi), %xmm0, %xmm0 -+ vaesenclast 12*16(%rdi), %xmm0, %xmm0 -+ jmp .Lencrypt_iv_done -+SYM_FUNC_END(aes_xts_encrypt_iv) -+ -+// Below are the actual AES-XTS encryption and decryption functions, -+// instantiated from the above macro. They all have the following prototype: -+// -+// void (*xts_asm_func)(const struct crypto_aes_ctx *key, -+// const u8 *src, u8 *dst, size_t len, -+// u8 tweak[AES_BLOCK_SIZE]); -+// -+// |key| is the data key. |tweak| contains the next tweak; the encryption of -+// the original IV with the tweak key was already done. This function supports -+// incremental computation, but |len| must always be >= 16 (AES_BLOCK_SIZE), and -+// |len| must be a multiple of 16 except on the last call. If |len| is a -+// multiple of 16, then this function updates |tweak| to contain the next tweak. 
-+ -+.set VL, 16 -+.set USE_AVX10, 0 -+SYM_TYPED_FUNC_START(aes_xts_encrypt_aesni_avx) -+ _aes_xts_crypt 1 -+SYM_FUNC_END(aes_xts_encrypt_aesni_avx) -+SYM_TYPED_FUNC_START(aes_xts_decrypt_aesni_avx) -+ _aes_xts_crypt 0 -+SYM_FUNC_END(aes_xts_decrypt_aesni_avx) -+ -+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) -+.set VL, 32 -+.set USE_AVX10, 0 -+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx2) -+ _aes_xts_crypt 1 -+SYM_FUNC_END(aes_xts_encrypt_vaes_avx2) -+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx2) -+ _aes_xts_crypt 0 -+SYM_FUNC_END(aes_xts_decrypt_vaes_avx2) -+ -+.set VL, 32 -+.set USE_AVX10, 1 -+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_256) -+ _aes_xts_crypt 1 -+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_256) -+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_256) -+ _aes_xts_crypt 0 -+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_256) -+ -+.set VL, 64 -+.set USE_AVX10, 1 -+SYM_TYPED_FUNC_START(aes_xts_encrypt_vaes_avx10_512) -+ _aes_xts_crypt 1 -+SYM_FUNC_END(aes_xts_encrypt_vaes_avx10_512) -+SYM_TYPED_FUNC_START(aes_xts_decrypt_vaes_avx10_512) -+ _aes_xts_crypt 0 -+SYM_FUNC_END(aes_xts_decrypt_vaes_avx10_512) -+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ -diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c -index b1d90c25975a..0855ace8659c 100644 ---- a/arch/x86/crypto/aesni-intel_glue.c -+++ b/arch/x86/crypto/aesni-intel_glue.c -@@ -1137,7 +1137,264 @@ static struct skcipher_alg aesni_xctr = { - }; - - static struct simd_skcipher_alg *aesni_simd_xctr; --#endif /* CONFIG_X86_64 */ -+ -+asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, -+ u8 iv[AES_BLOCK_SIZE]); -+ -+typedef void (*xts_asm_func)(const struct crypto_aes_ctx *key, -+ const u8 *src, u8 *dst, size_t len, -+ u8 tweak[AES_BLOCK_SIZE]); -+ -+/* This handles cases where the source and/or destination span pages. */ -+static noinline int -+xts_crypt_slowpath(struct skcipher_request *req, xts_asm_func asm_func) -+{ -+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); -+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); -+ int tail = req->cryptlen % AES_BLOCK_SIZE; -+ struct scatterlist sg_src[2], sg_dst[2]; -+ struct skcipher_request subreq; -+ struct skcipher_walk walk; -+ struct scatterlist *src, *dst; -+ int err; -+ -+ /* -+ * If the message length isn't divisible by the AES block size, then -+ * separate off the last full block and the partial block. This ensures -+ * that they are processed in the same call to the assembly function, -+ * which is required for ciphertext stealing. -+ */ -+ if (tail) { -+ skcipher_request_set_tfm(&subreq, tfm); -+ skcipher_request_set_callback(&subreq, -+ skcipher_request_flags(req), -+ NULL, NULL); -+ skcipher_request_set_crypt(&subreq, req->src, req->dst, -+ req->cryptlen - tail - AES_BLOCK_SIZE, -+ req->iv); -+ req = &subreq; -+ } -+ -+ err = skcipher_walk_virt(&walk, req, false); -+ -+ while (walk.nbytes) { -+ unsigned int nbytes = walk.nbytes; -+ -+ if (nbytes < walk.total) -+ nbytes = round_down(nbytes, AES_BLOCK_SIZE); -+ -+ kernel_fpu_begin(); -+ (*asm_func)(&ctx->crypt_ctx, walk.src.virt.addr, -+ walk.dst.virt.addr, nbytes, req->iv); -+ kernel_fpu_end(); -+ err = skcipher_walk_done(&walk, walk.nbytes - nbytes); -+ } -+ -+ if (err || !tail) -+ return err; -+ -+ /* Do ciphertext stealing with the last full block and partial block. 
*/ -+ -+ dst = src = scatterwalk_ffwd(sg_src, req->src, req->cryptlen); -+ if (req->dst != req->src) -+ dst = scatterwalk_ffwd(sg_dst, req->dst, req->cryptlen); -+ -+ skcipher_request_set_crypt(req, src, dst, AES_BLOCK_SIZE + tail, -+ req->iv); -+ -+ err = skcipher_walk_virt(&walk, req, false); -+ if (err) -+ return err; -+ -+ kernel_fpu_begin(); -+ (*asm_func)(&ctx->crypt_ctx, walk.src.virt.addr, walk.dst.virt.addr, -+ walk.nbytes, req->iv); -+ kernel_fpu_end(); -+ -+ return skcipher_walk_done(&walk, 0); -+} -+ -+/* __always_inline to avoid indirect call in fastpath */ -+static __always_inline int -+xts_crypt2(struct skcipher_request *req, xts_asm_func asm_func) -+{ -+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); -+ const struct aesni_xts_ctx *ctx = aes_xts_ctx(tfm); -+ const unsigned int cryptlen = req->cryptlen; -+ struct scatterlist *src = req->src; -+ struct scatterlist *dst = req->dst; -+ -+ if (unlikely(cryptlen < AES_BLOCK_SIZE)) -+ return -EINVAL; -+ -+ kernel_fpu_begin(); -+ aes_xts_encrypt_iv(&ctx->tweak_ctx, req->iv); -+ -+ /* -+ * In practice, virtually all XTS plaintexts and ciphertexts are either -+ * 512 or 4096 bytes, aligned such that they don't span page boundaries. -+ * To optimize the performance of these cases, and also any other case -+ * where no page boundary is spanned, the below fast-path handles -+ * single-page sources and destinations as efficiently as possible. -+ */ -+ if (likely(src->length >= cryptlen && dst->length >= cryptlen && -+ src->offset + cryptlen <= PAGE_SIZE && -+ dst->offset + cryptlen <= PAGE_SIZE)) { -+ struct page *src_page = sg_page(src); -+ struct page *dst_page = sg_page(dst); -+ void *src_virt = kmap_local_page(src_page) + src->offset; -+ void *dst_virt = kmap_local_page(dst_page) + dst->offset; -+ -+ (*asm_func)(&ctx->crypt_ctx, src_virt, dst_virt, cryptlen, -+ req->iv); -+ kunmap_local(dst_virt); -+ kunmap_local(src_virt); -+ kernel_fpu_end(); -+ return 0; -+ } -+ kernel_fpu_end(); -+ return xts_crypt_slowpath(req, asm_func); -+} -+ -+#define DEFINE_XTS_ALG(suffix, driver_name, priority) \ -+ \ -+asmlinkage void aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, \ -+ const u8 *src, u8 *dst, size_t len, \ -+ u8 tweak[AES_BLOCK_SIZE]); \ -+asmlinkage void aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, \ -+ const u8 *src, u8 *dst, size_t len, \ -+ u8 tweak[AES_BLOCK_SIZE]); \ -+ \ -+static int xts_encrypt_##suffix(struct skcipher_request *req) \ -+{ \ -+ return xts_crypt2(req, aes_xts_encrypt_##suffix); \ -+} \ -+ \ -+static int xts_decrypt_##suffix(struct skcipher_request *req) \ -+{ \ -+ return xts_crypt2(req, aes_xts_decrypt_##suffix); \ -+} \ -+ \ -+static struct skcipher_alg aes_xts_alg_##suffix = { \ -+ .base = { \ -+ .cra_name = "__xts(aes)", \ -+ .cra_driver_name = "__" driver_name, \ -+ .cra_priority = priority, \ -+ .cra_flags = CRYPTO_ALG_INTERNAL, \ -+ .cra_blocksize = AES_BLOCK_SIZE, \ -+ .cra_ctxsize = XTS_AES_CTX_SIZE, \ -+ .cra_module = THIS_MODULE, \ -+ }, \ -+ .min_keysize = 2 * AES_MIN_KEY_SIZE, \ -+ .max_keysize = 2 * AES_MAX_KEY_SIZE, \ -+ .ivsize = AES_BLOCK_SIZE, \ -+ .walksize = 2 * AES_BLOCK_SIZE, \ -+ .setkey = xts_aesni_setkey, \ -+ .encrypt = xts_encrypt_##suffix, \ -+ .decrypt = xts_decrypt_##suffix, \ -+}; \ -+ \ -+static struct simd_skcipher_alg *aes_xts_simdalg_##suffix -+ -+DEFINE_XTS_ALG(aesni_avx, "xts-aes-aesni-avx", 500); -+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) -+DEFINE_XTS_ALG(vaes_avx2, "xts-aes-vaes-avx2", 600); 
-+DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); -+DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); -+#endif -+ -+/* -+ * This is a list of CPU models that are known to suffer from downclocking when -+ * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS -+ * implementation with zmm registers won't be used by default. An -+ * implementation with ymm registers (256-bit vectors) will be used instead. -+ */ -+static const struct x86_cpu_id zmm_exclusion_list[] = { -+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_SKYLAKE_X }, -+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_X }, -+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_D }, -+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE }, -+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_L }, -+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_ICELAKE_NNPI }, -+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE_L }, -+ { .vendor = X86_VENDOR_INTEL, .family = 6, .model = INTEL_FAM6_TIGERLAKE }, -+ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ -+ /* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). */ -+ {}, -+}; -+ -+static int __init register_xts_algs(void) -+{ -+ int err; -+ -+ if (!boot_cpu_has(X86_FEATURE_AVX)) -+ return 0; -+ err = simd_register_skciphers_compat(&aes_xts_alg_aesni_avx, 1, -+ &aes_xts_simdalg_aesni_avx); -+ if (err) -+ return err; -+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) -+ if (!boot_cpu_has(X86_FEATURE_AVX2) || -+ !boot_cpu_has(X86_FEATURE_VAES) || -+ !boot_cpu_has(X86_FEATURE_VPCLMULQDQ) || -+ !boot_cpu_has(X86_FEATURE_PCLMULQDQ) || -+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM, NULL)) -+ return 0; -+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx2, 1, -+ &aes_xts_simdalg_vaes_avx2); -+ if (err) -+ return err; -+ -+ if (!boot_cpu_has(X86_FEATURE_AVX512BW) || -+ !boot_cpu_has(X86_FEATURE_AVX512VL) || -+ !boot_cpu_has(X86_FEATURE_BMI2) || -+ !cpu_has_xfeatures(XFEATURE_MASK_SSE | XFEATURE_MASK_YMM | -+ XFEATURE_MASK_AVX512, NULL)) -+ return 0; -+ -+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_256, 1, -+ &aes_xts_simdalg_vaes_avx10_256); -+ if (err) -+ return err; -+ -+ if (x86_match_cpu(zmm_exclusion_list)) -+ aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; -+ -+ err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, -+ &aes_xts_simdalg_vaes_avx10_512); -+ if (err) -+ return err; -+#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ -+ return 0; -+} -+ -+static void unregister_xts_algs(void) -+{ -+ if (aes_xts_simdalg_aesni_avx) -+ simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, -+ &aes_xts_simdalg_aesni_avx); -+#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) -+ if (aes_xts_simdalg_vaes_avx2) -+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, -+ &aes_xts_simdalg_vaes_avx2); -+ if (aes_xts_simdalg_vaes_avx10_256) -+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, -+ &aes_xts_simdalg_vaes_avx10_256); -+ if (aes_xts_simdalg_vaes_avx10_512) -+ simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, -+ &aes_xts_simdalg_vaes_avx10_512); -+#endif -+} -+#else /* CONFIG_X86_64 */ -+static int __init register_xts_algs(void) -+{ -+ return 0; -+} -+ -+static void unregister_xts_algs(void) -+{ -+} -+#endif /* !CONFIG_X86_64 */ - - #ifdef CONFIG_X86_64 - static int generic_gcmaes_set_key(struct 
crypto_aead *aead, const u8 *key, -@@ -1276,13 +1533,21 @@ static int __init aesni_init(void) - goto unregister_aeads; - #endif /* CONFIG_X86_64 */ - -+ err = register_xts_algs(); -+ if (err) -+ goto unregister_xts; -+ - return 0; - -+unregister_xts: -+ unregister_xts_algs(); - #ifdef CONFIG_X86_64 -+ if (aesni_simd_xctr) -+ simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); - unregister_aeads: -+#endif /* CONFIG_X86_64 */ - simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), - aesni_simd_aeads); --#endif /* CONFIG_X86_64 */ - - unregister_skciphers: - simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), -@@ -1303,6 +1568,7 @@ static void __exit aesni_exit(void) - if (boot_cpu_has(X86_FEATURE_AVX)) - simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); - #endif /* CONFIG_X86_64 */ -+ unregister_xts_algs(); - } - - late_initcall(aesni_init); --- -2.44.0 - -From 4a47b09deb67c3854ac102bcb18ef0df00aae437 Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Wed, 3 Apr 2024 17:06:20 +0200 -Subject: [PATCH 2/8] amd-pstate - -Signed-off-by: Peter Jung ---- - .../admin-guide/kernel-parameters.txt | 5 + - Documentation/admin-guide/pm/amd-pstate.rst | 70 ++- - arch/x86/Kconfig | 5 +- - arch/x86/include/asm/msr-index.h | 2 + - drivers/acpi/cppc_acpi.c | 17 +- - drivers/acpi/processor_driver.c | 6 + - drivers/cpufreq/acpi-cpufreq.c | 2 - - drivers/cpufreq/amd-pstate-ut.c | 2 +- - drivers/cpufreq/amd-pstate.c | 499 +++++++++++++++--- - include/acpi/cppc_acpi.h | 5 + - include/linux/amd-pstate.h | 32 +- - include/linux/cpufreq.h | 1 + - 12 files changed, 560 insertions(+), 86 deletions(-) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index d2150bd3acc5..71ed7f1b0f9b 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -374,6 +374,11 @@ - selects a performance level in this range and appropriate - to the current workload. - -+ amd_prefcore= -+ [X86] -+ disable -+ Disable amd-pstate preferred core. -+ - amijoy.map= [HW,JOY] Amiga joystick support - Map of devices attached to JOY0DAT and JOY1DAT - Format: , diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst -index 9eb26014d34b..82fbd01da658 100644 +index 1e0d101b020a..d0324d44f548 100644 --- a/Documentation/admin-guide/pm/amd-pstate.rst +++ b/Documentation/admin-guide/pm/amd-pstate.rst -@@ -300,8 +300,8 @@ platforms. The AMD P-States mechanism is the more performance and energy - efficiency frequency management method on AMD processors. +@@ -281,6 +281,22 @@ integer values defined between 0 to 255 when EPP feature is enabled by platform + firmware, if EPP feature is disabled, driver will ignore the written value + This attribute is read-write. ++``boost`` ++The `boost` sysfs attribute provides control over the CPU core ++performance boost, allowing users to manage the maximum frequency limitation ++of the CPU. This attribute can be used to enable or disable the boost feature ++on individual CPUs. ++ ++When the boost feature is enabled, the CPU can dynamically increase its frequency ++beyond the base frequency, providing enhanced performance for demanding workloads. ++On the other hand, disabling the boost feature restricts the CPU to operate at the ++base frequency, which may be desirable in certain scenarios to prioritize power ++efficiency or manage temperature. 
++ ++To manipulate the `boost` attribute, users can write a value of `0` to disable the ++boost or `1` to enable it, for the respective CPU using the sysfs path ++`/sys/devices/system/cpu/cpuX/cpufreq/boost`, where `X` represents the CPU number. ++ + Other performance and frequency values can be read back from + ``/sys/devices/system/cpu/cpuX/acpi_cppc/``, see :ref:`cppc_sysfs`. --AMD Pstate Driver Operation Modes --================================= -+``amd-pstate`` Driver Operation Modes -+====================================== +@@ -406,7 +422,7 @@ control its functionality at the system level. They are located in the + ``/sys/devices/system/cpu/amd_pstate/`` directory and affect all CPUs. - ``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode, - non-autonomous (passive) mode and guided autonomous (guided) mode. -@@ -353,6 +353,48 @@ is activated. In this mode, driver requests minimum and maximum performance - level and the platform autonomously selects a performance level in this range - and appropriate to the current workload. + ``status`` +- Operation mode of the driver: "active", "passive" or "disable". ++ Operation mode of the driver: "active", "passive", "guided" or "disable". -+``amd-pstate`` Preferred Core -+================================= -+ -+The core frequency is subjected to the process variation in semiconductors. -+Not all cores are able to reach the maximum frequency respecting the -+infrastructure limits. Consequently, AMD has redefined the concept of -+maximum frequency of a part. This means that a fraction of cores can reach -+maximum frequency. To find the best process scheduling policy for a given -+scenario, OS needs to know the core ordering informed by the platform through -+highest performance capability register of the CPPC interface. -+ -+``amd-pstate`` preferred core enables the scheduler to prefer scheduling on -+cores that can achieve a higher frequency with lower voltage. The preferred -+core rankings can dynamically change based on the workload, platform conditions, -+thermals and ageing. -+ -+The priority metric will be initialized by the ``amd-pstate`` driver. The ``amd-pstate`` -+driver will also determine whether or not ``amd-pstate`` preferred core is -+supported by the platform. -+ -+``amd-pstate`` driver will provide an initial core ordering when the system boots. -+The platform uses the CPPC interfaces to communicate the core ranking to the -+operating system and scheduler to make sure that OS is choosing the cores -+with highest performance firstly for scheduling the process. When ``amd-pstate`` -+driver receives a message with the highest performance change, it will -+update the core ranking and set the cpu's priority. -+ -+``amd-pstate`` Preferred Core Switch -+===================================== -+Kernel Parameters -+----------------- -+ -+``amd-pstate`` peferred core`` has two states: enable and disable. -+Enable/disable states can be chosen by different kernel parameters. -+Default enable ``amd-pstate`` preferred core. -+ -+``amd_prefcore=disable`` -+ -+For systems that support ``amd-pstate`` preferred core, the core rankings will -+always be advertised by the platform. But OS can choose to ignore that via the -+kernel parameter ``amd_prefcore=disable``. 
-+ - User Space Interface in ``sysfs`` - General - =========================================== + "active" + The driver is functional and in the ``active mode`` +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 3c7434329661..6c128d463a14 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -470,6 +470,7 @@ + #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ + #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ + #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ ++#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* "" AMD Fast CPPC */ -@@ -385,6 +427,30 @@ control its functionality at the system level. They are located in the - to the operation mode represented by that string - or to be - unregistered in the "disable" case. - -+``prefcore`` -+ Preferred core state of the driver: "enabled" or "disabled". -+ -+ "enabled" -+ Enable the ``amd-pstate`` preferred core. -+ -+ "disabled" -+ Disable the ``amd-pstate`` preferred core -+ -+ -+ This attribute is read-only to check the state of preferred core set -+ by the kernel parameter. -+ -+``cpb_boost`` -+ Specifies whether core performance boost is requested to be enabled or disabled -+ If core performance boost is disabled while a core is in a boosted P-state, the -+ core automatically transitions to the highest performance non-boosted P-state. -+ AMD Core Performance Boost(CPB) is controlled by this new attribute file which -+ allow user to change all cores frequency boosting state. It supports both -+ ``active``, ``passive`` and ``guided`` mode control with below value write to it. -+ -+ "0" Disable Core Performance Boosting -+ "1" Enable Core Performance Boosting -+ - ``cpupower`` tool support for ``amd-pstate`` - =============================================== - -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 184730705650..70732a76171f 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1054,8 +1054,9 @@ config SCHED_MC - - config SCHED_MC_PRIO - bool "CPU core priorities scheduler support" -- depends on SCHED_MC && CPU_SUP_INTEL -- select X86_INTEL_PSTATE -+ depends on SCHED_MC -+ select X86_INTEL_PSTATE if CPU_SUP_INTEL -+ select X86_AMD_PSTATE if CPU_SUP_AMD && ACPI - select CPU_FREQ - default y - help + /* + * BUG word(s) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h -index d1b5edaf6c34..bfe139eb75b6 100644 +index e022e6eb766c..384739d592af 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h -@@ -744,6 +744,8 @@ +@@ -781,6 +781,8 @@ #define MSR_K7_HWCR_IRPERF_EN BIT_ULL(MSR_K7_HWCR_IRPERF_EN_BIT) #define MSR_K7_FID_VID_CTL 0xc0010041 #define MSR_K7_FID_VID_STATUS 0xc0010042 @@ -1356,68 +78,32 @@ index d1b5edaf6c34..bfe139eb75b6 100644 /* K6 MSRs */ #define MSR_K6_WHCR 0xc0000082 -diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c -index d155a86a8614..e23a84f4a50a 100644 ---- a/drivers/acpi/cppc_acpi.c -+++ b/drivers/acpi/cppc_acpi.c -@@ -679,8 +679,10 @@ int acpi_cppc_processor_probe(struct acpi_processor *pr) - - if (!osc_sb_cppc2_support_acked) { - pr_debug("CPPC v2 _OSC not acked\n"); -- if (!cpc_supported_by_cpu()) -+ if (!cpc_supported_by_cpu()) { -+ pr_debug("CPPC is not supported by the CPU\n"); - return -ENODEV; -+ } - } - - /* Parse the ACPI _CPC table for this CPU. 
*/ -@@ -1157,6 +1159,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) - return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); - } - -+/** -+ * cppc_get_highest_perf - Get the highest performance register value. -+ * @cpunum: CPU from which to get highest performance. -+ * @highest_perf: Return address. -+ * -+ * Return: 0 for success, -EIO otherwise. -+ */ -+int cppc_get_highest_perf(int cpunum, u64 *highest_perf) -+{ -+ return cppc_get_perf(cpunum, HIGHEST_PERF, highest_perf); -+} -+EXPORT_SYMBOL_GPL(cppc_get_highest_perf); -+ - /** - * cppc_get_epp_perf - Get the epp register value. - * @cpunum: CPU from which to get epp preference value. -diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c -index 4bd16b3f0781..67db60eda370 100644 ---- a/drivers/acpi/processor_driver.c -+++ b/drivers/acpi/processor_driver.c -@@ -27,6 +27,7 @@ - #define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80 - #define ACPI_PROCESSOR_NOTIFY_POWER 0x81 - #define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82 -+#define ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED 0x85 - - MODULE_AUTHOR("Paul Diefenbaugh"); - MODULE_DESCRIPTION("ACPI Processor Driver"); -@@ -83,6 +84,11 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data) - acpi_bus_generate_netlink_event(device->pnp.device_class, - dev_name(&device->dev), event, 0); - break; -+ case ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED: -+ cpufreq_update_limits(pr->id); -+ acpi_bus_generate_netlink_event(device->pnp.device_class, -+ dev_name(&device->dev), event, 0); -+ break; - default: - acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); - break; +diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c +index af5aa2c754c2..c84c30188fdf 100644 +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -45,6 +45,7 @@ static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, ++ { X86_FEATURE_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, + { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, +diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 +index 438c9e75a04d..97c2d4f15d76 100644 +--- a/drivers/cpufreq/Kconfig.x86 ++++ b/drivers/cpufreq/Kconfig.x86 +@@ -71,6 +71,7 @@ config X86_AMD_PSTATE_DEFAULT_MODE + config X86_AMD_PSTATE_UT + tristate "selftest for AMD Processor P-State driver" + depends on X86 && ACPI_PROCESSOR ++ depends on X86_AMD_PSTATE + default n + help + This kernel module is used for testing. It's safe to say M here. 
diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c -index 37f1cdf46d29..2fc82831bddd 100644 +index 4ac3a35dcd98..f4f8587c4ea0 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -50,8 +50,6 @@ enum { @@ -1429,151 +115,198 @@ index 37f1cdf46d29..2fc82831bddd 100644 struct acpi_cpufreq_data { unsigned int resume; unsigned int cpu_feature; -diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c -index f04ae67dda37..b3601b0e6dd3 100644 ---- a/drivers/cpufreq/amd-pstate-ut.c -+++ b/drivers/cpufreq/amd-pstate-ut.c -@@ -226,7 +226,7 @@ static void amd_pstate_ut_check_freq(u32 index) - goto skip_test; - } +@@ -139,6 +137,7 @@ static int set_boost(struct cpufreq_policy *policy, int val) + (void *)(long)val, 1); + pr_debug("CPU %*pbl: Core Boosting %s.\n", + cpumask_pr_args(policy->cpus), str_enabled_disabled(val)); ++ policy->boost_enabled = val; -- if (cpudata->boost_supported) { -+ if (amd_pstate_global_params.cpb_boost) { - if ((policy->max == cpudata->max_freq) || - (policy->max == cpudata->nominal_freq)) - amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; -diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 07f341995439..651055df1710 100644 ---- a/drivers/cpufreq/amd-pstate.c -+++ b/drivers/cpufreq/amd-pstate.c -@@ -37,6 +37,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -64,6 +65,10 @@ static struct cpufreq_driver amd_pstate_driver; - static struct cpufreq_driver amd_pstate_epp_driver; - static int cppc_state = AMD_PSTATE_UNDEFINED; - static bool cppc_enabled; -+static bool amd_pstate_prefcore = true; -+static struct quirk_entry *quirks; -+struct amd_pstate_global_params amd_pstate_global_params; -+EXPORT_SYMBOL_GPL(amd_pstate_global_params); - - /* - * AMD Energy Preference Performance (EPP) -@@ -108,6 +113,41 @@ static unsigned int epp_values[] = { - - typedef int (*cppc_mode_transition_fn)(int); - -+static struct quirk_entry quirk_amd_7k62 = { -+ .nominal_freq = 2600, -+ .lowest_freq = 550, -+}; -+ -+static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) -+{ -+ /** -+ * match the broken bios for family 17h processor support CPPC V2 -+ * broken BIOS lack of nominal_freq and lowest_freq capabilities -+ * definition in ACPI tables -+ */ -+ if (boot_cpu_has(X86_FEATURE_ZEN2)) { -+ quirks = dmi->driver_data; -+ pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static const struct dmi_system_id amd_pstate_quirks_table[] __initconst = { -+ { -+ .callback = dmi_matched_7k62_bios_bug, -+ .ident = "AMD EPYC 7K62", -+ .matches = { -+ DMI_MATCH(DMI_BIOS_VERSION, "5.14"), -+ DMI_MATCH(DMI_BIOS_RELEASE, "12/12/2019"), -+ }, -+ .driver_data = &quirk_amd_7k62, -+ }, -+ {} -+}; -+MODULE_DEVICE_TABLE(dmi, amd_pstate_quirks_table); -+ - static inline int get_mode_idx_from_str(const char *str, size_t size) - { - int i; -@@ -291,16 +331,20 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) - { - u64 cap1; - u32 highest_perf; -+ struct cppc_perf_caps cppc_perf; -+ int ret; - -- int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, -+ ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, - &cap1); - if (ret) - return ret; - -- /* -- * TODO: Introduce AMD specific power feature. -- * -- * CPPC entry doesn't indicate the highest performance in some ASICs. 
-+ ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); -+ if (ret) -+ return ret; -+ -+ /* Some CPUs have different highest_perf from others, it is safer -+ * to read it than to assume some erroneous value, leading to performance issues. - */ - highest_perf = amd_get_highest_perf(); - if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1)) -@@ -311,7 +355,11 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) - WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); -+ WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1)); - WRITE_ONCE(cpudata->min_limit_perf, AMD_CPPC_LOWEST_PERF(cap1)); -+ WRITE_ONCE(cpudata->lowest_freq, cppc_perf.lowest_freq); -+ WRITE_ONCE(cpudata->nominal_freq, cppc_perf.nominal_freq); -+ return 0; } +diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c +index fc275d41d51e..66b73c308ce6 100644 +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -202,6 +202,7 @@ static void amd_pstate_ut_check_freq(u32 index) + int cpu = 0; + struct cpufreq_policy *policy = NULL; + struct amd_cpudata *cpudata = NULL; ++ u32 nominal_freq_khz; -@@ -319,11 +367,15 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) + for_each_possible_cpu(cpu) { + policy = cpufreq_cpu_get(cpu); +@@ -209,13 +210,14 @@ static void amd_pstate_ut_check_freq(u32 index) + break; + cpudata = policy->driver_data; + +- if (!((cpudata->max_freq >= cpudata->nominal_freq) && +- (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) && ++ nominal_freq_khz = cpudata->nominal_freq*1000; ++ if (!((cpudata->max_freq >= nominal_freq_khz) && ++ (nominal_freq_khz > cpudata->lowest_nonlinear_freq) && + (cpudata->lowest_nonlinear_freq > cpudata->min_freq) && + (cpudata->min_freq > 0))) { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", +- __func__, cpu, cpudata->max_freq, cpudata->nominal_freq, ++ __func__, cpu, cpudata->max_freq, nominal_freq_khz, + cpudata->lowest_nonlinear_freq, cpudata->min_freq); + goto skip_test; + } +@@ -229,13 +231,13 @@ static void amd_pstate_ut_check_freq(u32 index) + + if (cpudata->boost_supported) { + if ((policy->max == cpudata->max_freq) || +- (policy->max == cpudata->nominal_freq)) ++ (policy->max == nominal_freq_khz)) + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; + else { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n", + __func__, cpu, policy->max, cpudata->max_freq, +- cpudata->nominal_freq); ++ nominal_freq_khz); + goto skip_test; + } + } else { +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index 9ad62dbe8bfb..804fab4ebb26 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -51,6 +51,7 @@ + + #define AMD_PSTATE_TRANSITION_LATENCY 20000 + #define AMD_PSTATE_TRANSITION_DELAY 1000 ++#define AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY 600 + #define CPPC_HIGHEST_PERF_PERFORMANCE 196 + #define CPPC_HIGHEST_PERF_DEFAULT 166 + +@@ -85,15 +86,6 @@ struct quirk_entry { + u32 lowest_freq; + }; + +-/* +- * TODO: We need more time to fine tune processors with shared memory solution +- * with community together. 
+- * +- * There are some performance drops on the CPU benchmarks which reports from +- * Suse. We are co-working with them to fine tune the shared memory solution. So +- * we disable it by default to go acpi-cpufreq on these processors and add a +- * module parameter to be able to enable it manually for debugging. +- */ + static struct cpufreq_driver *current_pstate_driver; + static struct cpufreq_driver amd_pstate_driver; + static struct cpufreq_driver amd_pstate_epp_driver; +@@ -157,7 +149,7 @@ static int __init dmi_matched_7k62_bios_bug(const struct dmi_system_id *dmi) + * broken BIOS lack of nominal_freq and lowest_freq capabilities + * definition in ACPI tables + */ +- if (boot_cpu_has(X86_FEATURE_ZEN2)) { ++ if (cpu_feature_enabled(X86_FEATURE_ZEN2)) { + quirks = dmi->driver_data; + pr_info("Overriding nominal and lowest frequencies for %s\n", dmi->ident); + return 1; +@@ -199,7 +191,7 @@ static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) + u64 epp; + int ret; + +- if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + if (!cppc_req_cached) { + epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, + &cppc_req_cached); +@@ -247,12 +239,32 @@ static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) + return index; + } + ++static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, ++ u32 des_perf, u32 max_perf, bool fast_switch) ++{ ++ if (fast_switch) ++ wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached)); ++ else ++ wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, ++ READ_ONCE(cpudata->cppc_req_cached)); ++} ++ ++DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf); ++ ++static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, ++ u32 min_perf, u32 des_perf, ++ u32 max_perf, bool fast_switch) ++{ ++ static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, ++ max_perf, fast_switch); ++} ++ + static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) { - struct cppc_perf_caps cppc_perf; - u32 highest_perf; -+ int ret; + int ret; + struct cppc_perf_ctrls perf_ctrls; -- int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); -+ ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); - if (ret) - return ret; +- if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + u64 value = READ_ONCE(cpudata->cppc_req_cached); -+ /* Some CPUs have different highest_perf from others, it is safer -+ * to read it than to assume some erroneous value, leading to performance issues. 
-+ */ - highest_perf = amd_get_highest_perf(); - if (highest_perf > cppc_perf.highest_perf) - highest_perf = cppc_perf.highest_perf; -@@ -334,7 +386,10 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) - WRITE_ONCE(cpudata->lowest_nonlinear_perf, - cppc_perf.lowest_nonlinear_perf); - WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); -+ WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf); - WRITE_ONCE(cpudata->min_limit_perf, cppc_perf.lowest_perf); -+ WRITE_ONCE(cpudata->lowest_freq, cppc_perf.lowest_freq); -+ WRITE_ONCE(cpudata->nominal_freq, cppc_perf.nominal_freq); + value &= ~GENMASK_ULL(31, 24); +@@ -263,6 +275,9 @@ static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) + if (!ret) + cpudata->epp_cached = epp; + } else { ++ amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, ++ cpudata->max_limit_perf, false); ++ + perf_ctrls.energy_perf = epp; + ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + if (ret) { +@@ -281,10 +296,8 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, + int epp = -EINVAL; + int ret; - if (cppc_state == AMD_PSTATE_ACTIVE) - return 0; -@@ -430,7 +485,10 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) +- if (!pref_index) { +- pr_debug("EPP pref_index is invalid\n"); +- return -EINVAL; +- } ++ if (!pref_index) ++ epp = cpudata->epp_default; + + if (epp == -EINVAL) + epp = epp_values[pref_index]; +@@ -452,16 +465,6 @@ static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) + return static_call(amd_pstate_init_perf)(cpudata); + } + +-static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, +- u32 des_perf, u32 max_perf, bool fast_switch) +-{ +- if (fast_switch) +- wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached)); +- else +- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, +- READ_ONCE(cpudata->cppc_req_cached)); +-} +- + static void cppc_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, + u32 max_perf, bool fast_switch) +@@ -475,16 +478,6 @@ static void cppc_update_perf(struct amd_cpudata *cpudata, + cppc_set_perf(cpudata->cpu, &perf_ctrls); + } + +-DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf); +- +-static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, +- u32 min_perf, u32 des_perf, +- u32 max_perf, bool fast_switch) +-{ +- static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, +- max_perf, fast_switch); +-} +- + static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) + { + u64 aperf, mperf, tsc; +@@ -521,7 +514,10 @@ static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, u32 des_perf, u32 max_perf, bool fast_switch, int gov_flags) { @@ -1584,7 +317,7 @@ index 07f341995439..651055df1710 100644 u64 value = prev; min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf, -@@ -439,6 +497,9 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, +@@ -530,6 +526,9 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, cpudata->max_limit_perf); des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); @@ -1594,39 +327,18 @@ index 07f341995439..651055df1710 100644 if ((cppc_state == AMD_PSTATE_GUIDED) && (gov_flags & CPUFREQ_GOV_DYNAMIC_SWITCHING)) { min_perf = des_perf; des_perf = 0; -@@ -450,6 +511,10 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, +@@ -541,6 +540,10 @@ static void 
amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, value &= ~AMD_CPPC_DES_PERF(~0L); value |= AMD_CPPC_DES_PERF(des_perf); + /* limit the max perf when core performance boost feature is disabled */ -+ if (!amd_pstate_global_params.cpb_boost) ++ if (!cpudata->boost_supported) + max_perf = min_t(unsigned long, nominal_perf, max_perf); + value &= ~AMD_CPPC_MAX_PERF(~0L); value |= AMD_CPPC_MAX_PERF(max_perf); -@@ -477,12 +542,19 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy) - - static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) - { -- u32 max_limit_perf, min_limit_perf; -+ u32 max_limit_perf, min_limit_perf, lowest_perf; - struct amd_cpudata *cpudata = policy->driver_data; - - max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); - min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); - -+ lowest_perf = READ_ONCE(cpudata->lowest_perf); -+ if (min_limit_perf < lowest_perf) -+ min_limit_perf = lowest_perf; -+ -+ if (max_limit_perf < min_limit_perf) -+ max_limit_perf = min_limit_perf; -+ - WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); - WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); - WRITE_ONCE(cpudata->max_limit_freq, policy->max); -@@ -553,10 +625,9 @@ static void amd_pstate_adjust_perf(unsigned int cpu, +@@ -651,10 +654,9 @@ static void amd_pstate_adjust_perf(unsigned int cpu, unsigned long capacity) { unsigned long max_perf, min_perf, des_perf, @@ -1638,7 +350,7 @@ index 07f341995439..651055df1710 100644 if (policy->min != cpudata->min_limit_freq || policy->max != cpudata->max_limit_freq) amd_pstate_update_min_max_limit(policy); -@@ -564,7 +635,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu, +@@ -662,7 +664,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu, cap_perf = READ_ONCE(cpudata->highest_perf); lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); @@ -1646,7 +358,7 @@ index 07f341995439..651055df1710 100644 des_perf = cap_perf; if (target_perf < capacity) -@@ -582,8 +652,6 @@ static void amd_pstate_adjust_perf(unsigned int cpu, +@@ -680,51 +681,111 @@ static void amd_pstate_adjust_perf(unsigned int cpu, max_perf = min_perf; des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); @@ -1655,378 +367,24 @@ index 07f341995439..651055df1710 100644 amd_pstate_update(cpudata, min_perf, des_perf, max_perf, true, policy->governor->flags); -@@ -592,30 +660,30 @@ static void amd_pstate_adjust_perf(unsigned int cpu, - - static int amd_get_min_freq(struct amd_cpudata *cpudata) - { -- struct cppc_perf_caps cppc_perf; -+ u32 lowest_freq; - -- int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); -- if (ret) -- return ret; -+ if (quirks && quirks->lowest_freq) -+ lowest_freq = quirks->lowest_freq; -+ else -+ lowest_freq = READ_ONCE(cpudata->lowest_freq); - - /* Switch to khz */ -- return cppc_perf.lowest_freq * 1000; -+ return lowest_freq * 1000; - } - - static int amd_get_max_freq(struct amd_cpudata *cpudata) - { -- struct cppc_perf_caps cppc_perf; - u32 max_perf, max_freq, nominal_freq, nominal_perf; - u64 boost_ratio; - -- int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); -- if (ret) -- return ret; -- -- nominal_freq = cppc_perf.nominal_freq; -+ nominal_freq = READ_ONCE(cpudata->nominal_freq); - nominal_perf = READ_ONCE(cpudata->nominal_perf); - max_perf = READ_ONCE(cpudata->highest_perf); - -+ /* when boost is off, the highest perf will be limited to nominal_perf */ -+ if (!amd_pstate_global_params.cpb_boost) -+ max_perf = 
nominal_perf; -+ - boost_ratio = div_u64(max_perf << SCHED_CAPACITY_SHIFT, - nominal_perf); - -@@ -627,31 +695,25 @@ static int amd_get_max_freq(struct amd_cpudata *cpudata) - - static int amd_get_nominal_freq(struct amd_cpudata *cpudata) - { -- struct cppc_perf_caps cppc_perf; -+ u32 nominal_freq; - -- int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); -- if (ret) -- return ret; -+ if (quirks && quirks->nominal_freq) -+ nominal_freq = quirks->nominal_freq; -+ else -+ nominal_freq = READ_ONCE(cpudata->nominal_freq); - -- /* Switch to khz */ -- return cppc_perf.nominal_freq * 1000; -+ return nominal_freq; - } - - static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) - { -- struct cppc_perf_caps cppc_perf; - u32 lowest_nonlinear_freq, lowest_nonlinear_perf, - nominal_freq, nominal_perf; - u64 lowest_nonlinear_ratio; - -- int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); -- if (ret) -- return ret; -- -- nominal_freq = cppc_perf.nominal_freq; -+ nominal_freq = READ_ONCE(cpudata->nominal_freq); - nominal_perf = READ_ONCE(cpudata->nominal_perf); -- -- lowest_nonlinear_perf = cppc_perf.lowest_nonlinear_perf; -+ lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); - - lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, - nominal_perf); -@@ -662,48 +724,164 @@ static int amd_get_lowest_nonlinear_freq(struct amd_cpudata *cpudata) - return lowest_nonlinear_freq * 1000; + cpufreq_cpu_put(policy); } -static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) -+static int amd_pstate_boost_init(struct amd_cpudata *cpudata) ++static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) { -- struct amd_cpudata *cpudata = policy->driver_data; -+ u64 boost_val; + struct amd_cpudata *cpudata = policy->driver_data; ++ struct cppc_perf_ctrls perf_ctrls; ++ u32 highest_perf, nominal_perf, nominal_freq, max_freq; int ret; - if (!cpudata->boost_supported) { - pr_err("Boost mode is not supported by this processor or SBIOS\n"); - return -EINVAL; -+ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_K7_HWCR, &boost_val); -+ if (ret) { -+ pr_err_once("failed to read initial CPU boost state!\n"); -+ return ret; - } - -- if (state) -- policy->cpuinfo.max_freq = cpudata->max_freq; -- else -- policy->cpuinfo.max_freq = cpudata->nominal_freq; -+ amd_pstate_global_params.cpb_supported = !(boost_val & MSR_K7_HWCR_CPB_DIS); -+ amd_pstate_global_params.cpb_boost = amd_pstate_global_params.cpb_supported; - -- policy->max = policy->cpuinfo.max_freq; -+ return ret; -+} - -- ret = freq_qos_update_request(&cpudata->req[1], -- policy->cpuinfo.max_freq); -- if (ret < 0) -- return ret; -+static void amd_perf_ctl_reset(unsigned int cpu) -+{ -+ wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); -+} - -- return 0; -+/* -+ * Set amd-pstate preferred core enable can't be done directly from cpufreq callbacks -+ * due to locking, so queue the work for later. -+ */ -+static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) -+{ -+ sched_set_itmt_support(); - } -+static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); - --static void amd_pstate_boost_init(struct amd_cpudata *cpudata) -+/* -+ * Get the highest performance register value. -+ * @cpu: CPU from which to get highest performance. -+ * @highest_perf: Return address. -+ * -+ * Return: 0 for success, -EIO otherwise. 
-+ */ -+static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) - { -- u32 highest_perf, nominal_perf; -+ int ret; - -- highest_perf = READ_ONCE(cpudata->highest_perf); -- nominal_perf = READ_ONCE(cpudata->nominal_perf); -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ u64 cap1; - -- if (highest_perf <= nominal_perf) -+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); -+ if (ret) -+ return ret; -+ WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); -+ } else { -+ u64 cppc_highest_perf; -+ -+ ret = cppc_get_highest_perf(cpu, &cppc_highest_perf); -+ if (ret) -+ return ret; -+ WRITE_ONCE(*highest_perf, cppc_highest_perf); -+ } -+ -+ return (ret); -+} -+ -+#define CPPC_MAX_PERF U8_MAX -+ -+static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) -+{ -+ int ret, prio; -+ u32 highest_perf; -+ -+ ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf); -+ if (ret) - return; - -- cpudata->boost_supported = true; -- current_pstate_driver->boost_enabled = true; -+ cpudata->hw_prefcore = true; -+ /* check if CPPC preferred core feature is enabled*/ -+ if (highest_perf < CPPC_MAX_PERF) -+ prio = (int)highest_perf; -+ else { -+ pr_debug("AMD CPPC preferred core is unsupported!\n"); -+ cpudata->hw_prefcore = false; -+ return; -+ } -+ -+ if (!amd_pstate_prefcore) -+ return; -+ -+ /* -+ * The priorities can be set regardless of whether or not -+ * sched_set_itmt_support(true) has been called and it is valid to -+ * update them at any time after it has been called. -+ */ -+ sched_set_itmt_core_prio(prio, cpudata->cpu); -+ -+ schedule_work(&sched_prefcore_work); - } - --static void amd_perf_ctl_reset(unsigned int cpu) -+static void amd_pstate_update_limits(unsigned int cpu) - { -- wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); -+ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); -+ struct amd_cpudata *cpudata = policy->driver_data; -+ u32 prev_high = 0, cur_high = 0; -+ int ret; -+ bool highest_perf_changed = false; -+ -+ mutex_lock(&amd_pstate_driver_lock); -+ if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore)) -+ goto free_cpufreq_put; -+ -+ ret = amd_pstate_get_highest_perf(cpu, &cur_high); -+ if (ret) -+ goto free_cpufreq_put; -+ -+ prev_high = READ_ONCE(cpudata->prefcore_ranking); -+ if (prev_high != cur_high) { -+ highest_perf_changed = true; -+ WRITE_ONCE(cpudata->prefcore_ranking, cur_high); -+ -+ if (cur_high < CPPC_MAX_PERF) -+ sched_set_itmt_core_prio((int)cur_high, cpu); -+ } -+ -+free_cpufreq_put: -+ cpufreq_cpu_put(policy); -+ -+ if (!highest_perf_changed) -+ cpufreq_update_policy(cpu); -+ -+ mutex_unlock(&amd_pstate_driver_lock); -+} -+ -+/** -+ * Get pstate transition delay time from ACPI tables that firmware set -+ * instead of using hardcode value directly. -+ */ -+static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) -+{ -+ u32 transition_delay_ns; -+ -+ transition_delay_ns = cppc_get_transition_latency(cpu); -+ if (transition_delay_ns == CPUFREQ_ETERNAL) -+ return AMD_PSTATE_TRANSITION_DELAY; -+ -+ return transition_delay_ns / NSEC_PER_USEC; -+} -+ -+/** -+ * Get pstate transition latency value from ACPI tables that firmware set -+ * instead of using hardcode value directly. 
-+ */ -+static u32 amd_pstate_get_transition_latency(unsigned int cpu) -+{ -+ u32 transition_latency; -+ -+ transition_latency = cppc_get_transition_latency(cpu); -+ if (transition_latency == CPUFREQ_ETERNAL) -+ return AMD_PSTATE_TRANSITION_LATENCY; -+ -+ return transition_latency; - } - - static int amd_pstate_cpu_init(struct cpufreq_policy *policy) -@@ -727,24 +905,30 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - - cpudata->cpu = policy->cpu; - -+ amd_pstate_init_prefcore(cpudata); -+ - ret = amd_pstate_init_perf(cpudata); - if (ret) - goto free_cpudata1; - -+ /* initialize cpu cores boot state */ -+ amd_pstate_boost_init(cpudata); -+ - min_freq = amd_get_min_freq(cpudata); -- max_freq = amd_get_max_freq(cpudata); - nominal_freq = amd_get_nominal_freq(cpudata); -+ cpudata->nominal_freq = nominal_freq; -+ max_freq = amd_get_max_freq(cpudata); - lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); - -- if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { -- dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", -- min_freq, max_freq); -+ if (min_freq < 0 || max_freq < 0 || min_freq > max_freq || nominal_freq == 0) { -+ dev_err(dev, "min_freq(%d) or max_freq(%d) or nominal_freq(%d) is incorrect\n", -+ min_freq, max_freq, nominal_freq); - ret = -EINVAL; - goto free_cpudata1; - } - -- policy->cpuinfo.transition_latency = AMD_PSTATE_TRANSITION_LATENCY; -- policy->transition_delay_us = AMD_PSTATE_TRANSITION_DELAY; -+ policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu); -+ policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu); - - policy->min = min_freq; - policy->max = max_freq; -@@ -777,12 +961,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - cpudata->min_freq = min_freq; - cpudata->max_limit_freq = max_freq; - cpudata->min_limit_freq = min_freq; -- cpudata->nominal_freq = nominal_freq; - cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; - - policy->driver_data = cpudata; - -- amd_pstate_boost_init(cpudata); - if (!current_pstate_driver->adjust_perf) - current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; - -@@ -877,6 +1059,28 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, - return sysfs_emit(buf, "%u\n", perf); - } - -+static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy, -+ char *buf) -+{ -+ u32 perf; -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ perf = READ_ONCE(cpudata->prefcore_ranking); -+ -+ return sysfs_emit(buf, "%u\n", perf); -+} -+ -+static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, -+ char *buf) -+{ -+ bool hw_prefcore; -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ hw_prefcore = READ_ONCE(cpudata->hw_prefcore); -+ -+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(hw_prefcore)); -+} -+ - static ssize_t show_energy_performance_available_preferences( - struct cpufreq_policy *policy, char *buf) - { -@@ -1074,18 +1278,125 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, - return ret < 0 ? 
ret : count; - } - -+static ssize_t prefcore_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); -+} -+ -+static int amd_cpu_boost_update(struct amd_cpudata *cpudata, u32 on) -+{ -+ struct cpufreq_policy *policy = cpufreq_cpu_acquire(cpudata->cpu); -+ struct cppc_perf_ctrls perf_ctrls; -+ u32 highest_perf, nominal_perf; -+ int ret; -+ -+ if (!policy) -+ return -ENODATA; -+ + highest_perf = READ_ONCE(cpudata->highest_perf); + nominal_perf = READ_ONCE(cpudata->nominal_perf); ++ nominal_freq = READ_ONCE(cpudata->nominal_freq); ++ max_freq = READ_ONCE(cpudata->max_freq); + + if (boot_cpu_has(X86_FEATURE_CPPC)) { + u64 value = READ_ONCE(cpudata->cppc_req_cached); @@ -2036,156 +394,270 @@ index 07f341995439..651055df1710 100644 + WRITE_ONCE(cpudata->cppc_req_cached, value); + + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); -+ + } else { + perf_ctrls.max_perf = on ? highest_perf : nominal_perf; -+ ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); ++ ret = cppc_set_perf(cpudata->cpu, &perf_ctrls); + if (ret) { -+ pr_debug("failed to set energy perf value (%d)\n", ret); ++ cpufreq_cpu_release(policy); ++ pr_debug("Failed to set max perf on CPU:%d. ret:%d\n", ++ cpudata->cpu, ret); + return ret; + } -+ } -+ + } + +- if (state) +- policy->cpuinfo.max_freq = cpudata->max_freq; +- else +- policy->cpuinfo.max_freq = cpudata->nominal_freq * 1000; + if (on) -+ policy->cpuinfo.max_freq = cpudata->max_freq; -+ else -+ policy->cpuinfo.max_freq = cpudata->nominal_freq * 1000; -+ -+ policy->max = policy->cpuinfo.max_freq; -+ ++ policy->cpuinfo.max_freq = max_freq; ++ else if (policy->cpuinfo.max_freq > nominal_freq * 1000) ++ policy->cpuinfo.max_freq = nominal_freq * 1000; + + policy->max = policy->cpuinfo.max_freq; + +- ret = freq_qos_update_request(&cpudata->req[1], +- policy->cpuinfo.max_freq); +- if (ret < 0) +- return ret; + if (cppc_state == AMD_PSTATE_PASSIVE) { -+ ret = freq_qos_update_request(&cpudata->req[1], -+ policy->cpuinfo.max_freq); ++ ret = freq_qos_update_request(&cpudata->req[1], policy->cpuinfo.max_freq); ++ if (ret < 0) ++ pr_debug("Failed to update freq constraint: CPU%d\n", cpudata->cpu); + } -+ -+ cpufreq_cpu_release(policy); -+ + +- return 0; ++ return ret < 0 ? ret : 0; + } + +-static void amd_pstate_boost_init(struct amd_cpudata *cpudata) ++static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) + { +- u32 highest_perf, nominal_perf; ++ struct amd_cpudata *cpudata = policy->driver_data; ++ int ret; + +- highest_perf = READ_ONCE(cpudata->highest_perf); +- nominal_perf = READ_ONCE(cpudata->nominal_perf); ++ if (!cpudata->boost_supported) { ++ pr_err("Boost mode is not supported by this processor or SBIOS\n"); ++ return -EOPNOTSUPP; ++ } ++ mutex_lock(&amd_pstate_driver_lock); ++ ret = amd_pstate_cpu_boost_update(policy, state); ++ WRITE_ONCE(cpudata->boost_state, !ret ? state : false); ++ policy->boost_enabled = !ret ? 
state : false; ++ refresh_frequency_limits(policy); ++ mutex_unlock(&amd_pstate_driver_lock); + +- if (highest_perf <= nominal_perf) +- return; + return ret; +} + -+static ssize_t cpb_boost_show(struct device *dev, -+ struct device_attribute *attr, char *buf) ++static int amd_pstate_init_boost_support(struct amd_cpudata *cpudata) +{ -+ return sysfs_emit(buf, "%u\n", amd_pstate_global_params.cpb_boost); -+} ++ u64 boost_val; ++ int ret = -1; + -+static ssize_t cpb_boost_store(struct device *dev, struct device_attribute *b, -+ const char *buf, size_t count) -+{ -+ bool new_state; -+ ssize_t ret; -+ int cpu; ++ /* ++ * If platform has no CPB support or disable it, initialize current driver ++ * boost_enabled state to be false, it is not an error for cpufreq core to handle. ++ */ ++ if (!cpu_feature_enabled(X86_FEATURE_CPB)) { ++ pr_debug_once("Boost CPB capabilities not present in the processor\n"); ++ ret = 0; ++ goto exit_err; ++ } + +- cpudata->boost_supported = true; ++ /* at least one CPU supports CPB, even if others fail later on to set up */ + current_pstate_driver->boost_enabled = true; + -+ mutex_lock(&amd_pstate_driver_lock); -+ if (!amd_pstate_global_params.cpb_supported) { -+ pr_err("Boost mode is not supported by this processor or SBIOS\n"); ++ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_K7_HWCR, &boost_val); ++ if (ret) { ++ pr_err_once("failed to read initial CPU boost state!\n"); ++ ret = -EIO; ++ goto exit_err; ++ } ++ ++ if (!(boost_val & MSR_K7_HWCR_CPB_DIS)) ++ cpudata->boost_supported = true; ++ ++ return 0; ++ ++exit_err: ++ cpudata->boost_supported = false; ++ return ret; + } + + static void amd_perf_ctl_reset(unsigned int cpu) +@@ -753,7 +814,7 @@ static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) + { + int ret; + +- if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + u64 cap1; + + ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); +@@ -849,8 +910,12 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) + u32 transition_delay_ns; + + transition_delay_ns = cppc_get_transition_latency(cpu); +- if (transition_delay_ns == CPUFREQ_ETERNAL) +- return AMD_PSTATE_TRANSITION_DELAY; ++ if (transition_delay_ns == CPUFREQ_ETERNAL) { ++ if (cpu_feature_enabled(X86_FEATURE_FAST_CPPC)) ++ return AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY; ++ else ++ return AMD_PSTATE_TRANSITION_DELAY; ++ } + + return transition_delay_ns / NSEC_PER_USEC; + } +@@ -921,12 +986,30 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + WRITE_ONCE(cpudata->nominal_freq, nominal_freq); + WRITE_ONCE(cpudata->max_freq, max_freq); + ++ /** ++ * Below values need to be initialized correctly, otherwise driver will fail to load ++ * max_freq is calculated according to (nominal_freq * highest_perf)/nominal_perf ++ * lowest_nonlinear_freq is a value between [min_freq, nominal_freq] ++ * Check _CPC in ACPI table objects if any values are incorrect ++ */ ++ if (min_freq <= 0 || max_freq <= 0 || nominal_freq <= 0 || min_freq > max_freq) { ++ pr_err("min_freq(%d) or max_freq(%d) or nominal_freq(%d) value is incorrect\n", ++ min_freq, max_freq, nominal_freq * 1000); + return -EINVAL; + } + -+ ret = kstrtobool(buf, &new_state); -+ if (ret) ++ if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > nominal_freq * 1000) { ++ pr_err("lowest_nonlinear_freq(%d) value is out of range [min_freq(%d), nominal_freq(%d)]\n", ++ lowest_nonlinear_freq, min_freq, nominal_freq * 1000); + return -EINVAL; -+ -+ amd_pstate_global_params.cpb_boost = 
!!new_state; -+ -+ for_each_online_cpu(cpu) { -+ -+ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ if (!cpudata) { -+ pr_err("cpudata is NULL\n"); -+ ret = -ENODATA; -+ cpufreq_cpu_put(policy); -+ goto err_exit; -+ } -+ -+ amd_cpu_boost_update(cpudata, amd_pstate_global_params.cpb_boost); -+ refresh_frequency_limits(policy); -+ cpufreq_cpu_put(policy); + } + -+err_exit: -+ mutex_unlock(&amd_pstate_driver_lock); -+ return ret < 0 ? ret : count; -+} -+ - cpufreq_freq_attr_ro(amd_pstate_max_freq); - cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); + return 0; + } - cpufreq_freq_attr_ro(amd_pstate_highest_perf); -+cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); -+cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); - cpufreq_freq_attr_rw(energy_performance_preference); - cpufreq_freq_attr_ro(energy_performance_available_preferences); - static DEVICE_ATTR_RW(status); -+static DEVICE_ATTR_RO(prefcore); -+static DEVICE_ATTR_RW(cpb_boost); + static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + { +- int min_freq, max_freq, nominal_freq, ret; ++ int min_freq, max_freq, ret; + struct device *dev; + struct amd_cpudata *cpudata; - static struct freq_attr *amd_pstate_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_prefcore_ranking, -+ &amd_pstate_hw_prefcore, - NULL, - }; - -@@ -1093,6 +1404,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_prefcore_ranking, -+ &amd_pstate_hw_prefcore, - &energy_performance_preference, - &energy_performance_available_preferences, - NULL, -@@ -1100,6 +1413,8 @@ static struct freq_attr *amd_pstate_epp_attr[] = { - - static struct attribute *pstate_global_attributes[] = { - &dev_attr_status.attr, -+ &dev_attr_prefcore.attr, -+ &dev_attr_cpb_boost.attr, - NULL - }; - -@@ -1151,17 +1466,23 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) - cpudata->cpu = policy->cpu; - cpudata->epp_policy = 0; - -+ amd_pstate_init_prefcore(cpudata); -+ - ret = amd_pstate_init_perf(cpudata); +@@ -955,18 +1038,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) if (ret) goto free_cpudata1; -+ /* initialize cpu cores boot state */ -+ amd_pstate_boost_init(cpudata); ++ ret = amd_pstate_init_boost_support(cpudata); ++ if (ret) ++ goto free_cpudata1; + - min_freq = amd_get_min_freq(cpudata); -- max_freq = amd_get_max_freq(cpudata); - nominal_freq = amd_get_nominal_freq(cpudata); -+ cpudata->nominal_freq = nominal_freq; -+ max_freq = amd_get_max_freq(cpudata); - lowest_nonlinear_freq = amd_get_lowest_nonlinear_freq(cpudata); -- if (min_freq < 0 || max_freq < 0 || min_freq > max_freq) { -- dev_err(dev, "min_freq(%d) or max_freq(%d) value is incorrect\n", -- min_freq, max_freq); -+ if (min_freq < 0 || max_freq < 0 || min_freq > max_freq || nominal_freq == 0) { -+ dev_err(dev, "min_freq(%d) or max_freq(%d) or nominal_freq(%d) is incorrect\n", -+ min_freq, max_freq, nominal_freq); - ret = -EINVAL; - goto free_cpudata1; - } -@@ -1174,7 +1495,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) - /* Initial processor data capability frequencies */ - cpudata->max_freq = max_freq; - cpudata->min_freq = min_freq; -- cpudata->nominal_freq = nominal_freq; - cpudata->lowest_nonlinear_freq = lowest_nonlinear_freq; + min_freq = READ_ONCE(cpudata->min_freq); + max_freq = READ_ONCE(cpudata->max_freq); +- 
nominal_freq = READ_ONCE(cpudata->nominal_freq); +- +- if (min_freq <= 0 || max_freq <= 0 || +- nominal_freq <= 0 || min_freq > max_freq) { +- dev_err(dev, +- "min_freq(%d) or max_freq(%d) or nominal_freq (%d) value is incorrect, check _CPC in ACPI tables\n", +- min_freq, max_freq, nominal_freq); +- ret = -EINVAL; +- goto free_cpudata1; +- } + + policy->cpuinfo.transition_latency = amd_pstate_get_transition_latency(policy->cpu); + policy->transition_delay_us = amd_pstate_get_transition_delay_us(policy->cpu); +@@ -977,10 +1054,12 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + policy->cpuinfo.min_freq = min_freq; + policy->cpuinfo.max_freq = max_freq; + ++ policy->boost_enabled = READ_ONCE(cpudata->boost_supported); ++ + /* It will be updated by governor */ + policy->cur = policy->cpuinfo.min_freq; + +- if (boot_cpu_has(X86_FEATURE_CPPC)) ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) + policy->fast_switch_possible = true; + + ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], +@@ -1002,7 +1081,6 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) policy->driver_data = cpudata; -@@ -1205,7 +1525,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + +- amd_pstate_boost_init(cpudata); + if (!current_pstate_driver->adjust_perf) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + +@@ -1213,7 +1291,7 @@ static int amd_pstate_change_mode_without_dvr_change(int mode) + + cppc_state = mode; + +- if (boot_cpu_has(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) ++ if (cpu_feature_enabled(X86_FEATURE_CPPC) || cppc_state == AMD_PSTATE_ACTIVE) + return 0; + + for_each_present_cpu(cpu) { +@@ -1386,7 +1464,7 @@ static bool amd_pstate_acpi_pm_profile_undefined(void) + + static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + { +- int min_freq, max_freq, nominal_freq, ret; ++ int min_freq, max_freq, ret; + struct amd_cpudata *cpudata; + struct device *dev; + u64 value; +@@ -1417,17 +1495,12 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + if (ret) + goto free_cpudata1; + ++ ret = amd_pstate_init_boost_support(cpudata); ++ if (ret) ++ goto free_cpudata1; ++ + min_freq = READ_ONCE(cpudata->min_freq); + max_freq = READ_ONCE(cpudata->max_freq); +- nominal_freq = READ_ONCE(cpudata->nominal_freq); +- if (min_freq <= 0 || max_freq <= 0 || +- nominal_freq <= 0 || min_freq > max_freq) { +- dev_err(dev, +- "min_freq(%d) or max_freq(%d) or nominal_freq(%d) value is incorrect, check _CPC in ACPI tables\n", +- min_freq, max_freq, nominal_freq); +- ret = -EINVAL; +- goto free_cpudata1; +- } + + policy->cpuinfo.min_freq = min_freq; + policy->cpuinfo.max_freq = max_freq; +@@ -1436,11 +1509,13 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + + policy->driver_data = cpudata; + +- cpudata->epp_cached = amd_pstate_get_epp(cpudata, 0); ++ cpudata->epp_cached = cpudata->epp_default = amd_pstate_get_epp(cpudata, 0); + + policy->min = policy->cpuinfo.min_freq; + policy->max = policy->cpuinfo.max_freq; + ++ policy->boost_enabled = READ_ONCE(cpudata->boost_supported); ++ + /* + * Set the policy to provide a valid fallback value in case + * the default cpufreq governor is neither powersave nor performance. 
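Note (not part of the patch): amd_pstate_init_freq() above now validates the derived frequencies in one place, using the relationship spelled out in the new comment: max_freq follows (nominal_freq * highest_perf) / nominal_perf, and lowest_nonlinear_freq must fall between min_freq and nominal_freq (the driver keeps nominal_freq in MHz, hence the * 1000 in the checks and in the updated self-test). A small stand-alone sketch of the same arithmetic and range checks, with made-up example values:

/* freq_check.c - illustrative only; values are hypothetical and in kHz,
 * mirroring the checks added to amd_pstate_init_freq() above.
 */
#include <stdio.h>

int main(void)
{
	long long nominal_freq = 3000000;		/* kHz (example) */
	long long nominal_perf = 120;			/* example */
	long long highest_perf = 196;			/* example */
	long long min_freq = 400000;			/* kHz (example) */
	long long lowest_nonlinear_freq = 1500000;	/* kHz (example) */

	/* max_freq per the patch comment: (nominal_freq * highest_perf) / nominal_perf */
	long long max_freq = nominal_freq * highest_perf / nominal_perf;

	if (min_freq <= 0 || max_freq <= 0 || nominal_freq <= 0 || min_freq > max_freq) {
		fprintf(stderr, "min/max/nominal frequency values are inconsistent\n");
		return 1;
	}

	if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > nominal_freq) {
		fprintf(stderr, "lowest_nonlinear_freq is outside [min_freq, nominal_freq]\n");
		return 1;
	}

	printf("max_freq = %lld kHz\n", max_freq);
	return 0;
}

Failing either check makes the driver refuse to load, which is why the new comment points at the _CPC objects in the ACPI tables as the place to look when it fires.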
+@@ -1451,7 +1526,7 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + else + policy->policy = CPUFREQ_POLICY_POWERSAVE; + +- if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); + if (ret) + return ret; +@@ -1462,7 +1537,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) return ret; WRITE_ONCE(cpudata->cppc_cap1_cached, value); } @@ -2193,20 +665,16 @@ index 07f341995439..651055df1710 100644 return 0; -@@ -1232,6 +1551,12 @@ static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy) - max_limit_perf = div_u64(policy->max * cpudata->highest_perf, cpudata->max_freq); - min_limit_perf = div_u64(policy->min * cpudata->highest_perf, cpudata->max_freq); +@@ -1541,7 +1615,7 @@ static void amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + epp = 0; -+ if (min_limit_perf < min_perf) -+ min_limit_perf = min_perf; -+ -+ if (max_limit_perf < min_limit_perf) -+ max_limit_perf = min_limit_perf; -+ - WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); - WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); - -@@ -1294,6 +1619,12 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) + /* Set initial EPP value */ +- if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + value &= ~GENMASK_ULL(31, 24); + value |= (u64)epp << 24; + } +@@ -1564,6 +1638,12 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) amd_pstate_epp_update_limit(policy); @@ -2219,174 +687,223 @@ index 07f341995439..651055df1710 100644 return 0; } -@@ -1431,7 +1762,7 @@ static struct cpufreq_driver amd_pstate_driver = { - .exit = amd_pstate_cpu_exit, - .suspend = amd_pstate_cpu_suspend, - .resume = amd_pstate_cpu_resume, -- .set_boost = amd_pstate_set_boost, -+ .update_limits = amd_pstate_update_limits, - .name = "amd-pstate", - .attr = amd_pstate_attr, - }; -@@ -1446,6 +1777,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { - .online = amd_pstate_epp_cpu_online, +@@ -1580,7 +1660,7 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + value = READ_ONCE(cpudata->cppc_req_cached); + max_perf = READ_ONCE(cpudata->highest_perf); + +- if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.max_perf = max_perf; +@@ -1614,7 +1694,7 @@ static void amd_pstate_epp_offline(struct cpufreq_policy *policy) + value = READ_ONCE(cpudata->cppc_req_cached); + + mutex_lock(&amd_pstate_limits_lock); +- if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + cpudata->epp_policy = CPUFREQ_POLICY_UNKNOWN; + + /* Set max perf same as min perf */ +@@ -1718,6 +1798,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { .suspend = amd_pstate_epp_suspend, .resume = amd_pstate_epp_resume, -+ .update_limits = amd_pstate_update_limits, + .update_limits = amd_pstate_update_limits, ++ .set_boost = amd_pstate_set_boost, .name = "amd-pstate-epp", .attr = amd_pstate_epp_attr, }; -@@ -1486,6 +1818,11 @@ static int __init amd_pstate_init(void) - if (cpufreq_get_current_driver()) - return -EEXIST; - -+ quirks = NULL; -+ -+ /* check if this machine need CPPC quirks */ -+ dmi_check_system(amd_pstate_quirks_table); -+ - switch (cppc_state) { - case AMD_PSTATE_UNDEFINED: - /* Disable on the following configs by default: -@@ -1567,7 +1904,17 @@ static int __init amd_pstate_param(char *str) 
- - return amd_pstate_set_driver(mode_idx); +@@ -1741,6 +1822,46 @@ static int __init amd_pstate_set_driver(int mode_idx) + return -EINVAL; } -+ -+static int __init amd_prefcore_param(char *str) -+{ -+ if (!strcmp(str, "disable")) -+ amd_pstate_prefcore = false; -+ -+ return 0; -+} -+ - early_param("amd_pstate", amd_pstate_param); -+early_param("amd_prefcore", amd_prefcore_param); - MODULE_AUTHOR("Huang Rui "); - MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); -diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index 3a0995f8bce8..930b6afba6f4 100644 ---- a/include/acpi/cppc_acpi.h -+++ b/include/acpi/cppc_acpi.h -@@ -139,6 +139,7 @@ struct cppc_cpudata { - #ifdef CONFIG_ACPI_CPPC_LIB - extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); - extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); -+extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf); - extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); - extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); - extern int cppc_set_enable(int cpu, bool enable); -@@ -167,6 +168,10 @@ static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) - { - return -ENOTSUPP; - } -+static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) -+{ -+ return -ENOTSUPP; -+} - static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) - { - return -ENOTSUPP; -diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 6ad02ad9c7b4..e89cf1249715 100644 ---- a/include/linux/amd-pstate.h -+++ b/include/linux/amd-pstate.h -@@ -39,11 +39,16 @@ struct amd_aperf_mperf { - * @cppc_req_cached: cached performance request hints - * @highest_perf: the maximum performance an individual processor may reach, - * assuming ideal conditions -+ * For platforms that do not support the preferred core feature, the -+ * highest_pef may be configured with 166 or 255, to avoid max frequency -+ * calculated wrongly. we take the fixed value as the highest_perf. - * @nominal_perf: the maximum sustained performance level of the processor, - * assuming ideal operating conditions - * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power - * savings are achieved - * @lowest_perf: the absolute lowest performance level of the processor -+ * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher -+ * priority. - * @max_freq: the frequency that mapped to highest_perf - * @min_freq: the frequency that mapped to lowest_perf - * @nominal_freq: the frequency that mapped to nominal_perf -@@ -51,7 +56,9 @@ struct amd_aperf_mperf { - * @cur: Difference of Aperf/Mperf/tsc count between last and current sample - * @prev: Last Aperf/Mperf/tsc count value read from register - * @freq: current cpu frequency value -- * @boost_supported: check whether the Processor or SBIOS supports boost mode -+ * @hw_prefcore: check whether HW supports preferred core featue. -+ * Only when hw_prefcore and early prefcore param are true, -+ * AMD P-State driver supports preferred core featue. 
- * @epp_policy: Last saved policy used to set energy-performance preference - * @epp_cached: Cached CPPC energy-performance preference value - * @policy: Cpufreq policy value -@@ -70,6 +77,7 @@ struct amd_cpudata { - u32 nominal_perf; - u32 lowest_nonlinear_perf; - u32 lowest_perf; -+ u32 prefcore_ranking; - u32 min_limit_perf; - u32 max_limit_perf; - u32 min_limit_freq; -@@ -79,12 +87,13 @@ struct amd_cpudata { - u32 min_freq; - u32 nominal_freq; - u32 lowest_nonlinear_freq; -+ u32 lowest_freq; - - struct amd_aperf_mperf cur; - struct amd_aperf_mperf prev; - - u64 freq; -- bool boost_supported; -+ bool hw_prefcore; - - /* EPP feature related attributes*/ - s16 epp_policy; -@@ -114,4 +123,23 @@ static const char * const amd_pstate_mode_string[] = { - [AMD_PSTATE_GUIDED] = "guided", - NULL, - }; -+ -+struct quirk_entry { -+ u32 nominal_freq; -+ u32 lowest_freq; -+}; -+ +/** -+ * struct amd_pstate_global_params - Global parameters, mostly tunable via sysfs. -+ * @cpb_boost: Whether or not to use boost CPU P-states. -+ * @cpb_supported: Whether or not CPU boost P-states are available -+ * based on the MSR_K7_HWCR bit[25] state ++ * CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F. ++ * show the debug message that helps to check if the CPU has CPPC support for loading issue. + */ -+struct amd_pstate_global_params { -+ bool cpb_boost; -+ bool cpb_supported; -+}; ++static bool amd_cppc_supported(void) ++{ ++ struct cpuinfo_x86 *c = &cpu_data(0); ++ bool warn = false; + -+extern struct amd_pstate_global_params amd_pstate_global_params; ++ if ((boot_cpu_data.x86 == 0x17) && (boot_cpu_data.x86_model < 0x30)) { ++ pr_debug_once("CPPC feature is not supported by the processor\n"); ++ return false; ++ } + - #endif /* _LINUX_AMD_PSTATE_H */ -diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h -index 320fab7d2e94..3129411fa978 100644 ---- a/include/linux/cpufreq.h -+++ b/include/linux/cpufreq.h -@@ -263,6 +263,7 @@ static inline bool cpufreq_supports_freq_invariance(void) - return false; - } - static inline void disable_cpufreq(void) { } -+static inline void cpufreq_update_limits(unsigned int cpu) { } - #endif ++ /* ++ * If the CPPC feature is disabled in the BIOS for processors that support MSR-based CPPC, ++ * the AMD Pstate driver may not function correctly. ++ * Check the CPPC flag and display a warning message if the platform supports CPPC. ++ * Note: below checking code will not abort the driver registeration process because of ++ * the code is added for debugging purposes. 
++ */ ++ if (!cpu_feature_enabled(X86_FEATURE_CPPC)) { ++ if (cpu_feature_enabled(X86_FEATURE_ZEN1) || cpu_feature_enabled(X86_FEATURE_ZEN2)) { ++ if (c->x86_model > 0x60 && c->x86_model < 0xaf) ++ warn = true; ++ } else if (cpu_feature_enabled(X86_FEATURE_ZEN3) || cpu_feature_enabled(X86_FEATURE_ZEN4)) { ++ if ((c->x86_model > 0x10 && c->x86_model < 0x1F) || ++ (c->x86_model > 0x40 && c->x86_model < 0xaf)) ++ warn = true; ++ } else if (cpu_feature_enabled(X86_FEATURE_ZEN5)) { ++ warn = true; ++ } ++ } ++ ++ if (warn) ++ pr_warn_once("The CPPC feature is supported but currently disabled by the BIOS.\n" ++ "Please enable it if your BIOS has the CPPC option.\n"); ++ return true; ++} ++ + static int __init amd_pstate_init(void) + { + struct device *dev_root; +@@ -1749,6 +1870,11 @@ static int __init amd_pstate_init(void) + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return -ENODEV; - #ifdef CONFIG_CPU_FREQ_STAT ++ /* show debug message only if CPPC is not supported */ ++ if (!amd_cppc_supported()) ++ return -EOPNOTSUPP; ++ ++ /* show warning message when BIOS broken or ACPI disabled */ + if (!acpi_cpc_valid()) { + pr_warn_once("the _CPC object is not present in SBIOS or ACPI disabled\n"); + return -ENODEV; +@@ -1763,35 +1889,43 @@ static int __init amd_pstate_init(void) + /* check if this machine need CPPC quirks */ + dmi_check_system(amd_pstate_quirks_table); + +- switch (cppc_state) { +- case AMD_PSTATE_UNDEFINED: ++ /* ++ * determine the driver mode from the command line or kernel config. ++ * If no command line input is provided, cppc_state will be AMD_PSTATE_UNDEFINED. ++ * command line options will override the kernel config settings. ++ */ ++ ++ if (cppc_state == AMD_PSTATE_UNDEFINED) { + /* Disable on the following configs by default: + * 1. Undefined platforms + * 2. Server platforms +- * 3. 
Shared memory designs + */ + if (amd_pstate_acpi_pm_profile_undefined() || +- amd_pstate_acpi_pm_profile_server() || +- !boot_cpu_has(X86_FEATURE_CPPC)) { ++ amd_pstate_acpi_pm_profile_server()) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; + } +- ret = amd_pstate_set_driver(CONFIG_X86_AMD_PSTATE_DEFAULT_MODE); +- if (ret) +- return ret; +- break; ++ /* get driver mode from kernel config option [1:4] */ ++ cppc_state = CONFIG_X86_AMD_PSTATE_DEFAULT_MODE; ++ } ++ ++ switch (cppc_state) { + case AMD_PSTATE_DISABLE: ++ pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; + case AMD_PSTATE_PASSIVE: + case AMD_PSTATE_ACTIVE: + case AMD_PSTATE_GUIDED: ++ ret = amd_pstate_set_driver(cppc_state); ++ if (ret) ++ return ret; + break; + default: + return -EINVAL; + } + + /* capability check */ +- if (boot_cpu_has(X86_FEATURE_CPPC)) { ++ if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); + if (cppc_state != AMD_PSTATE_ACTIVE) + current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; +@@ -1805,13 +1939,15 @@ static int __init amd_pstate_init(void) + /* enable amd pstate feature */ + ret = amd_pstate_enable(true); + if (ret) { +- pr_err("failed to enable with return %d\n", ret); ++ pr_err("failed to enable driver mode(%d)\n", cppc_state); + return ret; + } + + ret = cpufreq_register_driver(current_pstate_driver); +- if (ret) ++ if (ret) { + pr_err("failed to register with return %d\n", ret); ++ goto disable_driver; ++ } + + dev_root = bus_get_dev_root(&cpu_subsys); + if (dev_root) { +@@ -1827,6 +1963,8 @@ static int __init amd_pstate_init(void) + + global_attr_free: + cpufreq_unregister_driver(current_pstate_driver); ++disable_driver: ++ amd_pstate_enable(false); + return ret; + } + device_initcall(amd_pstate_init); +diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h +index e6a28e7f4dbf..cc8bb2bc325a 100644 +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -99,6 +99,8 @@ struct amd_cpudata { + u32 policy; + u64 cppc_cap1_cached; + bool suspended; ++ s16 epp_default; ++ bool boost_state; + }; + + #endif /* _LINUX_AMD_PSTATE_H */ +diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c +index 9e5060b27864..270ea04fb616 100644 +--- a/drivers/cpufreq/cpufreq.c ++++ b/drivers/cpufreq/cpufreq.c +@@ -614,10 +614,9 @@ static ssize_t show_boost(struct kobject *kobj, + static ssize_t store_boost(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) + { +- int ret, enable; ++ bool enable; + +- ret = sscanf(buf, "%d", &enable); +- if (ret != 1 || enable < 0 || enable > 1) ++ if (kstrtobool(buf, &enable)) + return -EINVAL; + + if (cpufreq_boost_trigger_state(enable)) { +@@ -641,10 +640,10 @@ static ssize_t show_local_boost(struct cpufreq_policy *policy, char *buf) + static ssize_t store_local_boost(struct cpufreq_policy *policy, + const char *buf, size_t count) + { +- int ret, enable; ++ int ret; ++ bool enable; + +- ret = kstrtoint(buf, 10, &enable); +- if (ret || enable < 0 || enable > 1) ++ if (kstrtobool(buf, &enable)) + return -EINVAL; + + if (!cpufreq_driver->boost_enabled) -- -2.44.0 +2.46.0.rc1 -From 7f2e4860d7405f71337e99ea74b84ebcd2c3b90c Mon Sep 17 00:00:00 2001 +From fdecce0ee8a06092cd381604a8f4f26ef0c9561a Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 3 Apr 2024 17:06:31 +0200 -Subject: [PATCH 3/8] bbr3 +Date: Mon, 15 Jul 2024 13:23:19 +0200 +Subject: 
[PATCH 02/11] bbr3 Signed-off-by: Peter Jung --- @@ -2409,7 +926,7 @@ Signed-off-by: Peter Jung 16 files changed, 1940 insertions(+), 553 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h -index a1c47a6d69b0..9e63e5580dc5 100644 +index 6a5e08b937b3..27aab715490e 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -369,7 +369,9 @@ struct tcp_sock { @@ -2421,10 +938,10 @@ index a1c47a6d69b0..9e63e5580dc5 100644 + tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */ + unused:2; u8 thin_lto : 1,/* Use linear timeouts for thin streams */ - recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index 9ab4bf704e86..f681cfdb2164 100644 +index c0deaafebfdc..d53f042d936e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -137,8 +137,8 @@ struct inet_connection_sock { @@ -2439,10 +956,10 @@ index 9ab4bf704e86..f681cfdb2164 100644 #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h -index f6eba9652d01..3998a5f145ad 100644 +index 060e95b331a2..953244eefe7d 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h -@@ -381,6 +381,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) +@@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 @@ -2451,7 +968,7 @@ index f6eba9652d01..3998a5f145ad 100644 enum tcp_tw_status { TCP_TW_SUCCESS = 0, -@@ -737,6 +739,15 @@ static inline void tcp_fast_path_check(struct sock *sk) +@@ -778,6 +780,15 @@ static inline void tcp_fast_path_check(struct sock *sk) u32 tcp_delack_max(const struct sock *sk); @@ -2467,7 +984,7 @@ index f6eba9652d01..3998a5f145ad 100644 /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(const struct sock *sk) { -@@ -842,6 +853,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) +@@ -883,6 +894,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } @@ -2479,7 +996,7 @@ index f6eba9652d01..3998a5f145ad 100644 /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { -@@ -930,9 +946,14 @@ struct tcp_skb_cb { +@@ -972,9 +988,14 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -2496,7 +1013,7 @@ index f6eba9652d01..3998a5f145ad 100644 } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; -@@ -1036,6 +1057,7 @@ enum tcp_ca_event { +@@ -1078,6 +1099,7 @@ enum tcp_ca_event { CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ @@ -2504,7 +1021,7 @@ index f6eba9652d01..3998a5f145ad 100644 }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ -@@ -1058,7 +1080,11 @@ enum tcp_ca_ack_event_flags { +@@ -1100,7 +1122,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 @@ -2517,7 +1034,7 @@ index f6eba9652d01..3998a5f145ad 100644 union tcp_cc_info; -@@ -1078,10 +1104,13 @@ struct ack_sample { +@@ -1120,10 +1146,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp 
for interval */ @@ -2532,7 +1049,7 @@ index f6eba9652d01..3998a5f145ad 100644 long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ -@@ -1092,7 +1121,9 @@ struct rate_sample { +@@ -1134,7 +1163,9 @@ struct rate_sample { u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ @@ -2542,7 +1059,7 @@ index f6eba9652d01..3998a5f145ad 100644 }; struct tcp_congestion_ops { -@@ -1116,8 +1147,11 @@ struct tcp_congestion_ops { +@@ -1158,8 +1189,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); @@ -2556,7 +1073,7 @@ index f6eba9652d01..3998a5f145ad 100644 /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) -@@ -1183,6 +1217,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) +@@ -1225,6 +1259,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif @@ -2571,7 +1088,7 @@ index f6eba9652d01..3998a5f145ad 100644 static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1202,6 +1244,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) +@@ -1244,6 +1286,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ @@ -2579,7 +1096,7 @@ index f6eba9652d01..3998a5f145ad 100644 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); -@@ -1214,6 +1257,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) +@@ -1256,6 +1299,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } @@ -2601,7 +1118,7 @@ index f6eba9652d01..3998a5f145ad 100644 /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. 
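Note (not part of the patch): the extended rate_sample above carries additional delivery and loss accounting that BBRv3 consumes through the reworked cong_control path, but the bandwidth estimate itself is still fundamentally "packets delivered over the sampling interval". A toy user-space sketch of that delivery-rate idea (names and structure invented for illustration; the kernel uses fixed-point arithmetic and windowed max filters rather than a double):

/* toy_rate_sample.c - illustrative only. */
#include <stdbool.h>
#include <stdio.h>

struct toy_rate_sample {
	unsigned int delivered;	/* packets newly (S)ACKed in this sample */
	long interval_us;	/* time it took to deliver them */
	bool is_app_limited;	/* sender ran out of data during the sample? */
};

/* Bandwidth sample in packets per second, or 0 if the sample is unusable. */
static double toy_bw_sample(const struct toy_rate_sample *rs)
{
	if (rs->interval_us <= 0)
		return 0.0;
	return rs->delivered * 1e6 / rs->interval_us;
}

int main(void)
{
	struct toy_rate_sample rs = {
		.delivered = 50, .interval_us = 10000, .is_app_limited = false,
	};

	/* An app-limited sample can understate path capacity, so a BBR-like
	 * estimator only lets it lower, never raise, the bandwidth estimate. */
	printf("bw sample: %.0f pkts/s%s\n", toy_bw_sample(&rs),
	       rs.is_app_limited ? " (app-limited)" : "");
	return 0;
}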
-@@ -2373,7 +2431,7 @@ struct tcp_plb_state { +@@ -2418,7 +2476,7 @@ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ @@ -2665,10 +1182,10 @@ index 3b687d20c9ed..a7c30c243b54 100644 struct rta_session { __u8 proto; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h -index c07e9f90c084..5c88336ced60 100644 +index dbf896f3146c..4702cd2f1ffc 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h -@@ -176,6 +176,7 @@ enum tcp_fastopen_client_fail { +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ @@ -2709,10 +1226,10 @@ index 8e94ed7c56a0..50dc9970cad2 100644 choice prompt "Default TCP congestion control" diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c -index ae8b15e6896f..beb040e80b6f 100644 +index 18227757ec0c..f180befc28bd 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c -@@ -296,11 +296,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp +@@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp { } @@ -2726,10 +1243,10 @@ index ae8b15e6896f..beb040e80b6f 100644 +{ +} + - static void bpf_tcp_ca_cong_control(struct sock *sk, const struct rate_sample *rs) + static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) { - } -@@ -330,7 +334,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { +@@ -340,7 +344,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { .cwnd_event = bpf_tcp_ca_cwnd_event, .in_ack_event = bpf_tcp_ca_in_ack_event, .pkts_acked = bpf_tcp_ca_pkts_acked, @@ -2740,10 +1257,10 @@ index ae8b15e6896f..beb040e80b6f 100644 .undo_cwnd = bpf_tcp_ca_undo_cwnd, .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index a4f418592314..58469fe5195e 100644 +index e6790ea74877..b63e27eba536 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c -@@ -3089,6 +3089,7 @@ int tcp_disconnect(struct sock *sk, int flags) +@@ -3120,6 +3120,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; @@ -2751,7 +1268,7 @@ index a4f418592314..58469fe5195e 100644 /* Clean up fastopen related fields */ -@@ -3815,6 +3816,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) +@@ -3846,6 +3847,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; @@ -2761,7 +1278,7 @@ index a4f418592314..58469fe5195e 100644 info->tcpi_options |= TCPI_OPT_SYN_DATA; if (tp->tcp_usec_ts) diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c -index 22358032dd48..cd6bef71bf4c 100644 +index 760941e55153..a180fa648d5e 100644 --- a/net/ipv4/tcp_bbr.c +++ b/net/ipv4/tcp_bbr.c @@ -1,18 +1,19 @@ @@ -3934,7 +2451,7 @@ index 22358032dd48..cd6bef71bf4c 100644 + return 3; } --__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) +/* Incorporate a new bw sample into the current window of our max filter. 
*/ +static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) { @@ -5036,7 +3553,7 @@ index 22358032dd48..cd6bef71bf4c 100644 + return false; +} + -+__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); @@ -5378,7 +3895,7 @@ index 22358032dd48..cd6bef71bf4c 100644 .get_info = bbr_get_info, .set_state = bbr_set_state, }; -@@ -1161,10 +2361,11 @@ BTF_SET8_START(tcp_bbr_check_kfunc_ids) +@@ -1159,10 +2359,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) BTF_ID_FLAGS(func, bbr_init) BTF_ID_FLAGS(func, bbr_main) BTF_ID_FLAGS(func, bbr_sndbuf_expand) @@ -5389,9 +3906,9 @@ index 22358032dd48..cd6bef71bf4c 100644 -BTF_ID_FLAGS(func, bbr_min_tso_segs) +BTF_ID_FLAGS(func, bbr_tso_segs) BTF_ID_FLAGS(func, bbr_set_state) - #endif - #endif -@@ -1199,5 +2400,12 @@ MODULE_AUTHOR("Van Jacobson "); + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2396,12 @@ MODULE_AUTHOR("Van Jacobson "); MODULE_AUTHOR("Neal Cardwell "); MODULE_AUTHOR("Yuchung Cheng "); MODULE_AUTHOR("Soheil Hassas Yeganeh "); @@ -5405,10 +3922,10 @@ index 22358032dd48..cd6bef71bf4c 100644 MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index 1b34050a7538..66d40449b3f4 100644 +index 28ffcfbeef14..7b13915ba288 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c -@@ -241,6 +241,7 @@ void tcp_init_congestion_control(struct sock *sk) +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) struct inet_connection_sock *icsk = inet_csk(sk); tcp_sk(sk)->prior_ssthresh = 0; @@ -5417,10 +3934,10 @@ index 1b34050a7538..66d40449b3f4 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index df7b13f0e5e0..8415aa41524e 100644 +index 38da23f991d6..37d2b393088a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c -@@ -364,7 +364,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +@@ -365,7 +365,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: @@ -5429,7 +3946,7 @@ index df7b13f0e5e0..8415aa41524e 100644 tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { -@@ -375,7 +375,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +@@ -376,7 +376,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tp->ecn_flags |= TCP_ECN_SEEN; break; default: @@ -5438,7 +3955,7 @@ index df7b13f0e5e0..8415aa41524e 100644 tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; -@@ -1112,7 +1112,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) +@@ -1115,7 +1115,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { @@ -5451,7 +3968,7 @@ index df7b13f0e5e0..8415aa41524e 100644 } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) -@@ -1493,6 +1498,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, +@@ -1496,6 +1501,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); 
@@ -5469,7 +3986,7 @@ index df7b13f0e5e0..8415aa41524e 100644 /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep -@@ -3761,7 +3777,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +@@ -3790,7 +3806,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) /* This routine deals with acks during a TLP episode and ends an episode by * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ @@ -5479,7 +3996,7 @@ index df7b13f0e5e0..8415aa41524e 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -3778,6 +3795,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +@@ -3807,6 +3824,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) /* ACK advances: there was a loss, so reduce cwnd. Reset * tlp_high_seq in tcp_init_cwnd_reduction() */ @@ -5487,7 +4004,7 @@ index df7b13f0e5e0..8415aa41524e 100644 tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); -@@ -3788,6 +3806,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +@@ -3817,6 +3835,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) FLAG_NOT_DUP | FLAG_DATA_SACKED))) { /* Pure dupack: original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; @@ -5499,7 +4016,7 @@ index df7b13f0e5e0..8415aa41524e 100644 } } -@@ -3896,6 +3919,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3925,6 +3948,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); @@ -5507,7 +4024,7 @@ index df7b13f0e5e0..8415aa41524e 100644 /* ts_recent update must be made after we are sure that the packet * is in window. -@@ -3970,7 +3994,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3999,7 +4023,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); if (tp->tlp_high_seq) @@ -5516,7 +4033,7 @@ index df7b13f0e5e0..8415aa41524e 100644 if (tcp_ack_is_dubious(sk, flag)) { if (!(flag & (FLAG_SND_UNA_ADVANCED | -@@ -3994,6 +4018,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -4023,6 +4047,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); @@ -5524,7 +4041,7 @@ index df7b13f0e5e0..8415aa41524e 100644 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); -@@ -4013,7 +4038,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -4042,7 +4067,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_ack_probe(sk); if (tp->tlp_high_seq) @@ -5533,7 +4050,7 @@ index df7b13f0e5e0..8415aa41524e 100644 return 1; old_ack: -@@ -5664,13 +5689,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +@@ -5704,13 +5729,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... 
*/ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -5551,7 +4068,7 @@ index df7b13f0e5e0..8415aa41524e 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index 0ecc7311dc6c..82622782486a 100644 +index 538c06f95918..e4c861c071ae 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -460,6 +460,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -5564,10 +4081,10 @@ index 0ecc7311dc6c..82622782486a 100644 const struct tcp_congestion_ops *ca; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index e3167ad96567..08fe7a626be1 100644 +index 95618d0e78e4..3f4bdd2b6476 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c -@@ -332,10 +332,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) +@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || tcp_ca_needs_ecn(sk) || bpf_needs_ecn; @@ -5579,7 +4096,7 @@ index e3167ad96567..08fe7a626be1 100644 if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) use_ecn = true; } -@@ -347,6 +346,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) +@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) tp->ecn_flags = TCP_ECN_OK; if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) INET_ECN_xmit(sk); @@ -5589,7 +4106,7 @@ index e3167ad96567..08fe7a626be1 100644 } } -@@ -384,7 +386,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, +@@ -388,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, th->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; } @@ -5599,7 +4116,7 @@ index e3167ad96567..08fe7a626be1 100644 /* ACK or retransmitted segment: clear ECT|CE */ INET_ECN_dontxmit(sk); } -@@ -1593,7 +1596,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1601,7 +1604,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -5608,7 +4125,7 @@ index e3167ad96567..08fe7a626be1 100644 long limit; int nlen; u8 flags; -@@ -1668,6 +1671,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, +@@ -1676,6 +1679,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, if (diff) tcp_adjust_pcount(sk, skb, diff); @@ -5639,7 +4156,7 @@ index e3167ad96567..08fe7a626be1 100644 } /* Link BUFF into the send queue. 
*/ -@@ -2025,13 +2052,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, +@@ -2033,13 +2060,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; @@ -5658,7 +4175,7 @@ index e3167ad96567..08fe7a626be1 100644 return min_t(u32, tso_segs, sk->sk_gso_max_segs); } -@@ -2731,6 +2757,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, +@@ -2767,6 +2793,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); @@ -5666,7 +4183,7 @@ index e3167ad96567..08fe7a626be1 100644 goto repair; /* Skip network transmission */ } -@@ -2944,6 +2971,7 @@ void tcp_send_loss_probe(struct sock *sk) +@@ -2981,6 +3008,7 @@ void tcp_send_loss_probe(struct sock *sk) if (WARN_ON(!skb || !tcp_skb_pcount(skb))) goto rearm_timer; @@ -5755,10 +4272,10 @@ index a8f6d9d06f2e..8737f2134648 100644 rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index d1ad20ce1c8c..ef74f33c7905 100644 +index 892c86657fbc..33c2c9252364 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c -@@ -678,6 +678,7 @@ void tcp_write_timer_handler(struct sock *sk) +@@ -693,6 +693,7 @@ void tcp_write_timer_handler(struct sock *sk) return; } @@ -5767,22 +4284,22 @@ index d1ad20ce1c8c..ef74f33c7905 100644 event = icsk->icsk_pending; -- -2.44.0 +2.46.0.rc1 -From 71b4361aff469d7e31d2260c0f689a976a1a89d0 Mon Sep 17 00:00:00 2001 +From 3bf203491864f9a7c6c234128a2d82fb8f448683 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 3 Apr 2024 17:06:41 +0200 -Subject: [PATCH 4/8] block +Date: Mon, 15 Jul 2024 13:23:33 +0200 +Subject: [PATCH 03/11] block Signed-off-by: Peter Jung --- block/bfq-iosched.c | 120 ++++++++++++++++++++++++++++++++++++-------- block/bfq-iosched.h | 16 +++++- - block/mq-deadline.c | 114 +++++++++++++++++++++++++++++++++-------- - 3 files changed, 205 insertions(+), 45 deletions(-) + block/mq-deadline.c | 110 +++++++++++++++++++++++++++++++++------- + 3 files changed, 203 insertions(+), 43 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 3cce6de464a7..9bd57baa4b0b 100644 +index 4b88a54a9b76..88df08a246fa 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) @@ -6040,10 +4557,10 @@ index 467e8cfc41a2..f44f5d4ec2f4 100644 * bic associated with the task issuing current bio for * merging. 
This and the next field are used as a support to diff --git a/block/mq-deadline.c b/block/mq-deadline.c -index 02a916ba62ee..8bf621316a9e 100644 +index 94eede4fb9eb..567fd69a146c 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c -@@ -79,10 +79,24 @@ struct dd_per_prio { +@@ -79,10 +79,23 @@ struct dd_per_prio { struct io_stats_per_prio stats; }; @@ -6058,7 +4575,6 @@ index 02a916ba62ee..8bf621316a9e 100644 + struct { + spinlock_t lock; + spinlock_t insert_lock; -+ spinlock_t zone_lock; + } ____cacheline_aligned_in_smp; + + unsigned long run_state; @@ -6068,17 +4584,16 @@ index 02a916ba62ee..8bf621316a9e 100644 struct dd_per_prio per_prio[DD_PRIO_COUNT]; -@@ -100,9 +114,6 @@ struct deadline_data { +@@ -100,8 +113,6 @@ struct deadline_data { int front_merges; u32 async_depth; int prio_aging_expire; - - spinlock_t lock; -- spinlock_t zone_lock; }; /* Maps an I/O priority class to a deadline scheduler priority. */ -@@ -113,6 +124,9 @@ static const enum dd_prio ioprio_class_to_prio[] = { +@@ -112,6 +123,9 @@ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; @@ -6088,7 +4603,7 @@ index 02a916ba62ee..8bf621316a9e 100644 static inline struct rb_root * deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { -@@ -585,6 +599,33 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, +@@ -451,6 +465,33 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, return NULL; } @@ -6122,7 +4637,7 @@ index 02a916ba62ee..8bf621316a9e 100644 /* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * -@@ -595,12 +636,27 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, +@@ -461,12 +502,27 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { @@ -6151,7 +4666,7 @@ index 02a916ba62ee..8bf621316a9e 100644 rq = dd_dispatch_prio_aged_requests(dd, now); if (rq) goto unlock; -@@ -616,8 +672,10 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +@@ -482,8 +538,10 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) } unlock: @@ -6162,13 +4677,12 @@ index 02a916ba62ee..8bf621316a9e 100644 return rq; } -@@ -705,6 +763,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) +@@ -571,6 +629,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = dd; + spin_lock_init(&dd->lock); + spin_lock_init(&dd->insert_lock); -+ spin_lock_init(&dd->zone_lock); + + INIT_LIST_HEAD(&dd->at_head); + INIT_LIST_HEAD(&dd->at_tail); @@ -6176,16 +4690,15 @@ index 02a916ba62ee..8bf621316a9e 100644 for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; -@@ -721,8 +786,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) +@@ -587,7 +651,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; dd->prio_aging_expire = prio_aging_expire; - spin_lock_init(&dd->lock); -- spin_lock_init(&dd->zone_lock); /* We dispatch from request queue wide instead of hw queue */ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); -@@ -778,7 +841,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, +@@ -643,7 +706,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, struct request *free = NULL; bool ret; @@ 
-6206,7 +4719,7 @@ index 02a916ba62ee..8bf621316a9e 100644 ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); spin_unlock(&dd->lock); -@@ -791,10 +866,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, +@@ -656,10 +731,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, /* * add rq to rbtree and fifo */ @@ -6218,12 +4731,12 @@ index 02a916ba62ee..8bf621316a9e 100644 struct deadline_data *dd = q->elevator->elevator_data; const enum dd_data_dir data_dir = rq_data_dir(rq); u16 ioprio = req_get_ioprio(rq); -@@ -866,19 +940,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, +@@ -713,19 +787,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - LIST_HEAD(free); - +- - spin_lock(&dd->lock); - while (!list_empty(list)) { - struct request *rq; @@ -6233,7 +4746,7 @@ index 02a916ba62ee..8bf621316a9e 100644 - dd_insert_request(hctx, rq, flags, &free); - } - spin_unlock(&dd->lock); -- + - blk_mq_free_requests(&free); + spin_lock(&dd->insert_lock); + if (flags & BLK_MQ_INSERT_AT_HEAD) @@ -6244,7 +4757,7 @@ index 02a916ba62ee..8bf621316a9e 100644 } /* Callback from inside blk_mq_rq_ctx_init(). */ -@@ -957,6 +1025,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) +@@ -766,6 +834,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) struct deadline_data *dd = hctx->queue->elevator->elevator_data; enum dd_prio prio; @@ -6256,79 +4769,95 @@ index 02a916ba62ee..8bf621316a9e 100644 if (dd_has_work_for_prio(&dd->per_prio[prio])) return true; -- -2.44.0 +2.46.0.rc1 -From b667355ece89a997a7b8508e6d6f1b5be46d3833 Mon Sep 17 00:00:00 2001 +From 3eb49a6c890c1da829c0ac8fe76caec909cb2103 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 3 Apr 2024 17:06:52 +0200 -Subject: [PATCH 5/8] cachy +Date: Fri, 19 Jul 2024 08:04:09 +0200 +Subject: [PATCH 04/11] cachy Signed-off-by: Peter Jung --- - .../admin-guide/kernel-parameters.txt | 9 + - Documentation/admin-guide/sysctl/vm.rst | 72 ++ - Makefile | 162 ++++- - arch/arm/Makefile | 56 +- - arch/x86/Kconfig.cpu | 426 +++++++++++- - arch/x86/Makefile | 19 +- - arch/x86/Makefile_32.cpu | 41 -- - arch/x86/include/asm/pci.h | 6 + - arch/x86/include/asm/vermagic.h | 74 ++ - arch/x86/pci/common.c | 7 +- - block/bfq-iosched.c | 6 + - block/elevator.c | 10 + - drivers/ata/ahci.c | 23 +- - drivers/cpufreq/Kconfig.x86 | 2 - - drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + - drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 + - drivers/gpu/drm/amd/display/Kconfig | 6 + - .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- - .../amd/display/amdgpu_dm/amdgpu_dm_color.c | 2 +- - .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 6 +- - .../amd/display/amdgpu_dm/amdgpu_dm_plane.c | 6 +- - drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 + - drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 +- - drivers/i2c/busses/Kconfig | 9 + - drivers/i2c/busses/Makefile | 1 + - drivers/i2c/busses/i2c-nct6775.c | 648 ++++++++++++++++++ - drivers/i2c/busses/i2c-piix4.c | 4 +- - drivers/input/evdev.c | 19 +- - drivers/md/dm-crypt.c | 5 + - drivers/pci/controller/Makefile | 6 + - drivers/pci/controller/intel-nvme-remap.c | 462 +++++++++++++ - drivers/pci/quirks.c | 101 +++ - drivers/platform/x86/Kconfig | 14 + - drivers/platform/x86/Makefile | 3 + - drivers/platform/x86/steamdeck.c | 523 ++++++++++++++ - include/linux/mm.h | 10 +- - include/linux/pagemap.h | 2 +- - include/linux/user_namespace.h | 4 + - init/Kconfig | 26 + - kernel/Kconfig.hz | 24 + - 
kernel/fork.c | 14 + - kernel/sched/fair.c | 13 + - kernel/sched/sched.h | 2 +- - kernel/sysctl.c | 46 ++ - kernel/user_namespace.c | 7 + - mm/Kconfig | 65 +- - mm/compaction.c | 4 + - mm/huge_memory.c | 4 + - mm/mm_init.c | 1 + - mm/page-writeback.c | 8 + - mm/page_alloc.c | 27 +- - mm/swap.c | 5 + - mm/vmpressure.c | 4 + - mm/vmscan.c | 178 ++++- - 54 files changed, 3020 insertions(+), 182 deletions(-) + .../admin-guide/kernel-parameters.txt | 12 + + Makefile | 7 +- + arch/x86/Kconfig.cpu | 432 ++- + arch/x86/Makefile | 45 +- + arch/x86/include/asm/pci.h | 6 + + arch/x86/include/asm/vermagic.h | 76 + + arch/x86/pci/common.c | 7 +- + block/bfq-iosched.c | 6 + + block/elevator.c | 10 + + drivers/Makefile | 13 +- + drivers/ata/ahci.c | 23 +- + drivers/cpufreq/Kconfig.x86 | 2 - + drivers/cpufreq/intel_pstate.c | 2 + + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 + + drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c | 53 +- + drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h | 1 + + drivers/gpu/drm/amd/display/Kconfig | 6 + + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- + .../amd/display/amdgpu_dm/amdgpu_dm_color.c | 2 +- + .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 6 +- + .../amd/display/amdgpu_dm/amdgpu_dm_plane.c | 6 +- + .../amd/display/dc/optc/dcn10/dcn10_optc.c | 15 +- + .../amd/display/dc/optc/dcn20/dcn20_optc.c | 10 + + drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 + + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 +- + drivers/gpu/drm/drm_atomic_uapi.c | 11 +- + drivers/i2c/busses/Kconfig | 9 + + drivers/i2c/busses/Makefile | 1 + + drivers/i2c/busses/i2c-nct6775.c | 648 ++++ + drivers/i2c/busses/i2c-piix4.c | 4 +- + drivers/input/evdev.c | 19 +- + drivers/md/dm-crypt.c | 5 + + drivers/media/v4l2-core/Kconfig | 5 + + drivers/media/v4l2-core/Makefile | 2 + + drivers/media/v4l2-core/v4l2loopback.c | 3184 +++++++++++++++++ + drivers/media/v4l2-core/v4l2loopback.h | 98 + + .../media/v4l2-core/v4l2loopback_formats.h | 445 +++ + drivers/pci/controller/Makefile | 6 + + drivers/pci/controller/intel-nvme-remap.c | 462 +++ + drivers/pci/quirks.c | 101 + + include/linux/pagemap.h | 2 +- + include/linux/user_namespace.h | 4 + + init/Kconfig | 26 + + kernel/Kconfig.hz | 24 + + kernel/fork.c | 14 + + kernel/sched/fair.c | 13 + + kernel/sched/sched.h | 2 +- + kernel/sysctl.c | 12 + + kernel/user_namespace.c | 7 + + mm/Kconfig | 2 +- + mm/compaction.c | 4 + + mm/huge_memory.c | 4 + + mm/page-writeback.c | 8 + + mm/page_alloc.c | 4 + + mm/swap.c | 5 + + mm/vmpressure.c | 4 + + mm/vmscan.c | 8 + + 58 files changed, 5800 insertions(+), 113 deletions(-) create mode 100644 drivers/i2c/busses/i2c-nct6775.c + create mode 100644 drivers/media/v4l2-core/v4l2loopback.c + create mode 100644 drivers/media/v4l2-core/v4l2loopback.h + create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h create mode 100644 drivers/pci/controller/intel-nvme-remap.c - create mode 100644 drivers/platform/x86/steamdeck.c diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 71ed7f1b0f9b..fbfaea49cbed 100644 +index 27ec49af1bf2..07ac4c81a7dd 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -4394,6 +4394,15 @@ +@@ -2229,6 +2229,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + active + Use intel_pstate 
driver to bypass the scaling + governors layer of cpufreq and provides it own +@@ -4447,6 +4450,15 @@ nomsi [MSI] If the PCI_MSI kernel config parameter is enabled, this kernel boot option can be used to disable the use of MSI interrupts system-wide. @@ -6344,263 +4873,11 @@ index 71ed7f1b0f9b..fbfaea49cbed 100644 noioapicquirk [APIC] Disable all boot interrupt quirks. Safety option to keep boot IRQs enabled. This should never be necessary. -diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst -index c59889de122b..468ae7dec1e1 100644 ---- a/Documentation/admin-guide/sysctl/vm.rst -+++ b/Documentation/admin-guide/sysctl/vm.rst -@@ -25,6 +25,9 @@ files can be found in mm/swap.c. - Currently, these files are in /proc/sys/vm: - - - admin_reserve_kbytes -+- anon_min_ratio -+- clean_low_ratio -+- clean_min_ratio - - compact_memory - - compaction_proactiveness - - compact_unevictable_allowed -@@ -106,6 +109,67 @@ On x86_64 this is about 128MB. - Changing this takes effect whenever an application requests memory. - - -+anon_min_ratio -+============== -+ -+This knob provides *hard* protection of anonymous pages. The anonymous pages -+on the current node won't be reclaimed under any conditions when their amount -+is below vm.anon_min_ratio. -+ -+This knob may be used to prevent excessive swap thrashing when anonymous -+memory is low (for example, when memory is going to be overfilled by -+compressed data of zram module). -+ -+Setting this value too high (close to 100) can result in inability to -+swap and can lead to early OOM under memory pressure. -+ -+The unit of measurement is the percentage of the total memory of the node. -+ -+The default value is 15. -+ -+ -+clean_low_ratio -+================ -+ -+This knob provides *best-effort* protection of clean file pages. The file pages -+on the current node won't be reclaimed under memory pressure when the amount of -+clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM. -+ -+Protection of clean file pages using this knob may be used when swapping is -+still possible to -+ - prevent disk I/O thrashing under memory pressure; -+ - improve performance in disk cache-bound tasks under memory pressure. -+ -+Setting it to a high value may result in a early eviction of anonymous pages -+into the swap space by attempting to hold the protected amount of clean file -+pages in memory. -+ -+The unit of measurement is the percentage of the total memory of the node. -+ -+The default value is 0. -+ -+ -+clean_min_ratio -+================ -+ -+This knob provides *hard* protection of clean file pages. The file pages on the -+current node won't be reclaimed under memory pressure when the amount of clean -+file pages is below vm.clean_min_ratio. -+ -+Hard protection of clean file pages using this knob may be used to -+ - prevent disk I/O thrashing under memory pressure even with no free swap space; -+ - improve performance in disk cache-bound tasks under memory pressure; -+ - avoid high latency and prevent livelock in near-OOM conditions. -+ -+Setting it to a high value may result in a early out-of-memory condition due to -+the inability to reclaim the protected amount of clean file pages when other -+types of pages cannot be reclaimed. -+ -+The unit of measurement is the percentage of the total memory of the node. -+ -+The default value is 15. -+ -+ - compact_memory - ============== - -@@ -910,6 +974,14 @@ be 133 (x + 2x = 200, 2x = 133.33). 
- At 0, the kernel will not initiate swap until the amount of free and - file-backed pages is less than the high watermark in a zone. - -+This knob has no effect if the amount of clean file pages on the current -+node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case, -+only anonymous pages can be reclaimed. -+ -+If the number of anonymous pages on the current node is below -+vm.anon_min_ratio, then only file pages can be reclaimed with -+any vm.swappiness value. -+ - - unprivileged_userfaultfd - ======================== diff --git a/Makefile b/Makefile -index a78379891d22..e58a4e647e7d 100644 +index 3d10e3aadeda..b9435cef21b0 100644 --- a/Makefile +++ b/Makefile -@@ -808,9 +808,164 @@ endif # need-config - - KBUILD_CFLAGS += -fno-delete-null-pointer-checks - -+# This selects which ARM instruction set is used. -+arch-$(CONFIG_CPU_32v7M) :=-march=armv7-m -+arch-$(CONFIG_CPU_32v7) :=-march=armv7-a -+arch-$(CONFIG_CPU_32v6) :=-march=armv6 -+# Only override the compiler option if ARMv6. The ARMv6K extensions are -+# always available in ARMv7 -+ifeq ($(CONFIG_CPU_32v6),y) -+arch-$(CONFIG_CPU_32v6K) :=-march=armv6k -+endif -+arch-$(CONFIG_CPU_32v5) :=-march=armv5te -+arch-$(CONFIG_CPU_32v4T) :=-march=armv4t -+arch-$(CONFIG_CPU_32v4) :=-march=armv4 -+arch-$(CONFIG_CPU_32v3) :=-march=armv3m -+ -+# Note that GCC does not numerically define an architecture version -+# macro, but instead defines a whole series of macros which makes -+# testing for a specific architecture or later rather impossible. -+cpp-$(CONFIG_CPU_32v7M) :=-D__LINUX_ARM_ARCH__=7 -+cpp-$(CONFIG_CPU_32v7) :=-D__LINUX_ARM_ARCH__=7 -+cpp-$(CONFIG_CPU_32v6) :=-D__LINUX_ARM_ARCH__=6 -+# Only override the compiler option if ARMv6. The ARMv6K extensions are -+# always available in ARMv7 -+ifeq ($(CONFIG_CPU_32v6),y) -+cpp-$(CONFIG_CPU_32v6K) :=-D__LINUX_ARM_ARCH__=6 -+endif -+cpp-$(CONFIG_CPU_32v5) :=-D__LINUX_ARM_ARCH__=5 -+cpp-$(CONFIG_CPU_32v4T) :=-D__LINUX_ARM_ARCH__=4 -+cpp-$(CONFIG_CPU_32v4) :=-D__LINUX_ARM_ARCH__=4 -+cpp-$(CONFIG_CPU_32v3) :=-D__LINUX_ARM_ARCH__=3 -+ -+# This selects how we optimise for the processor. -+tune-$(CONFIG_CPU_ARM7TDMI) :=-mtune=arm7tdmi -+tune-$(CONFIG_CPU_ARM720T) :=-mtune=arm7tdmi -+tune-$(CONFIG_CPU_ARM740T) :=-mtune=arm7tdmi -+tune-$(CONFIG_CPU_ARM9TDMI) :=-mtune=arm9tdmi -+tune-$(CONFIG_CPU_ARM940T) :=-mtune=arm9tdmi -+tune-$(CONFIG_CPU_ARM946E) :=-mtune=arm9e -+tune-$(CONFIG_CPU_ARM920T) :=-mtune=arm9tdmi -+tune-$(CONFIG_CPU_ARM922T) :=-mtune=arm9tdmi -+tune-$(CONFIG_CPU_ARM925T) :=-mtune=arm9tdmi -+tune-$(CONFIG_CPU_ARM926T) :=-mtune=arm9tdmi -+tune-$(CONFIG_CPU_FA526) :=-mtune=arm9tdmi -+tune-$(CONFIG_CPU_SA110) :=-mtune=strongarm110 -+tune-$(CONFIG_CPU_SA1100) :=-mtune=strongarm1100 -+tune-$(CONFIG_CPU_XSCALE) :=-mtune=xscale -+tune-$(CONFIG_CPU_XSC3) :=-mtune=xscale -+tune-$(CONFIG_CPU_FEROCEON) :=-mtune=xscale -+tune-$(CONFIG_CPU_V6) :=-mtune=arm1136j-s -+tune-$(CONFIG_CPU_V6K) :=-mtune=arm1136j-s -+ -+KBUILD_CPPFLAGS +=$(cpp-y) -+KBUILD_CFLAGS +=$(arch-y) $(tune-y) -+KBUILD_AFLAGS +=$(arch-y) $(tune-y) -+ -+# This selects which x86 instruction set is used. 
-+cflags-$(CONFIG_M486SX) += -march=i486 -+cflags-$(CONFIG_M486) += -march=i486 -+cflags-$(CONFIG_M586) += -march=i586 -+cflags-$(CONFIG_M586TSC) += -march=i586 -+cflags-$(CONFIG_M586MMX) += -march=pentium-mmx -+cflags-$(CONFIG_M686) += -march=i686 -+cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call tune,pentium2) -+cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call tune,pentium3) -+cflags-$(CONFIG_MPENTIUMM) += -march=i686 $(call tune,pentium3) -+cflags-$(CONFIG_MPENTIUM4) += -march=i686 $(call tune,pentium4) -+cflags-$(CONFIG_MK6) += -march=k6 -+# Please note, that patches that add -march=athlon-xp and friends are pointless. -+# They make zero difference whatsosever to performance at this time. -+cflags-$(CONFIG_MK7) += -march=athlon -+cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) -+cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align) -+cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align) -+cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) -+cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) -+cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align) -+cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) -+cflags-$(CONFIG_MVIAC7) += -march=i686 -+cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) -+cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ -+$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) -+ -+# AMD Elan support -+cflags-$(CONFIG_MELAN) += -march=i486 -+ -+# Geode GX1 support -+cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx -+cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx) -+# add at the end to overwrite eventual tuning options from earlier -+# cpu entries -+cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) -+ -+# Bug fix for binutils: this option is required in order to keep -+# binutils from generating NOPL instructions against our will. 
-+ifneq ($(CONFIG_X86_P6_NOP),y) -+cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,) -+endif -+ -+# x86_64 instruction set -+cflags64-$(CONFIG_MK8) += -march=k8 -+cflags64-$(CONFIG_MPSC) += -march=nocona -+cflags64-$(CONFIG_MK8SSE3) += -march=k8-sse3 -+cflags64-$(CONFIG_MK10) += -march=amdfam10 -+cflags64-$(CONFIG_MBARCELONA) += -march=barcelona -+cflags64-$(CONFIG_MBOBCAT) += -march=btver1 -+cflags64-$(CONFIG_MJAGUAR) += -march=btver2 -+cflags64-$(CONFIG_MBULLDOZER) += -march=bdver1 -+cflags64-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm -+cflags64-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm -+cflags64-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm -+cflags64-$(CONFIG_MZEN) += -march=znver1 -+cflags64-$(CONFIG_MZEN2) += -march=znver2 -+cflags64-$(CONFIG_MZEN3) += -march=znver3 -+cflags64-$(CONFIG_MZEN4) += -march=znver4 -+cflags64-$(CONFIG_MNATIVE_INTEL) += -march=native -+cflags64-$(CONFIG_MNATIVE_AMD) += -march=native -+cflags64-$(CONFIG_MATOM) += -march=bonnell -+cflags64-$(CONFIG_MCORE2) += -march=core2 -+cflags64-$(CONFIG_MNEHALEM) += -march=nehalem -+cflags64-$(CONFIG_MWESTMERE) += -march=westmere -+cflags64-$(CONFIG_MSILVERMONT) += -march=silvermont -+cflags64-$(CONFIG_MGOLDMONT) += -march=goldmont -+cflags64-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus -+cflags64-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge -+cflags64-$(CONFIG_MIVYBRIDGE) += -march=ivybridge -+cflags64-$(CONFIG_MHASWELL) += -march=haswell -+cflags64-$(CONFIG_MBROADWELL) += -march=broadwell -+cflags64-$(CONFIG_MSKYLAKE) += -march=skylake -+cflags64-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 -+cflags64-$(CONFIG_MCANNONLAKE) += -march=cannonlake -+cflags64-$(CONFIG_MICELAKE) += -march=icelake-client -+cflags64-$(CONFIG_MCASCADELAKE) += -march=cascadelake -+cflags64-$(CONFIG_MCOOPERLAKE) += -march=cooperlake -+cflags64-$(CONFIG_MTIGERLAKE) += -march=tigerlake -+cflags64-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids -+cflags64-$(CONFIG_MROCKETLAKE) += -march=rocketlake -+cflags64-$(CONFIG_MALDERLAKE) += -march=alderlake -+cflags64-$(CONFIG_MRAPTORLAKE) += -march=raptorlake -+cflags64-$(CONFIG_MMETEORLAKE) += -march=meteorlake -+cflags64-$(CONFIG_MEMERALDRAPIDS) += -march=emeraldrapids -+cflags64-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 -+cflags64-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 -+cflags64-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 -+cflags64-$(CONFIG_GENERIC_CPU) += -mtune=generic -+KBUILD_CFLAGS += $(cflags64-y) -+ -+rustflags64-$(CONFIG_MK8) += -Ctarget-cpu=k8 -+rustflags64-$(CONFIG_MPSC) += -Ctarget-cpu=nocona -+rustflags64-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 -+rustflags64-$(CONFIG_MATOM) += -Ctarget-cpu=atom -+rustflags64-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic -+KBUILD_RUSTFLAGS += $(rustflags64-y) -+ +@@ -817,6 +817,9 @@ KBUILD_CFLAGS += -fno-delete-null-pointer-checks ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE KBUILD_CFLAGS += -O2 KBUILD_RUSTFLAGS += -Copt-level=2 @@ -6610,16 +4887,7 @@ index a78379891d22..e58a4e647e7d 100644 else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE KBUILD_CFLAGS += -Os KBUILD_RUSTFLAGS += -Copt-level=s -@@ -990,15 +1145,18 @@ KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3) - KBUILD_CFLAGS-$(CONFIG_CC_NO_STRINGOP_OVERFLOW) += $(call cc-option, -Wno-stringop-overflow) - KBUILD_CFLAGS-$(CONFIG_CC_STRINGOP_OVERFLOW) += $(call cc-option, -Wstringop-overflow) - -+# disable GCC vectorization on trees -+KBUILD_CFLAGS += $(call cc-option, -fno-tree-vectorize) -+ - # disable invalid "can't wrap" optimizations for signed / pointers - KBUILD_CFLAGS += 
-fno-strict-overflow - +@@ -1005,9 +1008,9 @@ KBUILD_CFLAGS += -fno-strict-overflow # Make sure -fstack-check isn't enabled (like gentoo apparently did) KBUILD_CFLAGS += -fno-stack-check @@ -6627,86 +4895,12 @@ index a78379891d22..e58a4e647e7d 100644 +# conserve stack, ivopts and modulo-sched if available ifdef CONFIG_CC_IS_GCC -KBUILD_CFLAGS += -fconserve-stack -+KBUILD_CFLAGS += -fconserve-stack -fivopts -fmodulo-sched ++KBUILD_CFLAGS += -fconserve-stack -fivopts -fmodulo-sched -fno-tree-vectorize endif # change __FILE__ to the relative path from the srctree -diff --git a/arch/arm/Makefile b/arch/arm/Makefile -index 473280d5adce..c7596c898625 100644 ---- a/arch/arm/Makefile -+++ b/arch/arm/Makefile -@@ -59,56 +59,6 @@ endif - # - KBUILD_CFLAGS += $(call cc-option,-fno-ipa-sra) - --# This selects which instruction set is used. --arch-$(CONFIG_CPU_32v7M) :=-march=armv7-m --arch-$(CONFIG_CPU_32v7) :=-march=armv7-a --arch-$(CONFIG_CPU_32v6) :=-march=armv6 --# Only override the compiler option if ARMv6. The ARMv6K extensions are --# always available in ARMv7 --ifeq ($(CONFIG_CPU_32v6),y) --arch-$(CONFIG_CPU_32v6K) :=-march=armv6k --endif --arch-$(CONFIG_CPU_32v5) :=-march=armv5te --arch-$(CONFIG_CPU_32v4T) :=-march=armv4t --arch-$(CONFIG_CPU_32v4) :=-march=armv4 --arch-$(CONFIG_CPU_32v3) :=-march=armv3m -- --# Note that GCC does not numerically define an architecture version --# macro, but instead defines a whole series of macros which makes --# testing for a specific architecture or later rather impossible. --cpp-$(CONFIG_CPU_32v7M) :=-D__LINUX_ARM_ARCH__=7 --cpp-$(CONFIG_CPU_32v7) :=-D__LINUX_ARM_ARCH__=7 --cpp-$(CONFIG_CPU_32v6) :=-D__LINUX_ARM_ARCH__=6 --# Only override the compiler option if ARMv6. The ARMv6K extensions are --# always available in ARMv7 --ifeq ($(CONFIG_CPU_32v6),y) --cpp-$(CONFIG_CPU_32v6K) :=-D__LINUX_ARM_ARCH__=6 --endif --cpp-$(CONFIG_CPU_32v5) :=-D__LINUX_ARM_ARCH__=5 --cpp-$(CONFIG_CPU_32v4T) :=-D__LINUX_ARM_ARCH__=4 --cpp-$(CONFIG_CPU_32v4) :=-D__LINUX_ARM_ARCH__=4 --cpp-$(CONFIG_CPU_32v3) :=-D__LINUX_ARM_ARCH__=3 -- --# This selects how we optimise for the processor. 
--tune-$(CONFIG_CPU_ARM7TDMI) :=-mtune=arm7tdmi --tune-$(CONFIG_CPU_ARM720T) :=-mtune=arm7tdmi --tune-$(CONFIG_CPU_ARM740T) :=-mtune=arm7tdmi --tune-$(CONFIG_CPU_ARM9TDMI) :=-mtune=arm9tdmi --tune-$(CONFIG_CPU_ARM940T) :=-mtune=arm9tdmi --tune-$(CONFIG_CPU_ARM946E) :=-mtune=arm9e --tune-$(CONFIG_CPU_ARM920T) :=-mtune=arm9tdmi --tune-$(CONFIG_CPU_ARM922T) :=-mtune=arm9tdmi --tune-$(CONFIG_CPU_ARM925T) :=-mtune=arm9tdmi --tune-$(CONFIG_CPU_ARM926T) :=-mtune=arm9tdmi --tune-$(CONFIG_CPU_FA526) :=-mtune=arm9tdmi --tune-$(CONFIG_CPU_SA110) :=-mtune=strongarm110 --tune-$(CONFIG_CPU_SA1100) :=-mtune=strongarm1100 --tune-$(CONFIG_CPU_XSCALE) :=-mtune=xscale --tune-$(CONFIG_CPU_XSC3) :=-mtune=xscale --tune-$(CONFIG_CPU_FEROCEON) :=-mtune=xscale --tune-$(CONFIG_CPU_V6) :=-mtune=arm1136j-s --tune-$(CONFIG_CPU_V6K) :=-mtune=arm1136j-s -- - ifeq ($(CONFIG_AEABI),y) - CFLAGS_ABI :=-mabi=aapcs-linux -mfpu=vfp - else -@@ -140,9 +90,9 @@ AFLAGS_ISA :=$(CFLAGS_ISA) - endif - - # Need -Uarm for gcc < 3.x --KBUILD_CPPFLAGS +=$(cpp-y) --KBUILD_CFLAGS +=$(CFLAGS_ABI) $(CFLAGS_ISA) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm --KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_ISA) -Wa,$(arch-y) $(tune-y) -include asm/unified.h -msoft-float -+ -+KBUILD_CFLAGS +=$(CFLAGS_ABI) $(CFLAGS_ISA) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm -+KBUILD_AFLAGS +=$(CFLAGS_ABI) $(AFLAGS_ISA) -Wa,-include asm/unified.h -msoft-float - - CHECKFLAGS += -D__arm__ - diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu -index 2a7279d80460..b6a64a959e09 100644 +index 2a7279d80460..3b077b9f9291 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -157,7 +157,7 @@ config MPENTIUM4 @@ -6727,7 +4921,7 @@ index 2a7279d80460..b6a64a959e09 100644 depends on X86_32 help Select this for an AMD Athlon K7-family processor. Enables use of -@@ -173,12 +173,106 @@ config MK7 +@@ -173,12 +173,114 @@ config MK7 flags to GCC. config MK8 @@ -6831,11 +5025,19 @@ index 2a7279d80460..b6a64a959e09 100644 + Select this for AMD Family 19h Zen 4 processors. + + Enables -march=znver4 ++ ++config MZEN5 ++ bool "AMD Zen 5" ++ depends on (CC_IS_GCC && GCC_VERSION >= 140000) || (CC_IS_CLANG && CLANG_VERSION >= 180000) ++ help ++ Select this for AMD Family 1Ah Zen 5 processors. ++ ++ Enables -march=znver5 + config MCRUSOE bool "Crusoe" depends on X86_32 -@@ -270,7 +364,7 @@ config MPSC +@@ -270,7 +372,7 @@ config MPSC in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. config MCORE2 @@ -6844,7 +5046,7 @@ index 2a7279d80460..b6a64a959e09 100644 help Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and -@@ -278,6 +372,8 @@ config MCORE2 +@@ -278,6 +380,8 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) @@ -6853,7 +5055,7 @@ index 2a7279d80460..b6a64a959e09 100644 config MATOM bool "Intel Atom" help -@@ -287,6 +383,212 @@ config MATOM +@@ -287,6 +391,212 @@ config MATOM accordingly optimized code. Use a recent GCC with specific Atom support in order to fully benefit from selecting this option. @@ -7066,7 +5268,7 @@ index 2a7279d80460..b6a64a959e09 100644 config GENERIC_CPU bool "Generic-x86-64" depends on X86_64 -@@ -294,6 +596,50 @@ config GENERIC_CPU +@@ -294,6 +604,50 @@ config GENERIC_CPU Generic x86-64 CPU. Run equally well on all x86-64 CPUs. 
@@ -7117,14 +5319,14 @@ index 2a7279d80460..b6a64a959e09 100644 endchoice config X86_GENERIC -@@ -318,9 +664,17 @@ config X86_INTERNODE_CACHE_SHIFT +@@ -318,9 +672,17 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU + default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ -+ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ ++ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT \ + || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ + || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ @@ -7137,7 +5339,7 @@ index 2a7279d80460..b6a64a959e09 100644 config X86_F00F_BUG def_bool y -@@ -332,15 +686,27 @@ config X86_INVD_BUG +@@ -332,15 +694,27 @@ config X86_INVD_BUG config X86_ALIGNMENT_16 def_bool y @@ -7160,7 +5362,7 @@ index 2a7279d80460..b6a64a959e09 100644 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ + || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \ + || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ -+ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ ++ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM \ + || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \ + || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \ + || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ @@ -7168,7 +5370,7 @@ index 2a7279d80460..b6a64a959e09 100644 # # P6_NOPs are a relatively minor optimization that require a family >= -@@ -356,11 +722,22 @@ config X86_USE_PPRO_CHECKSUM +@@ -356,11 +730,22 @@ config X86_USE_PPRO_CHECKSUM config X86_P6_NOP def_bool y depends on X86_64 @@ -7185,7 +5387,7 @@ index 2a7279d80460..b6a64a959e09 100644 + depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ + || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \ + || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ -+ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ ++ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM \ + || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \ + || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \ @@ -7193,14 +5395,15 @@ index 2a7279d80460..b6a64a959e09 100644 config X86_HAVE_PAE def_bool y -@@ -368,24 +745,43 @@ config X86_HAVE_PAE +@@ -368,18 +753,37 @@ config X86_HAVE_PAE config X86_CMPXCHG64 def_bool y - depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 -+ depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 || 
MK8 || MK8SSE3 || MK10 \ ++ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ ++ || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \ -+ || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \ ++ || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \ + || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \ + || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ + || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD @@ -7213,7 +5416,7 @@ index 2a7279d80460..b6a64a959e09 100644 + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \ + || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \ -+ || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ ++ || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ + || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ + || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ + || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD) @@ -7225,7 +5428,7 @@ index 2a7279d80460..b6a64a959e09 100644 + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ + || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8 || MK8SSE3 \ + || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ -+ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ ++ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT \ + || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ + || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ + || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MRAPTORLAKE \ @@ -7233,105 +5436,67 @@ index 2a7279d80460..b6a64a959e09 100644 default "5" if X86_32 && X86_CMPXCHG64 default "4" - config X86_DEBUGCTLMSR - def_bool y -- depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486SX || M486) && !UML -+ depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 \ -+ || M486SX || M486) && !UML - - config IA32_FEAT_CTL - def_bool y diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index da8f3caf2781..adf396b23669 100644 +index 801fd85c3ef6..93cc88b59cbb 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile -@@ -67,8 +67,8 @@ export BITS - # - # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 - # --KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx --KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 -+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -mno-avx512f -+KBUILD_RUSTFLAGS += 
-Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2,-avx512f - - ifeq ($(CONFIG_X86_KERNEL_IBT),y) - # -@@ -149,21 +149,6 @@ else - # Use -mskip-rax-setup if supported. - KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) - -- # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) -- cflags-$(CONFIG_MK8) += -march=k8 -- cflags-$(CONFIG_MPSC) += -march=nocona +@@ -176,8 +176,49 @@ else + # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) + cflags-$(CONFIG_MK8) += -march=k8 + cflags-$(CONFIG_MPSC) += -march=nocona - cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom -- cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic -- KBUILD_CFLAGS += $(cflags-y) -- -- rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 -- rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona -- rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 -- rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom -- rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic -- KBUILD_RUSTFLAGS += $(rustflags-y) -- - KBUILD_CFLAGS += -mno-red-zone - KBUILD_CFLAGS += -mcmodel=kernel - KBUILD_RUSTFLAGS += -Cno-redzone=y -diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu -index 94834c4b5e5e..81923b4afdf8 100644 ---- a/arch/x86/Makefile_32.cpu -+++ b/arch/x86/Makefile_32.cpu -@@ -10,44 +10,3 @@ else - align := -falign-functions=0 -falign-jumps=0 -falign-loops=0 - endif ++ cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 ++ cflags-$(CONFIG_MK10) += -march=amdfam10 ++ cflags-$(CONFIG_MBARCELONA) += -march=barcelona ++ cflags-$(CONFIG_MBOBCAT) += -march=btver1 ++ cflags-$(CONFIG_MJAGUAR) += -march=btver2 ++ cflags-$(CONFIG_MBULLDOZER) += -march=bdver1 ++ cflags-$(CONFIG_MPILEDRIVER) += -march=bdver2 -mno-tbm ++ cflags-$(CONFIG_MSTEAMROLLER) += -march=bdver3 -mno-tbm ++ cflags-$(CONFIG_MEXCAVATOR) += -march=bdver4 -mno-tbm ++ cflags-$(CONFIG_MZEN) += -march=znver1 ++ cflags-$(CONFIG_MZEN2) += -march=znver2 ++ cflags-$(CONFIG_MZEN3) += -march=znver3 ++ cflags-$(CONFIG_MZEN4) += -march=znver4 ++ cflags-$(CONFIG_MZEN5) += -march=znver5 ++ cflags-$(CONFIG_MNATIVE_INTEL) += -march=native ++ cflags-$(CONFIG_MNATIVE_AMD) += -march=native ++ cflags-$(CONFIG_MATOM) += -march=bonnell ++ cflags-$(CONFIG_MCORE2) += -march=core2 ++ cflags-$(CONFIG_MNEHALEM) += -march=nehalem ++ cflags-$(CONFIG_MWESTMERE) += -march=westmere ++ cflags-$(CONFIG_MSILVERMONT) += -march=silvermont ++ cflags-$(CONFIG_MGOLDMONT) += -march=goldmont ++ cflags-$(CONFIG_MGOLDMONTPLUS) += -march=goldmont-plus ++ cflags-$(CONFIG_MSANDYBRIDGE) += -march=sandybridge ++ cflags-$(CONFIG_MIVYBRIDGE) += -march=ivybridge ++ cflags-$(CONFIG_MHASWELL) += -march=haswell ++ cflags-$(CONFIG_MBROADWELL) += -march=broadwell ++ cflags-$(CONFIG_MSKYLAKE) += -march=skylake ++ cflags-$(CONFIG_MSKYLAKEX) += -march=skylake-avx512 ++ cflags-$(CONFIG_MCANNONLAKE) += -march=cannonlake ++ cflags-$(CONFIG_MICELAKE) += -march=icelake-client ++ cflags-$(CONFIG_MCASCADELAKE) += -march=cascadelake ++ cflags-$(CONFIG_MCOOPERLAKE) += -march=cooperlake ++ cflags-$(CONFIG_MTIGERLAKE) += -march=tigerlake ++ cflags-$(CONFIG_MSAPPHIRERAPIDS) += -march=sapphirerapids ++ cflags-$(CONFIG_MROCKETLAKE) += -march=rocketlake ++ cflags-$(CONFIG_MALDERLAKE) += -march=alderlake ++ cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake ++ cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake ++ cflags-$(CONFIG_MEMERALDRAPIDS) += -march=emeraldrapids ++ cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 ++ cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 ++ cflags-$(CONFIG_GENERIC_CPU4) += 
-march=x86-64-v4 + cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic + KBUILD_CFLAGS += $(cflags-y) --cflags-$(CONFIG_M486SX) += -march=i486 --cflags-$(CONFIG_M486) += -march=i486 --cflags-$(CONFIG_M586) += -march=i586 --cflags-$(CONFIG_M586TSC) += -march=i586 --cflags-$(CONFIG_M586MMX) += -march=pentium-mmx --cflags-$(CONFIG_M686) += -march=i686 --cflags-$(CONFIG_MPENTIUMII) += -march=i686 $(call tune,pentium2) --cflags-$(CONFIG_MPENTIUMIII) += -march=i686 $(call tune,pentium3) --cflags-$(CONFIG_MPENTIUMM) += -march=i686 $(call tune,pentium3) --cflags-$(CONFIG_MPENTIUM4) += -march=i686 $(call tune,pentium4) --cflags-$(CONFIG_MK6) += -march=k6 --# Please note, that patches that add -march=athlon-xp and friends are pointless. --# They make zero difference whatsosever to performance at this time. --cflags-$(CONFIG_MK7) += -march=athlon --cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) --cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align) --cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align) --cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) --cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) --cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align) --cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) --cflags-$(CONFIG_MVIAC7) += -march=i686 --cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) --cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ -- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) -- --# AMD Elan support --cflags-$(CONFIG_MELAN) += -march=i486 -- --# Geode GX1 support --cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx --cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx) --# add at the end to overwrite eventual tuning options from earlier --# cpu entries --cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) -- --# Bug fix for binutils: this option is required in order to keep --# binutils from generating NOPL instructions against our will. 
--ifneq ($(CONFIG_X86_P6_NOP),y) --cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,) --endif diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h -index b40c462b4af3..c4e66e60d559 100644 +index b3ab80a03365..5e883b397ff3 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h -@@ -27,6 +27,7 @@ struct pci_sysdata { +@@ -26,6 +26,7 @@ struct pci_sysdata { #if IS_ENABLED(CONFIG_VMD) struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ #endif @@ -7339,7 +5504,7 @@ index b40c462b4af3..c4e66e60d559 100644 }; extern int pci_routeirq; -@@ -70,6 +71,11 @@ static inline bool is_vmd(struct pci_bus *bus) +@@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus *bus) #define is_vmd(bus) false #endif /* CONFIG_VMD */ @@ -7352,7 +5517,7 @@ index b40c462b4af3..c4e66e60d559 100644 already-configured bus numbers - to be used for buggy BIOSes or architectures with incomplete PCI setup by the loader */ diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h -index 75884d2cdec3..02c1386eb653 100644 +index 75884d2cdec3..7acca9b5a9d5 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,54 @@ @@ -7410,7 +5575,7 @@ index 75884d2cdec3..02c1386eb653 100644 #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 -@@ -35,6 +83,32 @@ +@@ -35,6 +83,34 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define MODULE_PROC_FAMILY "K8 " @@ -7440,6 +5605,8 @@ index 75884d2cdec3..02c1386eb653 100644 +#define MODULE_PROC_FAMILY "ZEN3 " +#elif defined CONFIG_MZEN4 +#define MODULE_PROC_FAMILY "ZEN4 " ++#elif defined CONFIG_MZEN5 ++#define MODULE_PROC_FAMILY "ZEN5 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE @@ -7466,14 +5633,14 @@ index ddb798603201..7c20387d8202 100644 } -#endif diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 9bd57baa4b0b..efe818271cf7 100644 +index 88df08a246fa..deecce63d0fc 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -7703,6 +7703,7 @@ MODULE_ALIAS("bfq-iosched"); static int __init bfq_init(void) { int ret; -+ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.8"; ++ char msg[60] = "BFQ I/O-scheduler: BFQ-CachyOS v6.10"; #ifdef CONFIG_BFQ_GROUP_IOSCHED ret = blkcg_policy_register(&blkcg_policy_bfq); @@ -7490,10 +5657,10 @@ index 9bd57baa4b0b..efe818271cf7 100644 slab_kill: diff --git a/block/elevator.c b/block/elevator.c -index 5ff093cb3cf8..1c93fe91b006 100644 +index f64ebd726e58..4f1ccf8cf250 100644 --- a/block/elevator.c +++ b/block/elevator.c -@@ -574,9 +574,19 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) +@@ -567,9 +567,19 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) if (q->nr_hw_queues != 1 && !blk_mq_is_shared_tags(q->tag_set->flags)) @@ -7513,11 +5680,44 @@ index 5ff093cb3cf8..1c93fe91b006 100644 } /* +diff --git a/drivers/Makefile b/drivers/Makefile +index fe9ceb0d2288..b58955caf19b 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -61,14 +61,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb depends on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-y += base/ block/ misc/ mfd/ nfc/ + obj-$(CONFIG_LIBNVDIMM) += nvdimm/ +@@ -80,6 +74,13 @@ obj-y += 
macintosh/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb depends on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c -index df3fd6474bf2..4303eb5fe11b 100644 +index fc6fd583faf8..f79e205a51dd 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c -@@ -1547,7 +1547,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) +@@ -1618,7 +1618,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) } #endif @@ -7526,7 +5726,7 @@ index df3fd6474bf2..4303eb5fe11b 100644 struct ahci_host_priv *hpriv) { int i; -@@ -1560,7 +1560,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, +@@ -1631,7 +1631,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, pci_resource_len(pdev, bar) < SZ_512K || bar != AHCI_PCI_BAR_STANDARD || !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) @@ -7535,7 +5735,7 @@ index df3fd6474bf2..4303eb5fe11b 100644 cap = readq(hpriv->mmio + AHCI_REMAP_CAP); for (i = 0; i < AHCI_MAX_REMAP; i++) { -@@ -1575,18 +1575,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, +@@ -1646,18 +1646,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, } if (!hpriv->remapped_nvme) @@ -7558,7 +5758,7 @@ index df3fd6474bf2..4303eb5fe11b 100644 } static int ahci_get_irq_vector(struct ata_host *host, int port) -@@ -1806,7 +1799,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) +@@ -1894,7 +1887,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) hpriv->mmio = pcim_iomap_table(pdev)[ahci_pci_bar]; /* detect remapped nvme devices */ @@ -7570,7 +5770,7 @@ index df3fd6474bf2..4303eb5fe11b 100644 sysfs_add_file_to_group(&pdev->dev.kobj, &dev_attr_remapped_nvme.attr, diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 -index 438c9e75a04d..1bbfeca5f01e 100644 +index 97c2d4f15d76..5a3af44d785a 100644 --- a/drivers/cpufreq/Kconfig.x86 +++ b/drivers/cpufreq/Kconfig.x86 @@ -9,7 +9,6 @@ config X86_INTEL_PSTATE @@ -7589,11 +5789,24 @@ index 438c9e75a04d..1bbfeca5f01e 100644 help This driver adds a CPUFreq driver which utilizes a fine grain processor performance frequency control range instead of legacy +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index c31914a9876f..1035c074f36a 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -3550,6 +3550,8 @@ static int __init intel_pstate_setup(char *str) + + if (!strcmp(str, "disable")) + no_load = 1; ++ else if (!strcmp(str, "enable")) ++ no_load = 0; + else if (!strcmp(str, "active")) + default_driver = &intel_pstate; + else if (!strcmp(str, "passive")) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h -index 79827a6dcd7f..ee85a2352771 100644 +index f87d53e183c3..c489d3b2576b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h -@@ -153,6 +153,7 @@ struct amdgpu_watchdog_timer +@@ -159,6 +159,7 @@ struct amdgpu_watchdog_timer { */ extern int amdgpu_modeset; extern unsigned int amdgpu_vram_limit; @@ -7602,7 +5815,7 @@ index 79827a6dcd7f..ee85a2352771 100644 extern int amdgpu_gart_size; extern int amdgpu_gtt_size; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c -index 586f4d03039d..a2524615b696 100644 +index ea14f1c8f430..bb0b636d0d75 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -132,6 +132,7 @@ enum AMDGPU_DEBUG_MASK { @@ -7613,7 +5826,7 @@ index 586f4d03039d..a2524615b696 100644 int amdgpu_vis_vram_limit; int amdgpu_gart_size = -1; /* auto */ int amdgpu_gtt_size = -1; /* auto */ -@@ -241,6 +242,15 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { +@@ -243,6 +244,15 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { .period = 0x0, /* default to 0x0 (timeout disable) */ }; @@ -7629,8 +5842,194 @@ index 586f4d03039d..a2524615b696 100644 /** * DOC: vramlimit (int) * Restrict the total amount of VRAM in MiB for testing. The default is 0 (Use full VRAM). +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +index 677eb141554e..ceb3f1e4ed1d 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c +@@ -151,6 +151,10 @@ int amdgpu_vcn_sw_init(struct amdgpu_device *adev) + } + } + ++ /* from vcn4 and above, only unified queue is used */ ++ adev->vcn.using_unified_queue = ++ amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0); ++ + hdr = (const struct common_firmware_header *)adev->vcn.fw[0]->data; + adev->vcn.fw_version = le32_to_cpu(hdr->ucode_version); + +@@ -279,18 +283,6 @@ int amdgpu_vcn_sw_fini(struct amdgpu_device *adev) + return 0; + } + +-/* from vcn4 and above, only unified queue is used */ +-static bool amdgpu_vcn_using_unified_queue(struct amdgpu_ring *ring) +-{ +- struct amdgpu_device *adev = ring->adev; +- bool ret = false; +- +- if (amdgpu_ip_version(adev, UVD_HWIP, 0) >= IP_VERSION(4, 0, 0)) +- ret = true; +- +- return ret; +-} +- + bool amdgpu_vcn_is_disabled_vcn(struct amdgpu_device *adev, enum vcn_ring_type type, uint32_t vcn_instance) + { + bool ret = false; +@@ -401,7 +393,9 @@ static void amdgpu_vcn_idle_work_handler(struct work_struct *work) + for (i = 0; i < adev->vcn.num_enc_rings; ++i) + fence[j] += amdgpu_fence_count_emitted(&adev->vcn.inst[j].ring_enc[i]); + +- if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) { ++ /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */ ++ if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG && ++ !adev->vcn.using_unified_queue) { + struct dpg_pause_state new_state; + + if (fence[j] || +@@ -447,7 +441,9 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring) + amdgpu_device_ip_set_powergating_state(adev, AMD_IP_BLOCK_TYPE_VCN, + AMD_PG_STATE_UNGATE); + +- if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG) { ++ /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */ ++ if (adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG && ++ !adev->vcn.using_unified_queue) { + struct dpg_pause_state new_state; + + if (ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC) { +@@ -473,8 +469,12 @@ void amdgpu_vcn_ring_begin_use(struct amdgpu_ring *ring) + + void amdgpu_vcn_ring_end_use(struct amdgpu_ring *ring) + { ++ struct amdgpu_device *adev = ring->adev; ++ ++ /* Only set DPG pause for VCN3 or below, VCN4 and above will be handled by FW */ + if (ring->adev->pg_flags & AMD_PG_SUPPORT_VCN_DPG && +- ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC) ++ ring->funcs->type == AMDGPU_RING_TYPE_VCN_ENC && ++ !adev->vcn.using_unified_queue) + atomic_dec(&ring->adev->vcn.inst[ring->me].dpg_enc_submission_cnt); + + atomic_dec(&ring->adev->vcn.total_submission_cnt); +@@ -728,12 +728,11 @@ static int 
amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring, + struct amdgpu_job *job; + struct amdgpu_ib *ib; + uint64_t addr = AMDGPU_GPU_PAGE_ALIGN(ib_msg->gpu_addr); +- bool sq = amdgpu_vcn_using_unified_queue(ring); + uint32_t *ib_checksum; + uint32_t ib_pack_in_dw; + int i, r; + +- if (sq) ++ if (adev->vcn.using_unified_queue) + ib_size_dw += 8; + + r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, +@@ -746,7 +745,7 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring, + ib->length_dw = 0; + + /* single queue headers */ +- if (sq) { ++ if (adev->vcn.using_unified_queue) { + ib_pack_in_dw = sizeof(struct amdgpu_vcn_decode_buffer) / sizeof(uint32_t) + + 4 + 2; /* engine info + decoding ib in dw */ + ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, ib_pack_in_dw, false); +@@ -765,7 +764,7 @@ static int amdgpu_vcn_dec_sw_send_msg(struct amdgpu_ring *ring, + for (i = ib->length_dw; i < ib_size_dw; ++i) + ib->ptr[i] = 0x0; + +- if (sq) ++ if (adev->vcn.using_unified_queue) + amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, ib_pack_in_dw); + + r = amdgpu_job_submit_direct(job, ring, &f); +@@ -855,15 +854,15 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand + struct dma_fence **fence) + { + unsigned int ib_size_dw = 16; ++ struct amdgpu_device *adev = ring->adev; + struct amdgpu_job *job; + struct amdgpu_ib *ib; + struct dma_fence *f = NULL; + uint32_t *ib_checksum = NULL; + uint64_t addr; +- bool sq = amdgpu_vcn_using_unified_queue(ring); + int i, r; + +- if (sq) ++ if (adev->vcn.using_unified_queue) + ib_size_dw += 8; + + r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, +@@ -877,7 +876,7 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand + + ib->length_dw = 0; + +- if (sq) ++ if (adev->vcn.using_unified_queue) + ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, 0x11, true); + + ib->ptr[ib->length_dw++] = 0x00000018; +@@ -899,7 +898,7 @@ static int amdgpu_vcn_enc_get_create_msg(struct amdgpu_ring *ring, uint32_t hand + for (i = ib->length_dw; i < ib_size_dw; ++i) + ib->ptr[i] = 0x0; + +- if (sq) ++ if (adev->vcn.using_unified_queue) + amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, 0x11); + + r = amdgpu_job_submit_direct(job, ring, &f); +@@ -922,15 +921,15 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han + struct dma_fence **fence) + { + unsigned int ib_size_dw = 16; ++ struct amdgpu_device *adev = ring->adev; + struct amdgpu_job *job; + struct amdgpu_ib *ib; + struct dma_fence *f = NULL; + uint32_t *ib_checksum = NULL; + uint64_t addr; +- bool sq = amdgpu_vcn_using_unified_queue(ring); + int i, r; + +- if (sq) ++ if (adev->vcn.using_unified_queue) + ib_size_dw += 8; + + r = amdgpu_job_alloc_with_ib(ring->adev, NULL, NULL, +@@ -944,7 +943,7 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han + + ib->length_dw = 0; + +- if (sq) ++ if (adev->vcn.using_unified_queue) + ib_checksum = amdgpu_vcn_unified_ring_ib_header(ib, 0x11, true); + + ib->ptr[ib->length_dw++] = 0x00000018; +@@ -966,7 +965,7 @@ static int amdgpu_vcn_enc_get_destroy_msg(struct amdgpu_ring *ring, uint32_t han + for (i = ib->length_dw; i < ib_size_dw; ++i) + ib->ptr[i] = 0x0; + +- if (sq) ++ if (adev->vcn.using_unified_queue) + amdgpu_vcn_unified_ring_ib_checksum(&ib_checksum, 0x11); + + r = amdgpu_job_submit_direct(job, ring, &f); +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +index 9f06def236fd..1a5439abd1a0 100644 +--- 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.h +@@ -329,6 +329,7 @@ struct amdgpu_vcn { + + uint16_t inst_mask; + uint8_t num_inst_per_aid; ++ bool using_unified_queue; + }; + + struct amdgpu_fw_shared_rb_ptrs_struct { diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig -index 901d1961b739..05c49141f580 100644 +index 47b8b49da8a7..943959d1f401 100644 --- a/drivers/gpu/drm/amd/display/Kconfig +++ b/drivers/gpu/drm/amd/display/Kconfig @@ -51,4 +51,10 @@ config DRM_AMD_SECURE_DISPLAY @@ -7645,10 +6044,10 @@ index 901d1961b739..05c49141f580 100644 + endmenu diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -index 1eb0f82e9dfa..5e0c551759ab 100644 +index 3cdcadd41be1..8c0b165ec7fb 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c -@@ -4072,7 +4072,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) +@@ -4118,7 +4118,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) return r; } @@ -7658,7 +6057,7 @@ index 1eb0f82e9dfa..5e0c551759ab 100644 return -ENOMEM; #endif diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c -index c87b64e464ed..6fe07243adc3 100644 +index ebabfe3a512f..4d3ebcaacca1 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c @@ -97,7 +97,7 @@ static inline struct fixed31_32 amdgpu_dm_fixpt_from_s3132(__u64 x) @@ -7671,10 +6070,10 @@ index c87b64e464ed..6fe07243adc3 100644 * * AMD driver supports pre-defined mathematical functions for transferring diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c -index 6e715ef3a556..11c7199ec3b3 100644 +index e23a0a276e33..dd83cf50a89b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c -@@ -290,7 +290,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc) +@@ -338,7 +338,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc) } #endif @@ -7683,7 +6082,7 @@ index 6e715ef3a556..11c7199ec3b3 100644 /** * dm_crtc_additional_color_mgmt - enable additional color properties * @crtc: DRM CRTC -@@ -372,7 +372,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { +@@ -420,7 +420,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { #if defined(CONFIG_DEBUG_FS) .late_register = amdgpu_dm_crtc_late_register, #endif @@ -7692,7 +6091,7 @@ index 6e715ef3a556..11c7199ec3b3 100644 .atomic_set_property = amdgpu_dm_atomic_crtc_set_property, .atomic_get_property = amdgpu_dm_atomic_crtc_get_property, #endif -@@ -551,7 +551,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, +@@ -599,7 +599,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES); @@ -7732,11 +6131,59 @@ index 8a4c40b4c27e..779880c64575 100644 dm_atomic_plane_attach_color_mgmt_properties(dm, plane); #endif /* Create (reset) the plane state */ +diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c +index 5574bc628053..f109a101d84f 100644 +--- a/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c ++++ 
b/drivers/gpu/drm/amd/display/dc/optc/dcn10/dcn10_optc.c +@@ -945,19 +945,10 @@ void optc1_set_drr( + OTG_FORCE_LOCK_ON_EVENT, 0, + OTG_SET_V_TOTAL_MIN_MASK_EN, 0, + OTG_SET_V_TOTAL_MIN_MASK, 0); +- +- // Setup manual flow control for EOF via TRIG_A +- optc->funcs->setup_manual_trigger(optc); +- +- } else { +- REG_UPDATE_4(OTG_V_TOTAL_CONTROL, +- OTG_SET_V_TOTAL_MIN_MASK, 0, +- OTG_V_TOTAL_MIN_SEL, 0, +- OTG_V_TOTAL_MAX_SEL, 0, +- OTG_FORCE_LOCK_ON_EVENT, 0); +- +- optc->funcs->set_vtotal_min_max(optc, 0, 0); + } ++ ++ // Setup manual flow control for EOF via TRIG_A ++ optc->funcs->setup_manual_trigger(optc); + } + + void optc1_set_vtotal_min_max(struct timing_generator *optc, int vtotal_min, int vtotal_max) +diff --git a/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c b/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c +index d6f095b4555d..58bdbd859bf9 100644 +--- a/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c ++++ b/drivers/gpu/drm/amd/display/dc/optc/dcn20/dcn20_optc.c +@@ -462,6 +462,16 @@ void optc2_setup_manual_trigger(struct timing_generator *optc) + { + struct optc *optc1 = DCN10TG_FROM_TG(optc); + ++ /* Set the min/max selectors unconditionally so that ++ * DMCUB fw may change OTG timings when necessary ++ * TODO: Remove the w/a after fixing the issue in DMCUB firmware ++ */ ++ REG_UPDATE_4(OTG_V_TOTAL_CONTROL, ++ OTG_V_TOTAL_MIN_SEL, 1, ++ OTG_V_TOTAL_MAX_SEL, 1, ++ OTG_FORCE_LOCK_ON_EVENT, 0, ++ OTG_SET_V_TOTAL_MIN_MASK, (1 << 1)); /* TRIGA */ ++ + REG_SET_8(OTG_TRIGA_CNTL, 0, + OTG_TRIGA_SOURCE_SELECT, 21, + OTG_TRIGA_SOURCE_PIPE_SELECT, optc->inst, diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c -index 39c5e1dfa275..ee97bb26a8ef 100644 +index c11952a4389b..52f54a228b39 100644 --- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c -@@ -3034,6 +3034,9 @@ static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev, +@@ -3155,6 +3155,9 @@ static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev, struct device_attribute *attr, char *buf) { @@ -7747,10 +6194,10 @@ index 39c5e1dfa275..ee97bb26a8ef 100644 } diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -index 0ad947df777a..7b82e3ef7c91 100644 +index e1796ecf9c05..5e46bd293205 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c -@@ -2695,7 +2695,10 @@ int smu_get_power_limit(void *handle, +@@ -2749,7 +2749,10 @@ int smu_get_power_limit(void *handle, *limit = smu->max_power_limit; break; case SMU_PPT_LIMIT_MIN: @@ -7762,7 +6209,7 @@ index 0ad947df777a..7b82e3ef7c91 100644 break; default: return -EINVAL; -@@ -2719,7 +2722,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) +@@ -2773,7 +2776,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) if (smu->ppt_funcs->set_power_limit) return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); @@ -7778,11 +6225,39 @@ index 0ad947df777a..7b82e3ef7c91 100644 dev_err(smu->adev->dev, "New power limit (%d) is out of range [%d,%d]\n", limit, smu->min_power_limit, smu->max_power_limit); +diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c +index fc16fddee5c5..05b21fe9b395 100644 +--- a/drivers/gpu/drm/drm_atomic_uapi.c ++++ b/drivers/gpu/drm/drm_atomic_uapi.c +@@ -1066,21 +1066,14 @@ int drm_atomic_set_property(struct drm_atomic_state *state, + break; + } + +- if (async_flip && prop != config->prop_fb_id) { ++ if (async_flip && (prop != 
config->prop_fb_id || ++ plane_state->plane->type != DRM_PLANE_TYPE_PRIMARY)) { + ret = drm_atomic_plane_get_property(plane, plane_state, + prop, &old_val); + ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); + break; + } + +- if (async_flip && plane_state->plane->type != DRM_PLANE_TYPE_PRIMARY) { +- drm_dbg_atomic(prop->dev, +- "[OBJECT:%d] Only primary planes can be changed during async flip\n", +- obj->id); +- ret = -EINVAL; +- break; +- } +- + ret = drm_atomic_plane_set_property(plane, + plane_state, file_priv, + prop, prop_value); diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig -index 28eb48dd5b32..1cf4c700b108 100644 +index fe6e8a1bb607..1488a904e3bf 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig -@@ -230,6 +230,15 @@ config I2C_CHT_WC +@@ -238,6 +238,15 @@ config I2C_CHT_WC combined with a FUSB302 Type-C port-controller as such it is advised to also select CONFIG_TYPEC_FUSB302=m. @@ -7797,9 +6272,9 @@ index 28eb48dd5b32..1cf4c700b108 100644 + config I2C_NFORCE2 tristate "Nvidia nForce2, nForce3 and nForce4" - depends on PCI + depends on PCI && HAS_IOPORT diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile -index aa0ee8ecd6f2..020714113e9a 100644 +index 78d0561339e5..9ea3a294f9f0 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_I2C_CHT_WC) += i2c-cht-wc.o @@ -7812,7 +6287,7 @@ index aa0ee8ecd6f2..020714113e9a 100644 obj-$(CONFIG_I2C_NVIDIA_GPU) += i2c-nvidia-gpu.o diff --git a/drivers/i2c/busses/i2c-nct6775.c b/drivers/i2c/busses/i2c-nct6775.c new file mode 100644 -index 000000000000..e919d1e10c51 +index 000000000000..fdbd9a1c8d7a --- /dev/null +++ b/drivers/i2c/busses/i2c-nct6775.c @@ -0,0 +1,648 @@ @@ -8194,7 +6669,7 @@ index 000000000000..e919d1e10c51 + } + + adap->owner = THIS_MODULE; -+ adap->class = I2C_CLASS_HWMON | I2C_CLASS_SPD; ++ adap->class = I2C_CLASS_HWMON; + adap->algo = &smbus_algorithm; + + adapdata = kzalloc(sizeof(*adapdata), GFP_KERNEL); @@ -8547,10 +7022,10 @@ index 51e0c4954600..35c3ad741870 100644 } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 59445763e55a..568f85414c85 100644 +index 1b7a97cc3779..37e9e43908ab 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c -@@ -3271,6 +3271,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) +@@ -3284,6 +3284,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) goto bad; } @@ -8562,6 +7037,3779 @@ index 59445763e55a..568f85414c85 100644 ret = crypt_ctr_cipher(ti, argv[0], argv[1]); if (ret < 0) goto bad; +diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig +index 331b8e535e5b..80dabeebf580 100644 +--- a/drivers/media/v4l2-core/Kconfig ++++ b/drivers/media/v4l2-core/Kconfig +@@ -40,6 +40,11 @@ config VIDEO_TUNER + config V4L2_JPEG_HELPER + tristate + ++config V4L2_LOOPBACK ++ tristate "V4L2 loopback device" ++ help ++ V4L2 loopback device ++ + # Used by drivers that need v4l2-h264.ko + config V4L2_H264 + tristate +diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile +index 2177b9d63a8f..c179507cedc4 100644 +--- a/drivers/media/v4l2-core/Makefile ++++ b/drivers/media/v4l2-core/Makefile +@@ -33,5 +33,7 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o + obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o + obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o + ++obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o ++ + obj-$(CONFIG_VIDEO_TUNER) += tuner.o + 
obj-$(CONFIG_VIDEO_DEV) += v4l2-dv-timings.o videodev.o +diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c +new file mode 100644 +index 000000000000..25cb1beb26e5 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.c +@@ -0,0 +1,3184 @@ ++/* -*- c-file-style: "linux" -*- */ ++/* ++ * v4l2loopback.c -- video4linux2 loopback driver ++ * ++ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com) ++ * Copyright (C) 2010-2023 IOhannes m zmoelnig (zmoelnig@iem.at) ++ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de) ++ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com) ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. ++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "v4l2loopback.h" ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) ++#error This module is not supported on kernels before 4.0.0. ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) ++#define strscpy strlcpy ++#endif ++ ++#if defined(timer_setup) && defined(from_timer) ++#define HAVE_TIMER_SETUP ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) ++#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER ++#endif ++ ++#define V4L2LOOPBACK_VERSION_CODE \ ++ KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \ ++ V4L2LOOPBACK_VERSION_BUGFIX) ++ ++MODULE_DESCRIPTION("V4L2 loopback video device"); ++MODULE_AUTHOR("Vasily Levin, " ++ "IOhannes m zmoelnig ," ++ "Stefan Diewald," ++ "Anton Novikov" ++ "et al."); ++#ifdef SNAPSHOT_VERSION ++MODULE_VERSION(__stringify(SNAPSHOT_VERSION)); ++#else ++MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify( ++ V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX)); ++#endif ++MODULE_LICENSE("GPL"); ++ ++/* ++ * helpers ++ */ ++#define dprintk(fmt, args...) \ ++ do { \ ++ if (debug > 0) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++#define MARK() \ ++ do { \ ++ if (debug > 1) { \ ++ printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \ ++ __LINE__, __func__, task_pid_nr(current)); \ ++ } \ ++ } while (0) ++ ++#define dprintkrw(fmt, args...) 
\ ++ do { \ ++ if (debug > 2) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++static inline void v4l2l_get_timestamp(struct v4l2_buffer *b) ++{ ++ struct timespec64 ts; ++ ktime_get_ts64(&ts); ++ ++ b->timestamp.tv_sec = ts.tv_sec; ++ b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC); ++ b->flags |= V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; ++} ++ ++#if BITS_PER_LONG == 32 ++#include /* do_div() for 64bit division */ ++static inline int v4l2l_mod64(const s64 A, const u32 B) ++{ ++ u64 a = (u64)A; ++ u32 b = B; ++ ++ if (A > 0) ++ return do_div(a, b); ++ a = -A; ++ return -do_div(a, b); ++} ++#else ++static inline int v4l2l_mod64(const s64 A, const u32 B) ++{ ++ return A % B; ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) ++typedef unsigned __poll_t; ++#endif ++ ++/* module constants ++ * can be overridden during he build process using something like ++ * make KCPPFLAGS="-DMAX_DEVICES=100" ++ */ ++ ++/* maximum number of v4l2loopback devices that can be created */ ++#ifndef MAX_DEVICES ++#define MAX_DEVICES 8 ++#endif ++ ++/* whether the default is to announce capabilities exclusively or not */ ++#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0 ++#endif ++ ++/* when a producer is considered to have gone stale */ ++#ifndef MAX_TIMEOUT ++#define MAX_TIMEOUT (100 * 1000) /* in msecs */ ++#endif ++ ++/* max buffers that can be mapped, actually they ++ * are all mapped to max_buffers buffers */ ++#ifndef MAX_BUFFERS ++#define MAX_BUFFERS 32 ++#endif ++ ++/* module parameters */ ++static int debug = 0; ++module_param(debug, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)"); ++ ++#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2 ++static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS; ++module_param(max_buffers, int, S_IRUGO); ++MODULE_PARM_DESC(max_buffers, ++ "how many buffers should be allocated [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]"); ++ ++/* how many times a device can be opened ++ * the per-module default value can be overridden on a per-device basis using ++ * the /sys/devices interface ++ * ++ * note that max_openers should be at least 2 in order to get a working system: ++ * one opener for the producer and one opener for the consumer ++ * however, we leave that to the user ++ */ ++#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10 ++static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS; ++module_param(max_openers, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC( ++ max_openers, ++ "how many users can open the loopback device [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]"); ++ ++static int devices = -1; ++module_param(devices, int, 0); ++MODULE_PARM_DESC(devices, "how many devices should be created"); ++ ++static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 }; ++module_param_array(video_nr, int, NULL, 0444); ++MODULE_PARM_DESC(video_nr, ++ "video device numbers (-1=auto, 0=/dev/video0, etc.)"); ++ ++static char *card_label[MAX_DEVICES]; ++module_param_array(card_label, charp, NULL, 0000); ++MODULE_PARM_DESC(card_label, "card labels for each device"); ++ ++static bool exclusive_caps[MAX_DEVICES] = { ++ [0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++}; ++module_param_array(exclusive_caps, bool, NULL, 0444); ++/* FIXXME: wording */ ++MODULE_PARM_DESC( ++ exclusive_caps, ++ "whether to announce OUTPUT/CAPTURE 
capabilities exclusively or not [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]"); ++ ++/* format specifications */ ++#define V4L2LOOPBACK_SIZE_MIN_WIDTH 2 ++#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 1 ++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192 ++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192 ++ ++#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640 ++#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480 ++ ++static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++module_param(max_width, int, S_IRUGO); ++MODULE_PARM_DESC(max_width, ++ "maximum allowed frame width [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]"); ++static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++module_param(max_height, int, S_IRUGO); ++MODULE_PARM_DESC(max_height, ++ "maximum allowed frame height [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]"); ++ ++static DEFINE_IDR(v4l2loopback_index_idr); ++static DEFINE_MUTEX(v4l2loopback_ctl_mutex); ++ ++/* frame intervals */ ++#define V4L2LOOPBACK_FPS_MIN 0 ++#define V4L2LOOPBACK_FPS_MAX 1000 ++ ++/* control IDs */ ++#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000) ++#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0) ++#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1) ++#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2) ++#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3) ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl); ++static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = { ++ .s_ctrl = v4l2loopback_s_ctrl, ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_KEEP_FORMAT, ++ .name = "keep_format", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_SUSTAIN_FRAMERATE, ++ .name = "sustain_framerate", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT, ++ .name = "timeout", ++ .type = V4L2_CTRL_TYPE_INTEGER, ++ .min = 0, ++ .max = MAX_TIMEOUT, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT_IMAGE_IO, ++ .name = "timeout_image_io", ++ .type = V4L2_CTRL_TYPE_BUTTON, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++ ++/* module structures */ ++struct v4l2loopback_private { ++ int device_nr; ++}; ++ ++/* TODO(vasaka) use typenames which are common to kernel, but first find out if ++ * it is needed */ ++/* struct keeping state and settings of loopback device */ ++ ++struct v4l2l_buffer { ++ struct v4l2_buffer buffer; ++ struct list_head list_head; ++ int use_count; ++}; ++ ++struct v4l2_loopback_device { ++ struct v4l2_device v4l2_dev; ++ struct v4l2_ctrl_handler ctrl_handler; ++ struct video_device *vdev; ++ /* pixel and stream format */ ++ struct v4l2_pix_format pix_format; ++ bool pix_format_has_valid_sizeimage; ++ struct v4l2_captureparm capture_param; ++ unsigned long frame_jiffies; ++ ++ /* ctrls */ ++ int keep_format; /* CID_KEEP_FORMAT; stay ready_for_capture even when all ++ 
openers close() the device */ ++ int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain ++ (close to) nominal framerate */ ++ ++ /* buffers stuff */ ++ u8 *image; /* pointer to actual buffers data */ ++ unsigned long int imagesize; /* size of buffers data */ ++ int buffers_number; /* should not be big, 4 is a good choice */ ++ struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */ ++ int used_buffers; /* number of the actually used buffers */ ++ int max_openers; /* how many times can this device be opened */ ++ ++ s64 write_position; /* number of last written frame + 1 */ ++ struct list_head outbufs_list; /* buffers in output DQBUF order */ ++ int bufpos2index ++ [MAX_BUFFERS]; /* mapping of (read/write_position % used_buffers) ++ * to inner buffer index */ ++ long buffer_size; ++ ++ /* sustain_framerate stuff */ ++ struct timer_list sustain_timer; ++ unsigned int reread_count; ++ ++ /* timeout stuff */ ++ unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */ ++ int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will ++ * read/write to timeout_image */ ++ u8 *timeout_image; /* copy of it will be captured when timeout passes */ ++ struct v4l2l_buffer timeout_image_buffer; ++ struct timer_list timeout_timer; ++ int timeout_happened; ++ ++ /* sync stuff */ ++ atomic_t open_count; ++ ++ int ready_for_capture; /* set to the number of writers that opened the ++ * device and negotiated format. */ ++ int ready_for_output; /* set to true when no writer is currently attached ++ * this differs slightly from !ready_for_capture, ++ * e.g. when using fallback images */ ++ int active_readers; /* increase if any reader starts streaming */ ++ int announce_all_caps; /* set to false, if device caps (OUTPUT/CAPTURE) ++ * should only be announced if the resp. 
"ready" ++ * flag is set; default=TRUE */ ++ ++ int min_width, max_width; ++ int min_height, max_height; ++ ++ char card_label[32]; ++ ++ wait_queue_head_t read_event; ++ spinlock_t lock, list_lock; ++}; ++ ++/* types of opener shows what opener wants to do with loopback */ ++enum opener_type { ++ // clang-format off ++ UNNEGOTIATED = 0, ++ READER = 1, ++ WRITER = 2, ++ // clang-format on ++}; ++ ++/* struct keeping state and type of opener */ ++struct v4l2_loopback_opener { ++ enum opener_type type; ++ s64 read_position; /* number of last processed frame + 1 or ++ * write_position - 1 if reader went out of sync */ ++ unsigned int reread_count; ++ struct v4l2_buffer *buffers; ++ int buffers_number; /* should not be big, 4 is a good choice */ ++ int timeout_image_io; ++ ++ struct v4l2_fh fh; ++}; ++ ++#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh) ++ ++/* this is heavily inspired by the bttv driver found in the linux kernel */ ++struct v4l2l_format { ++ char *name; ++ int fourcc; /* video4linux 2 */ ++ int depth; /* bit/pixel */ ++ int flags; ++}; ++/* set the v4l2l_format.flags to PLANAR for non-packed formats */ ++#define FORMAT_FLAGS_PLANAR 0x01 ++#define FORMAT_FLAGS_COMPRESSED 0x02 ++ ++#include "v4l2loopback_formats.h" ++ ++#ifndef V4L2_TYPE_IS_CAPTURE ++#define V4L2_TYPE_IS_CAPTURE(type) \ ++ ((type) == V4L2_BUF_TYPE_VIDEO_CAPTURE || \ ++ (type) == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) ++#endif /* V4L2_TYPE_IS_CAPTURE */ ++#ifndef V4L2_TYPE_IS_OUTPUT ++#define V4L2_TYPE_IS_OUTPUT(type) \ ++ ((type) == V4L2_BUF_TYPE_VIDEO_OUTPUT || \ ++ (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) ++#endif /* V4L2_TYPE_IS_OUTPUT */ ++ ++/* whether the format can be changed */ ++/* the format is fixated if we ++ - have writers (ready_for_capture>0) ++ - and/or have readers (active_readers>0) ++*/ ++#define V4L2LOOPBACK_IS_FIXED_FMT(device) \ ++ (device->ready_for_capture > 0 || device->active_readers > 0 || \ ++ device->keep_format) ++ ++static const unsigned int FORMATS = ARRAY_SIZE(formats); ++ ++static char *fourcc2str(unsigned int fourcc, char buf[4]) ++{ ++ buf[0] = (fourcc >> 0) & 0xFF; ++ buf[1] = (fourcc >> 8) & 0xFF; ++ buf[2] = (fourcc >> 16) & 0xFF; ++ buf[3] = (fourcc >> 24) & 0xFF; ++ ++ return buf; ++} ++ ++static const struct v4l2l_format *format_by_fourcc(int fourcc) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < FORMATS; i++) { ++ if (formats[i].fourcc == fourcc) ++ return formats + i; ++ } ++ ++ dprintk("unsupported format '%c%c%c%c'\n", (fourcc >> 0) & 0xFF, ++ (fourcc >> 8) & 0xFF, (fourcc >> 16) & 0xFF, ++ (fourcc >> 24) & 0xFF); ++ return NULL; ++} ++ ++static void pix_format_set_size(struct v4l2_pix_format *f, ++ const struct v4l2l_format *fmt, ++ unsigned int width, unsigned int height) ++{ ++ f->width = width; ++ f->height = height; ++ ++ if (fmt->flags & FORMAT_FLAGS_PLANAR) { ++ f->bytesperline = width; /* Y plane */ ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) { ++ /* doesn't make sense for compressed formats */ ++ f->bytesperline = 0; ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else { ++ f->bytesperline = (width * fmt->depth) >> 3; ++ f->sizeimage = height * f->bytesperline; ++ } ++} ++ ++static int v4l2l_fill_format(struct v4l2_format *fmt, int capture, ++ const u32 minwidth, const u32 maxwidth, ++ const u32 minheight, const u32 maxheight) ++{ ++ u32 width = fmt->fmt.pix.width, height = fmt->fmt.pix.height; ++ u32 pixelformat = fmt->fmt.pix.pixelformat; ++ struct 
v4l2_format fmt0 = *fmt; ++ u32 bytesperline = 0, sizeimage = 0; ++ if (!width) ++ width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; ++ if (!height) ++ height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; ++ if (width < minwidth) ++ width = minwidth; ++ if (width > maxwidth) ++ width = maxwidth; ++ if (height < minheight) ++ height = minheight; ++ if (height > maxheight) ++ height = maxheight; ++ ++ /* sets: width,height,pixelformat,bytesperline,sizeimage */ ++ if (!(V4L2_TYPE_IS_MULTIPLANAR(fmt0.type))) { ++ fmt0.fmt.pix.bytesperline = 0; ++ fmt0.fmt.pix.sizeimage = 0; ++ } ++ ++ if (0) { ++ ; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) ++ } else if (!v4l2_fill_pixfmt(&fmt0.fmt.pix, pixelformat, width, ++ height)) { ++ ; ++ } else if (!v4l2_fill_pixfmt_mp(&fmt0.fmt.pix_mp, pixelformat, width, ++ height)) { ++ ; ++#endif ++ } else { ++ const struct v4l2l_format *format = ++ format_by_fourcc(pixelformat); ++ if (!format) ++ return -EINVAL; ++ pix_format_set_size(&fmt0.fmt.pix, format, width, height); ++ fmt0.fmt.pix.pixelformat = format->fourcc; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt0.type)) { ++ *fmt = fmt0; ++ ++ if ((fmt->fmt.pix_mp.colorspace == V4L2_COLORSPACE_DEFAULT) || ++ (fmt->fmt.pix_mp.colorspace > V4L2_COLORSPACE_DCI_P3)) ++ fmt->fmt.pix_mp.colorspace = V4L2_COLORSPACE_SRGB; ++ if (V4L2_FIELD_ANY == fmt->fmt.pix_mp.field) ++ fmt->fmt.pix_mp.field = V4L2_FIELD_NONE; ++ if (capture) ++ fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE; ++ else ++ fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE; ++ } else { ++ bytesperline = fmt->fmt.pix.bytesperline; ++ sizeimage = fmt->fmt.pix.sizeimage; ++ ++ *fmt = fmt0; ++ ++ if (!fmt->fmt.pix.bytesperline) ++ fmt->fmt.pix.bytesperline = bytesperline; ++ if (!fmt->fmt.pix.sizeimage) ++ fmt->fmt.pix.sizeimage = sizeimage; ++ ++ if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) || ++ (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3)) ++ fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; ++ if (V4L2_FIELD_ANY == fmt->fmt.pix.field) ++ fmt->fmt.pix.field = V4L2_FIELD_NONE; ++ if (capture) ++ fmt->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ else ++ fmt->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ } ++ ++ return 0; ++} ++ ++/* Checks if v4l2l_fill_format() has set a valid, fixed sizeimage val. */ ++static bool v4l2l_pix_format_has_valid_sizeimage(struct v4l2_format *fmt) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) ++ const struct v4l2_format_info *info; ++ ++ info = v4l2_format_info(fmt->fmt.pix.pixelformat); ++ if (info && info->mem_planes == 1) ++ return true; ++#endif ++ ++ return false; ++} ++ ++static int pix_format_eq(const struct v4l2_pix_format *ref, ++ const struct v4l2_pix_format *tgt, int strict) ++{ ++ /* check if the two formats are equivalent. 
++ * ANY fields are handled gracefully ++ */ ++#define _pix_format_eq0(x) \ ++ if (ref->x != tgt->x) \ ++ result = 0 ++#define _pix_format_eq1(x, def) \ ++ do { \ ++ if ((def != tgt->x) && (ref->x != tgt->x)) { \ ++ printk(KERN_INFO #x " failed"); \ ++ result = 0; \ ++ } \ ++ } while (0) ++ int result = 1; ++ _pix_format_eq0(width); ++ _pix_format_eq0(height); ++ _pix_format_eq0(pixelformat); ++ if (!strict) ++ return result; ++ _pix_format_eq1(field, V4L2_FIELD_ANY); ++ _pix_format_eq0(bytesperline); ++ _pix_format_eq0(sizeimage); ++ _pix_format_eq1(colorspace, V4L2_COLORSPACE_DEFAULT); ++ return result; ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f); ++static int inner_try_setfmt(struct file *file, struct v4l2_format *fmt) ++{ ++ int capture = V4L2_TYPE_IS_CAPTURE(fmt->type); ++ struct v4l2_loopback_device *dev; ++ int needschange = 0; ++ char buf[5]; ++ buf[4] = 0; ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ needschange = !(pix_format_eq(&dev->pix_format, &fmt->fmt.pix, 0)); ++ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { ++ fmt->fmt.pix = dev->pix_format; ++ if (needschange) { ++ if (dev->active_readers > 0 && capture) { ++ /* cannot call fmt_cap while there are readers */ ++ return -EBUSY; ++ } ++ if (dev->ready_for_capture > 0 && !capture) { ++ /* cannot call fmt_out while there are writers */ ++ return -EBUSY; ++ } ++ } ++ } ++ if (v4l2l_fill_format(fmt, capture, dev->min_width, dev->max_width, ++ dev->min_height, dev->max_height) != 0) { ++ return -EINVAL; ++ } ++ ++ if (1) { ++ char buf[5]; ++ buf[4] = 0; ++ dprintk("capFOURCC=%s\n", ++ fourcc2str(dev->pix_format.pixelformat, buf)); ++ } ++ return 0; ++} ++ ++static int set_timeperframe(struct v4l2_loopback_device *dev, ++ struct v4l2_fract *tpf) ++{ ++ if ((tpf->denominator < 1) || (tpf->numerator < 1)) { ++ return -EINVAL; ++ } ++ dev->capture_param.timeperframe = *tpf; ++ dev->frame_jiffies = max(1UL, msecs_to_jiffies(1000) * tpf->numerator / ++ tpf->denominator); ++ return 0; ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd); ++ ++/* device attributes */ ++/* available via sysfs: /sys/devices/virtual/video4linux/video* */ ++ ++static ssize_t attr_show_format(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ /* gets the current format as "FOURCC:WxH@f/s", e.g. 
"YUYV:320x240@1000/30" */ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ const struct v4l2_fract *tpf; ++ char buf4cc[5], buf_fps[32]; ++ ++ if (!dev || !V4L2LOOPBACK_IS_FIXED_FMT(dev)) ++ return 0; ++ tpf = &dev->capture_param.timeperframe; ++ ++ fourcc2str(dev->pix_format.pixelformat, buf4cc); ++ buf4cc[4] = 0; ++ if (tpf->numerator == 1) ++ snprintf(buf_fps, sizeof(buf_fps), "%d", tpf->denominator); ++ else ++ snprintf(buf_fps, sizeof(buf_fps), "%d/%d", tpf->denominator, ++ tpf->numerator); ++ return sprintf(buf, "%4s:%dx%d@%s\n", buf4cc, dev->pix_format.width, ++ dev->pix_format.height, buf_fps); ++} ++ ++static ssize_t attr_store_format(struct device *cd, ++ struct device_attribute *attr, const char *buf, ++ size_t len) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ int fps_num = 0, fps_den = 1; ++ ++ if (!dev) ++ return -ENODEV; ++ ++ /* only fps changing is supported */ ++ if (sscanf(buf, "@%d/%d", &fps_num, &fps_den) > 0) { ++ struct v4l2_fract f = { .numerator = fps_den, ++ .denominator = fps_num }; ++ int err = 0; ++ if ((err = set_timeperframe(dev, &f)) < 0) ++ return err; ++ return len; ++ } ++ return -EINVAL; ++} ++ ++static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format, ++ attr_store_format); ++ ++static ssize_t attr_show_buffers(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, "%d\n", dev->used_buffers); ++} ++ ++static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL); ++ ++static ssize_t attr_show_maxopeners(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, "%d\n", dev->max_openers); ++} ++ ++static ssize_t attr_store_maxopeners(struct device *cd, ++ struct device_attribute *attr, ++ const char *buf, size_t len) ++{ ++ struct v4l2_loopback_device *dev = NULL; ++ unsigned long curr = 0; ++ ++ if (kstrtoul(buf, 0, &curr)) ++ return -EINVAL; ++ ++ dev = v4l2loopback_cd2dev(cd); ++ if (!dev) ++ return -ENODEV; ++ ++ if (dev->max_openers == curr) ++ return len; ++ ++ if (curr > __INT_MAX__ || dev->open_count.counter > curr) { ++ /* request to limit to less openers as are currently attached to us */ ++ return -EINVAL; ++ } ++ ++ dev->max_openers = (int)curr; ++ ++ return len; ++} ++ ++static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners, ++ attr_store_maxopeners); ++ ++static ssize_t attr_show_state(struct device *cd, struct device_attribute *attr, ++ char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ if (dev->ready_for_capture) ++ return sprintf(buf, "capture\n"); ++ if (dev->ready_for_output) ++ return sprintf(buf, "output\n"); ++ ++ return -EAGAIN; ++} ++ ++static DEVICE_ATTR(state, S_IRUGO, attr_show_state, NULL); ++ ++static void v4l2loopback_remove_sysfs(struct video_device *vdev) ++{ ++#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x) ++ ++ if (vdev) { ++ V4L2_SYSFS_DESTROY(format); ++ V4L2_SYSFS_DESTROY(buffers); ++ V4L2_SYSFS_DESTROY(max_openers); ++ V4L2_SYSFS_DESTROY(state); ++ /* ... 
*/ ++ } ++} ++ ++static void v4l2loopback_create_sysfs(struct video_device *vdev) ++{ ++ int res = 0; ++ ++#define V4L2_SYSFS_CREATE(x) \ ++ res = device_create_file(&vdev->dev, &dev_attr_##x); \ ++ if (res < 0) \ ++ break ++ if (!vdev) ++ return; ++ do { ++ V4L2_SYSFS_CREATE(format); ++ V4L2_SYSFS_CREATE(buffers); ++ V4L2_SYSFS_CREATE(max_openers); ++ V4L2_SYSFS_CREATE(state); ++ /* ... */ ++ } while (0); ++ ++ if (res >= 0) ++ return; ++ dev_err(&vdev->dev, "%s error: %d\n", __func__, res); ++} ++ ++/* Event APIs */ ++ ++#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START) ++#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000 ++#define V4L2_EVENT_PRI_CLIENT_USAGE \ ++ (V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1) ++ ++struct v4l2_event_client_usage { ++ __u32 count; ++}; ++ ++/* global module data */ ++/* find a device based on it's device-number (e.g. '3' for /dev/video3) */ ++struct v4l2loopback_lookup_cb_data { ++ int device_nr; ++ struct v4l2_loopback_device *device; ++}; ++static int v4l2loopback_lookup_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *device = ptr; ++ struct v4l2loopback_lookup_cb_data *cbdata = data; ++ if (cbdata && device && device->vdev) { ++ if (device->vdev->num == cbdata->device_nr) { ++ cbdata->device = device; ++ cbdata->device_nr = id; ++ return 1; ++ } ++ } ++ return 0; ++} ++static int v4l2loopback_lookup(int device_nr, ++ struct v4l2_loopback_device **device) ++{ ++ struct v4l2loopback_lookup_cb_data data = { ++ .device_nr = device_nr, ++ .device = NULL, ++ }; ++ int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb, ++ &data); ++ if (1 == err) { ++ if (device) ++ *device = data.device; ++ return data.device_nr; ++ } ++ return -ENODEV; ++} ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd) ++{ ++ struct video_device *loopdev = to_video_device(cd); ++ struct v4l2loopback_private *ptr = ++ (struct v4l2loopback_private *)video_get_drvdata(loopdev); ++ int nr = ptr->device_nr; ++ ++ return idr_find(&v4l2loopback_index_idr, nr); ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f) ++{ ++ struct v4l2loopback_private *ptr = video_drvdata(f); ++ int nr = ptr->device_nr; ++ ++ return idr_find(&v4l2loopback_index_idr, nr); ++} ++ ++/* forward declarations */ ++static void client_usage_queue_event(struct video_device *vdev); ++static void init_buffers(struct v4l2_loopback_device *dev); ++static int allocate_buffers(struct v4l2_loopback_device *dev); ++static void free_buffers(struct v4l2_loopback_device *dev); ++static void try_free_buffers(struct v4l2_loopback_device *dev); ++static int allocate_timeout_image(struct v4l2_loopback_device *dev); ++static void check_timers(struct v4l2_loopback_device *dev); ++static const struct v4l2_file_operations v4l2_loopback_fops; ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops; ++ ++/* Queue helpers */ ++/* next functions sets buffer flags and adjusts counters accordingly */ ++static inline void set_done(struct v4l2l_buffer *buffer) ++{ ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; ++ buffer->buffer.flags |= V4L2_BUF_FLAG_DONE; ++} ++ ++static inline void set_queued(struct v4l2l_buffer *buffer) ++{ ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; ++ buffer->buffer.flags |= V4L2_BUF_FLAG_QUEUED; ++} ++ ++static inline void unset_flags(struct v4l2l_buffer *buffer) ++{ ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_QUEUED; ++ buffer->buffer.flags &= ~V4L2_BUF_FLAG_DONE; ++} ++ ++/* V4L2 ioctl caps and params calls 
*/ ++/* returns device capabilities ++ * called on VIDIOC_QUERYCAP ++ */ ++static int vidioc_querycap(struct file *file, void *priv, ++ struct v4l2_capability *cap) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ int device_nr = ++ ((struct v4l2loopback_private *)video_get_drvdata(dev->vdev)) ++ ->device_nr; ++ __u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; ++ ++ strscpy(cap->driver, "v4l2 loopback", sizeof(cap->driver)); ++ snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label); ++ snprintf(cap->bus_info, sizeof(cap->bus_info), ++ "platform:v4l2loopback-%03d", device_nr); ++ ++ if (dev->announce_all_caps) { ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT; ++ } else { ++ if (dev->ready_for_capture) { ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE; ++ } ++ if (dev->ready_for_output) { ++ capabilities |= V4L2_CAP_VIDEO_OUTPUT; ++ } ++ } ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ dev->vdev->device_caps = ++#endif /* >=linux-4.7.0 */ ++ cap->device_caps = cap->capabilities = capabilities; ++ ++ cap->capabilities |= V4L2_CAP_DEVICE_CAPS; ++ ++ memset(cap->reserved, 0, sizeof(cap->reserved)); ++ return 0; ++} ++ ++static int vidioc_enum_framesizes(struct file *file, void *fh, ++ struct v4l2_frmsizeenum *argp) ++{ ++ struct v4l2_loopback_device *dev; ++ ++ /* there can be only one... */ ++ if (argp->index) ++ return -EINVAL; ++ ++ dev = v4l2loopback_getdevice(file); ++ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { ++ /* format has already been negotiated ++ * cannot change during runtime ++ */ ++ if (argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; ++ ++ argp->discrete.width = dev->pix_format.width; ++ argp->discrete.height = dev->pix_format.height; ++ } else { ++ /* if the format has not been negotiated yet, we accept anything ++ */ ++ if (NULL == format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ if (dev->min_width == dev->max_width && ++ dev->min_height == dev->max_height) { ++ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; ++ ++ argp->discrete.width = dev->min_width; ++ argp->discrete.height = dev->min_height; ++ } else { ++ argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS; ++ ++ argp->stepwise.min_width = dev->min_width; ++ argp->stepwise.min_height = dev->min_height; ++ ++ argp->stepwise.max_width = dev->max_width; ++ argp->stepwise.max_height = dev->max_height; ++ ++ argp->stepwise.step_width = 1; ++ argp->stepwise.step_height = 1; ++ } ++ } ++ return 0; ++} ++ ++/* returns frameinterval (fps) for the set resolution ++ * called on VIDIOC_ENUM_FRAMEINTERVALS ++ */ ++static int vidioc_enum_frameintervals(struct file *file, void *fh, ++ struct v4l2_frmivalenum *argp) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ ++ /* there can be only one... 
*/ ++ if (argp->index) ++ return -EINVAL; ++ ++ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { ++ if (argp->width != dev->pix_format.width || ++ argp->height != dev->pix_format.height || ++ argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_DISCRETE; ++ argp->discrete = dev->capture_param.timeperframe; ++ } else { ++ if (argp->width < dev->min_width || ++ argp->width > dev->max_width || ++ argp->height < dev->min_height || ++ argp->height > dev->max_height || ++ NULL == format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS; ++ argp->stepwise.min.numerator = 1; ++ argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX; ++ argp->stepwise.max.numerator = 1; ++ argp->stepwise.max.denominator = V4L2LOOPBACK_FPS_MIN; ++ argp->stepwise.step.numerator = 1; ++ argp->stepwise.step.denominator = 1; ++ } ++ ++ return 0; ++} ++ ++/* ------------------ CAPTURE ----------------------- */ ++ ++/* returns device formats ++ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_enum_fmt_cap(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ struct v4l2_loopback_device *dev; ++ const struct v4l2l_format *fmt; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (f->index) ++ return -EINVAL; ++ ++ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { ++ /* format has been fixed, so only one single format is supported */ ++ const __u32 format = dev->pix_format.pixelformat; ++ ++ if ((fmt = format_by_fourcc(format))) { ++ snprintf(f->description, sizeof(f->description), "%s", ++ fmt->name); ++ } else { ++ snprintf(f->description, sizeof(f->description), ++ "[%c%c%c%c]", (format >> 0) & 0xFF, ++ (format >> 8) & 0xFF, (format >> 16) & 0xFF, ++ (format >> 24) & 0xFF); ++ } ++ ++ f->pixelformat = dev->pix_format.pixelformat; ++ } else { ++ return -EINVAL; ++ } ++ f->flags = 0; ++ MARK(); ++ return 0; ++} ++ ++/* returns current video format ++ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_g_fmt_cap(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ if (!dev->ready_for_capture && !dev->ready_for_output) ++ return -EINVAL; ++ ++ fmt->fmt.pix = dev->pix_format; ++ MARK(); ++ return 0; ++} ++ ++/* checks if it is OK to change to format fmt; ++ * actual check is done by inner_try_setfmt ++ * just checking that pixelformat is OK and set other parameters, app should ++ * obey this decision ++ * called on VIDIOC_TRY_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_try_fmt_cap(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ int ret = 0; ++ if (!V4L2_TYPE_IS_CAPTURE(fmt->type)) ++ return -EINVAL; ++ ret = inner_try_setfmt(file, fmt); ++ if (-EBUSY == ret) ++ return 0; ++ return ret; ++} ++ ++/* sets new output format, if possible ++ * actually format is set by input and we even do not check it, just return ++ * current one, but it is possible to set subregions of input TODO(vasaka) ++ * called on VIDIOC_S_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_CAPTURE ++ */ ++static int vidioc_s_fmt_cap(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ int ret; ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!V4L2_TYPE_IS_CAPTURE(fmt->type)) ++ return -EINVAL; ++ ret = inner_try_setfmt(file, fmt); ++ if (!ret) { ++ 
dev->pix_format = fmt->fmt.pix; ++ } ++ return ret; ++} ++ ++/* ------------------ OUTPUT ----------------------- */ ++ ++/* returns device formats; ++ * LATER: allow all formats ++ * called on VIDIOC_ENUM_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_enum_fmt_out(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ struct v4l2_loopback_device *dev; ++ const struct v4l2l_format *fmt; ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ if (V4L2LOOPBACK_IS_FIXED_FMT(dev)) { ++ /* format has been fixed, so only one single format is supported */ ++ const __u32 format = dev->pix_format.pixelformat; ++ ++ if (f->index) ++ return -EINVAL; ++ ++ if ((fmt = format_by_fourcc(format))) { ++ snprintf(f->description, sizeof(f->description), "%s", ++ fmt->name); ++ } else { ++ snprintf(f->description, sizeof(f->description), ++ "[%c%c%c%c]", (format >> 0) & 0xFF, ++ (format >> 8) & 0xFF, (format >> 16) & 0xFF, ++ (format >> 24) & 0xFF); ++ } ++ ++ f->pixelformat = dev->pix_format.pixelformat; ++ } else { ++ /* fill in a dummy format */ ++ /* coverity[unsigned_compare] */ ++ if (f->index < 0 || f->index >= FORMATS) ++ return -EINVAL; ++ ++ fmt = &formats[f->index]; ++ ++ f->pixelformat = fmt->fourcc; ++ snprintf(f->description, sizeof(f->description), "%s", ++ fmt->name); ++ } ++ f->flags = 0; ++ ++ return 0; ++} ++ ++/* returns current video format format fmt */ ++/* NOTE: this is called from the producer ++ * so if format has not been negotiated yet, ++ * it should return ALL of available formats, ++ * called on VIDIOC_G_FMT, with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_g_fmt_out(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ /* ++ * LATER: this should return the currently valid format ++ * gstreamer doesn't like it, if this returns -EINVAL, as it ++ * then concludes that there is _no_ valid format ++ * CHECK whether this assumption is wrong, ++ * or whether we have to always provide a valid format ++ */ ++ ++ fmt->fmt.pix = dev->pix_format; ++ return 0; ++} ++ ++/* checks if it is OK to change to format fmt; ++ * if format is negotiated do not change it ++ * called on VIDIOC_TRY_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_try_fmt_out(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ int ret = 0; ++ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) ++ return -EINVAL; ++ ret = inner_try_setfmt(file, fmt); ++ if (-EBUSY == ret) ++ return 0; ++ return ret; ++} ++ ++/* sets new output format, if possible; ++ * allocate data here because we do not know if it will be streaming or ++ * read/write IO ++ * called on VIDIOC_S_FMT with v4l2_buf_type set to V4L2_BUF_TYPE_VIDEO_OUTPUT ++ */ ++static int vidioc_s_fmt_out(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ struct v4l2_loopback_device *dev; ++ int ret; ++ char buf[5]; ++ buf[4] = 0; ++ if (!V4L2_TYPE_IS_OUTPUT(fmt->type)) ++ return -EINVAL; ++ dev = v4l2loopback_getdevice(file); ++ ++ ret = inner_try_setfmt(file, fmt); ++ if (!ret) { ++ dev->pix_format = fmt->fmt.pix; ++ dev->pix_format_has_valid_sizeimage = ++ v4l2l_pix_format_has_valid_sizeimage(fmt); ++ dprintk("s_fmt_out(%d) %d...%d\n", ret, dev->ready_for_capture, ++ dev->pix_format.sizeimage); ++ dprintk("outFOURCC=%s\n", ++ fourcc2str(dev->pix_format.pixelformat, buf)); ++ ++ if (!dev->ready_for_capture) { ++ dev->buffer_size = ++ 
PAGE_ALIGN(dev->pix_format.sizeimage); ++ // JMZ: TODO get rid of the next line ++ fmt->fmt.pix.sizeimage = dev->buffer_size; ++ ret = allocate_buffers(dev); ++ } ++ } ++ return ret; ++} ++ ++// #define V4L2L_OVERLAY ++#ifdef V4L2L_OVERLAY ++/* ------------------ OVERLAY ----------------------- */ ++/* currently unsupported */ ++/* GSTreamer's v4l2sink is buggy, as it requires the overlay to work ++ * while it should only require it, if overlay is requested ++ * once the gstreamer element is fixed, remove the overlay dummies ++ */ ++#warning OVERLAY dummies ++static int vidioc_g_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++ ++static int vidioc_s_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++#endif /* V4L2L_OVERLAY */ ++ ++/* ------------------ PARAMs ----------------------- */ ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_G_PARM ++ */ ++static int vidioc_g_parm(struct file *file, void *priv, ++ struct v4l2_streamparm *parm) ++{ ++ /* do not care about type of opener, hope these enums would always be ++ * compatible */ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_S_PARM ++ */ ++static int vidioc_s_parm(struct file *file, void *priv, ++ struct v4l2_streamparm *parm) ++{ ++ struct v4l2_loopback_device *dev; ++ int err = 0; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ dprintk("vidioc_s_parm called frate=%d/%d\n", ++ parm->parm.capture.timeperframe.numerator, ++ parm->parm.capture.timeperframe.denominator); ++ ++ switch (parm->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if ((err = set_timeperframe( ++ dev, &parm->parm.capture.timeperframe)) < 0) ++ return err; ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if ((err = set_timeperframe( ++ dev, &parm->parm.capture.timeperframe)) < 0) ++ return err; ++ break; ++ default: ++ return -1; ++ } ++ ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++/* sets a tv standard, actually we do not need to handle this any special way ++ * added to support effecttv ++ * called on VIDIOC_S_STD ++ */ ++static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std) ++{ ++ v4l2_std_id req_std = 0, supported_std = 0; ++ const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0; ++ ++ if (_std) { ++ req_std = *_std; ++ *_std = all_std; ++ } ++ ++ /* we support everything in V4L2_STD_ALL, but not more... 
*/ ++ supported_std = (all_std & req_std); ++ if (no_std == supported_std) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* gets a fake video standard ++ * called on VIDIOC_G_STD ++ */ ++static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++/* gets a fake video standard ++ * called on VIDIOC_QUERYSTD ++ */ ++static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id, ++ s64 val) ++{ ++ switch (id) { ++ case CID_KEEP_FORMAT: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ dev->keep_format = val; ++ try_free_buffers( ++ dev); /* will only free buffers if !keep_format */ ++ break; ++ case CID_SUSTAIN_FRAMERATE: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ spin_lock_bh(&dev->lock); ++ dev->sustain_framerate = val; ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ break; ++ case CID_TIMEOUT: ++ if (val < 0 || val > MAX_TIMEOUT) ++ return -EINVAL; ++ spin_lock_bh(&dev->lock); ++ dev->timeout_jiffies = msecs_to_jiffies(val); ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ allocate_timeout_image(dev); ++ break; ++ case CID_TIMEOUT_IMAGE_IO: ++ dev->timeout_image_io = 1; ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl) ++{ ++ struct v4l2_loopback_device *dev = container_of( ++ ctrl->handler, struct v4l2_loopback_device, ctrl_handler); ++ return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val); ++} ++ ++/* returns set of device outputs, in our case there is only one ++ * called on VIDIOC_ENUMOUTPUT ++ */ ++static int vidioc_enum_output(struct file *file, void *fh, ++ struct v4l2_output *outp) ++{ ++ __u32 index = outp->index; ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ MARK(); ++ ++ if (!dev->announce_all_caps && !dev->ready_for_output) ++ return -ENOTTY; ++ ++ if (0 != index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(outp, 0, sizeof(*outp)); ++ ++ outp->index = index; ++ strscpy(outp->name, "loopback in", sizeof(outp->name)); ++ outp->type = V4L2_OUTPUT_TYPE_ANALOG; ++ outp->audioset = 0; ++ outp->modulator = 0; ++#ifdef V4L2LOOPBACK_WITH_STD ++ outp->std = V4L2_STD_ALL; ++#ifdef V4L2_OUT_CAP_STD ++ outp->capabilities |= V4L2_OUT_CAP_STD; ++#endif /* V4L2_OUT_CAP_STD */ ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ return 0; ++} ++ ++/* which output is currently active, ++ * called on VIDIOC_G_OUTPUT ++ */ ++static int vidioc_g_output(struct file *file, void *fh, unsigned int *i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_output) ++ return -ENOTTY; ++ if (i) ++ *i = 0; ++ return 0; ++} ++ ++/* set output, can make sense if we have more than one video src, ++ * called on VIDIOC_S_OUTPUT ++ */ ++static int vidioc_s_output(struct file *file, void *fh, unsigned int i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_output) ++ return -ENOTTY; ++ ++ if (i) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* returns set of device inputs, in our case there is only one, ++ * but later I may add more ++ * called on VIDIOC_ENUMINPUT ++ */ ++static int vidioc_enum_input(struct file *file, void *fh, ++ struct v4l2_input *inp) ++{ ++ struct 
v4l2_loopback_device *dev; ++ __u32 index = inp->index; ++ MARK(); ++ ++ if (0 != index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(inp, 0, sizeof(*inp)); ++ ++ inp->index = index; ++ strscpy(inp->name, "loopback", sizeof(inp->name)); ++ inp->type = V4L2_INPUT_TYPE_CAMERA; ++ inp->audioset = 0; ++ inp->tuner = 0; ++ inp->status = 0; ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ inp->std = V4L2_STD_ALL; ++#ifdef V4L2_IN_CAP_STD ++ inp->capabilities |= V4L2_IN_CAP_STD; ++#endif ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ dev = v4l2loopback_getdevice(file); ++ if (!dev->ready_for_capture) { ++ inp->status |= V4L2_IN_ST_NO_SIGNAL; ++ } ++ ++ return 0; ++} ++ ++/* which input is currently active, ++ * called on VIDIOC_G_INPUT ++ */ ++static int vidioc_g_input(struct file *file, void *fh, unsigned int *i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_capture) ++ return -ENOTTY; ++ if (i) ++ *i = 0; ++ return 0; ++} ++ ++/* set input, can make sense if we have more than one video src, ++ * called on VIDIOC_S_INPUT ++ */ ++static int vidioc_s_input(struct file *file, void *fh, unsigned int i) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ if (!dev->announce_all_caps && !dev->ready_for_capture) ++ return -ENOTTY; ++ if (i == 0) ++ return 0; ++ return -EINVAL; ++} ++ ++/* --------------- V4L2 ioctl buffer related calls ----------------- */ ++ ++/* negotiate buffer type ++ * only mmap streaming supported ++ * called on VIDIOC_REQBUFS ++ */ ++static int vidioc_reqbufs(struct file *file, void *fh, ++ struct v4l2_requestbuffers *b) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ int i; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ dprintk("reqbufs: %d\t%d=%d\n", b->memory, b->count, ++ dev->buffers_number); ++ ++ if (opener->timeout_image_io) { ++ dev->timeout_image_io = 0; ++ if (b->memory != V4L2_MEMORY_MMAP) ++ return -EINVAL; ++ b->count = 2; ++ return 0; ++ } ++ ++ if (V4L2_TYPE_IS_OUTPUT(b->type) && (!dev->ready_for_output)) { ++ return -EBUSY; ++ } ++ ++ init_buffers(dev); ++ switch (b->memory) { ++ case V4L2_MEMORY_MMAP: ++ /* do nothing here, buffers are always allocated */ ++ if (b->count < 1 || dev->buffers_number < 1) ++ return 0; ++ ++ if (b->count > dev->buffers_number) ++ b->count = dev->buffers_number; ++ ++ /* make sure that outbufs_list contains buffers from 0 to used_buffers-1 ++ * actually, it will have been already populated via v4l2_loopback_init() ++ * at this point */ ++ if (list_empty(&dev->outbufs_list)) { ++ for (i = 0; i < dev->used_buffers; ++i) ++ list_add_tail(&dev->buffers[i].list_head, ++ &dev->outbufs_list); ++ } ++ ++ /* also, if dev->used_buffers is going to be decreased, we should remove ++ * out-of-range buffers from outbufs_list, and fix bufpos2index mapping */ ++ if (b->count < dev->used_buffers) { ++ struct v4l2l_buffer *pos, *n; ++ ++ list_for_each_entry_safe(pos, n, &dev->outbufs_list, ++ list_head) { ++ if (pos->buffer.index >= b->count) ++ list_del(&pos->list_head); ++ } ++ ++ /* after we update dev->used_buffers, buffers in outbufs_list will ++ * correspond to dev->write_position + [0;b->count-1] range */ ++ i = v4l2l_mod64(dev->write_position, b->count); ++ list_for_each_entry(pos, &dev->outbufs_list, ++ list_head) { ++ dev->bufpos2index[i % b->count] = ++ pos->buffer.index; ++ ++i; ++ } ++ } ++ ++ opener->buffers_number = b->count; ++ if 
(opener->buffers_number < dev->used_buffers) ++ dev->used_buffers = opener->buffers_number; ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++/* returns buffer asked for; ++ * give app as many buffers as it wants, if it less than MAX, ++ * but map them in our inner buffers ++ * called on VIDIOC_QUERYBUF ++ */ ++static int vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *b) ++{ ++ enum v4l2_buf_type type; ++ int index; ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ ++ MARK(); ++ ++ type = b->type; ++ index = b->index; ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ if ((b->type != V4L2_BUF_TYPE_VIDEO_CAPTURE) && ++ (b->type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) { ++ return -EINVAL; ++ } ++ if (b->index > max_buffers) ++ return -EINVAL; ++ ++ if (opener->timeout_image_io) ++ *b = dev->timeout_image_buffer.buffer; ++ else ++ *b = dev->buffers[b->index % dev->used_buffers].buffer; ++ ++ b->type = type; ++ b->index = index; ++ dprintkrw("buffer type: %d (of %d with size=%ld)\n", b->memory, ++ dev->buffers_number, dev->buffer_size); ++ ++ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' ++ https://github.com/umlaeute/v4l2loopback/issues/60 */ ++ b->flags &= ~V4L2_BUF_FLAG_DONE; ++ b->flags |= V4L2_BUF_FLAG_QUEUED; ++ ++ return 0; ++} ++ ++static void buffer_written(struct v4l2_loopback_device *dev, ++ struct v4l2l_buffer *buf) ++{ ++ del_timer_sync(&dev->sustain_timer); ++ del_timer_sync(&dev->timeout_timer); ++ ++ spin_lock_bh(&dev->list_lock); ++ list_move_tail(&buf->list_head, &dev->outbufs_list); ++ spin_unlock_bh(&dev->list_lock); ++ ++ spin_lock_bh(&dev->lock); ++ dev->bufpos2index[v4l2l_mod64(dev->write_position, dev->used_buffers)] = ++ buf->buffer.index; ++ ++dev->write_position; ++ dev->reread_count = 0; ++ ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++} ++ ++/* put buffer to queue ++ * called on VIDIOC_QBUF ++ */ ++static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ struct v4l2l_buffer *b; ++ int index; ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ if (buf->index > max_buffers) ++ return -EINVAL; ++ if (opener->timeout_image_io) ++ return 0; ++ ++ index = buf->index % dev->used_buffers; ++ b = &dev->buffers[index]; ++ ++ switch (buf->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ dprintkrw( ++ "qbuf(CAPTURE)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n", ++ index, buf->index, buf, buf->type, buf->bytesused, ++ buf->length, buf->flags, buf->field, ++ (long long)buf->timestamp.tv_sec, ++ (long int)buf->timestamp.tv_usec, buf->sequence); ++ set_queued(b); ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ dprintkrw( ++ "qbuf(OUTPUT)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n", ++ index, buf->index, buf, buf->type, buf->bytesused, ++ buf->length, buf->flags, buf->field, ++ (long long)buf->timestamp.tv_sec, ++ (long int)buf->timestamp.tv_usec, buf->sequence); ++ if ((!(b->buffer.flags & V4L2_BUF_FLAG_TIMESTAMP_COPY)) && ++ (buf->timestamp.tv_sec == 0 && buf->timestamp.tv_usec == 0)) ++ v4l2l_get_timestamp(&b->buffer); ++ else { ++ b->buffer.timestamp = buf->timestamp; ++ b->buffer.flags |= V4L2_BUF_FLAG_TIMESTAMP_COPY; ++ } ++ if (dev->pix_format_has_valid_sizeimage) { ++ if (buf->bytesused >= dev->pix_format.sizeimage) 
{ ++ b->buffer.bytesused = dev->pix_format.sizeimage; ++ } else { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) ++ dev_warn_ratelimited( ++ &dev->vdev->dev, ++#else ++ dprintkrw( ++#endif ++ "warning queued output buffer bytesused too small %d < %d\n", ++ buf->bytesused, ++ dev->pix_format.sizeimage); ++ b->buffer.bytesused = buf->bytesused; ++ } ++ } else { ++ b->buffer.bytesused = buf->bytesused; ++ } ++ ++ set_done(b); ++ buffer_written(dev, b); ++ ++ /* Hopefully fix 'DQBUF return bad index if queue bigger then 2 for capture' ++ https://github.com/umlaeute/v4l2loopback/issues/60 */ ++ buf->flags &= ~V4L2_BUF_FLAG_DONE; ++ buf->flags |= V4L2_BUF_FLAG_QUEUED; ++ ++ wake_up_all(&dev->read_event); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++static int can_read(struct v4l2_loopback_device *dev, ++ struct v4l2_loopback_opener *opener) ++{ ++ int ret; ++ ++ spin_lock_bh(&dev->lock); ++ check_timers(dev); ++ ret = dev->write_position > opener->read_position || ++ dev->reread_count > opener->reread_count || dev->timeout_happened; ++ spin_unlock_bh(&dev->lock); ++ return ret; ++} ++ ++static int get_capture_buffer(struct file *file) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ int pos, ret; ++ int timeout_happened; ++ ++ if ((file->f_flags & O_NONBLOCK) && ++ (dev->write_position <= opener->read_position && ++ dev->reread_count <= opener->reread_count && ++ !dev->timeout_happened)) ++ return -EAGAIN; ++ wait_event_interruptible(dev->read_event, can_read(dev, opener)); ++ ++ spin_lock_bh(&dev->lock); ++ if (dev->write_position == opener->read_position) { ++ if (dev->reread_count > opener->reread_count + 2) ++ opener->reread_count = dev->reread_count - 1; ++ ++opener->reread_count; ++ pos = v4l2l_mod64(opener->read_position + dev->used_buffers - 1, ++ dev->used_buffers); ++ } else { ++ opener->reread_count = 0; ++ if (dev->write_position > ++ opener->read_position + dev->used_buffers) ++ opener->read_position = dev->write_position - 1; ++ pos = v4l2l_mod64(opener->read_position, dev->used_buffers); ++ ++opener->read_position; ++ } ++ timeout_happened = dev->timeout_happened; ++ dev->timeout_happened = 0; ++ spin_unlock_bh(&dev->lock); ++ ++ ret = dev->bufpos2index[pos]; ++ if (timeout_happened) { ++ if (ret < 0) { ++ dprintk("trying to return not mapped buf[%d]\n", ret); ++ return -EFAULT; ++ } ++ /* although allocated on-demand, timeout_image is freed only ++ * in free_buffers(), so we don't need to worry about it being ++ * deallocated suddenly */ ++ memcpy(dev->image + dev->buffers[ret].buffer.m.offset, ++ dev->timeout_image, dev->buffer_size); ++ } ++ return ret; ++} ++ ++/* put buffer to dequeue ++ * called on VIDIOC_DQBUF ++ */ ++static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ int index; ++ struct v4l2l_buffer *b; ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ if (opener->timeout_image_io) { ++ *buf = dev->timeout_image_buffer.buffer; ++ return 0; ++ } ++ ++ switch (buf->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ index = get_capture_buffer(file); ++ if (index < 0) ++ return index; ++ dprintkrw("capture DQBUF pos: %lld index: %d\n", ++ (long long)(opener->read_position - 1), index); ++ if (!(dev->buffers[index].buffer.flags & ++ V4L2_BUF_FLAG_MAPPED)) { ++ dprintk("trying to return not mapped buf[%d]\n", index); ++ return 
-EINVAL; ++ } ++ unset_flags(&dev->buffers[index]); ++ *buf = dev->buffers[index].buffer; ++ dprintkrw( ++ "dqbuf(CAPTURE)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n", ++ index, buf->index, buf, buf->type, buf->bytesused, ++ buf->length, buf->flags, buf->field, ++ (long long)buf->timestamp.tv_sec, ++ (long int)buf->timestamp.tv_usec, buf->sequence); ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ spin_lock_bh(&dev->list_lock); ++ ++ b = list_entry(dev->outbufs_list.prev, struct v4l2l_buffer, ++ list_head); ++ list_move_tail(&b->list_head, &dev->outbufs_list); ++ ++ spin_unlock_bh(&dev->list_lock); ++ dprintkrw("output DQBUF index: %d\n", b->buffer.index); ++ unset_flags(b); ++ *buf = b->buffer; ++ buf->type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ dprintkrw( ++ "dqbuf(OUTPUT)#%d: buffer#%d @ %p type=%d bytesused=%d length=%d flags=%x field=%d timestamp=%lld.%06ld sequence=%d\n", ++ index, buf->index, buf, buf->type, buf->bytesused, ++ buf->length, buf->flags, buf->field, ++ (long long)buf->timestamp.tv_sec, ++ (long int)buf->timestamp.tv_usec, buf->sequence); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++/* ------------- STREAMING ------------------- */ ++ ++/* start streaming ++ * called on VIDIOC_STREAMON ++ */ ++static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (!dev->ready_for_capture) { ++ int ret = allocate_buffers(dev); ++ if (ret < 0) ++ return ret; ++ } ++ opener->type = WRITER; ++ dev->ready_for_output = 0; ++ dev->ready_for_capture++; ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (!dev->ready_for_capture) ++ return -EIO; ++ if (dev->active_readers > 0) ++ return -EBUSY; ++ opener->type = READER; ++ dev->active_readers++; ++ client_usage_queue_event(dev->vdev); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ return -EINVAL; ++} ++ ++/* stop streaming ++ * called on VIDIOC_STREAMOFF ++ */ ++static int vidioc_streamoff(struct file *file, void *fh, ++ enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ ++ MARK(); ++ dprintk("%d\n", type); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(fh); ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (dev->ready_for_capture > 0) ++ dev->ready_for_capture--; ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (opener->type == READER) { ++ opener->type = 0; ++ dev->active_readers--; ++ client_usage_queue_event(dev->vdev); ++ } ++ return 0; ++ default: ++ return -EINVAL; ++ } ++ return -EINVAL; ++} ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ p->frames = dev->buffers_number; ++ p->offsets[0] = 0; ++ p->offsets[1] = 0; ++ p->size = dev->buffer_size; ++ return 0; ++} ++#endif ++ ++static void client_usage_queue_event(struct video_device *vdev) ++{ ++ struct v4l2_event ev; ++ struct v4l2_loopback_device *dev; ++ ++ dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device, ++ v4l2_dev); ++ ++ memset(&ev, 0, sizeof(ev)); ++ ev.type = V4L2_EVENT_PRI_CLIENT_USAGE; ++ ((struct v4l2_event_client_usage *)&ev.u)->count = dev->active_readers; ++ ++ 
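/* Editor's aside (illustrative sketch only, not part of the patch above):
 * the REQBUFS/QUERYBUF/QBUF/STREAMON/DQBUF handlers implement the mmap
 * streaming path for readers. A minimal consumer of a loopback capture node
 * might look as follows; the device path and buffer count are assumptions,
 * and error handling is trimmed for brevity. */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/videodev2.h>

static int capture_one_frame(const char *path)
{
	struct v4l2_requestbuffers req;
	struct v4l2_buffer buf;
	void *mem;
	int fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;

	memset(&req, 0, sizeof(req));
	req.count = 2;                        /* driver may clamp to buffers_number */
	req.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	req.memory = V4L2_MEMORY_MMAP;        /* only mmap streaming is supported */
	ioctl(fd, VIDIOC_REQBUFS, &req);      /* handled by vidioc_reqbufs() */

	memset(&buf, 0, sizeof(buf));
	buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE;
	buf.memory = V4L2_MEMORY_MMAP;
	buf.index = 0;
	ioctl(fd, VIDIOC_QUERYBUF, &buf);     /* handled by vidioc_querybuf() */

	mem = mmap(NULL, buf.length, PROT_READ, MAP_SHARED, fd, buf.m.offset);

	ioctl(fd, VIDIOC_QBUF, &buf);         /* handled by vidioc_qbuf() */
	ioctl(fd, VIDIOC_STREAMON, &buf.type);
	ioctl(fd, VIDIOC_DQBUF, &buf);        /* blocks in get_capture_buffer() */

	/* buf.bytesused bytes of image data are now readable at mem */

	munmap(mem, buf.length);
	close(fd);
	return 0;
}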
v4l2_event_queue(vdev, &ev); ++} ++ ++static int client_usage_ops_add(struct v4l2_subscribed_event *sev, ++ unsigned elems) ++{ ++ if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL)) ++ return 0; ++ ++ client_usage_queue_event(sev->fh->vdev); ++ return 0; ++} ++ ++static void client_usage_ops_replace(struct v4l2_event *old, ++ const struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&old->u) = ++ *((struct v4l2_event_client_usage *)&new->u); ++} ++ ++static void client_usage_ops_merge(const struct v4l2_event *old, ++ struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&new->u) = ++ *((struct v4l2_event_client_usage *)&old->u); ++} ++ ++const struct v4l2_subscribed_event_ops client_usage_ops = { ++ .add = client_usage_ops_add, ++ .replace = client_usage_ops_replace, ++ .merge = client_usage_ops_merge, ++}; ++ ++static int vidioc_subscribe_event(struct v4l2_fh *fh, ++ const struct v4l2_event_subscription *sub) ++{ ++ switch (sub->type) { ++ case V4L2_EVENT_CTRL: ++ return v4l2_ctrl_subscribe_event(fh, sub); ++ case V4L2_EVENT_PRI_CLIENT_USAGE: ++ return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops); ++ } ++ ++ return -EINVAL; ++} ++ ++/* file operations */ ++static void vm_open(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ buf->use_count++; ++ ++ buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED; ++} ++ ++static void vm_close(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ buf->use_count--; ++ ++ if (buf->use_count <= 0) ++ buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED; ++} ++ ++static struct vm_operations_struct vm_ops = { ++ .open = vm_open, ++ .close = vm_close, ++}; ++ ++static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ u8 *addr; ++ unsigned long start; ++ unsigned long size; ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ struct v4l2l_buffer *buffer = NULL; ++ MARK(); ++ ++ start = (unsigned long)vma->vm_start; ++ size = (unsigned long)(vma->vm_end - vma->vm_start); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(file->private_data); ++ ++ if (size > dev->buffer_size) { ++ dprintk("userspace tries to mmap too much, fail\n"); ++ return -EINVAL; ++ } ++ if (opener->timeout_image_io) { ++ /* we are going to map the timeout_image_buffer */ ++ if ((vma->vm_pgoff << PAGE_SHIFT) != ++ dev->buffer_size * MAX_BUFFERS) { ++ dprintk("invalid mmap offset for timeout_image_io mode\n"); ++ return -EINVAL; ++ } ++ } else if ((vma->vm_pgoff << PAGE_SHIFT) > ++ dev->buffer_size * (dev->buffers_number - 1)) { ++ dprintk("userspace tries to mmap too far, fail\n"); ++ return -EINVAL; ++ } ++ ++ /* FIXXXXXME: allocation should not happen here! 
*/ ++ if (NULL == dev->image) ++ if (allocate_buffers(dev) < 0) ++ return -EINVAL; ++ ++ if (opener->timeout_image_io) { ++ buffer = &dev->timeout_image_buffer; ++ addr = dev->timeout_image; ++ } else { ++ int i; ++ for (i = 0; i < dev->buffers_number; ++i) { ++ buffer = &dev->buffers[i]; ++ if ((buffer->buffer.m.offset >> PAGE_SHIFT) == ++ vma->vm_pgoff) ++ break; ++ } ++ ++ if (i >= dev->buffers_number) ++ return -EINVAL; ++ ++ addr = dev->image + (vma->vm_pgoff << PAGE_SHIFT); ++ } ++ ++ while (size > 0) { ++ struct page *page; ++ ++ page = vmalloc_to_page(addr); ++ ++ if (vm_insert_page(vma, start, page) < 0) ++ return -EAGAIN; ++ ++ start += PAGE_SIZE; ++ addr += PAGE_SIZE; ++ size -= PAGE_SIZE; ++ } ++ ++ vma->vm_ops = &vm_ops; ++ vma->vm_private_data = buffer; ++ ++ vm_open(vma); ++ ++ MARK(); ++ return 0; ++} ++ ++static unsigned int v4l2_loopback_poll(struct file *file, ++ struct poll_table_struct *pts) ++{ ++ struct v4l2_loopback_opener *opener; ++ struct v4l2_loopback_device *dev; ++ __poll_t req_events = poll_requested_events(pts); ++ int ret_mask = 0; ++ MARK(); ++ ++ opener = fh_to_opener(file->private_data); ++ dev = v4l2loopback_getdevice(file); ++ ++ if (req_events & POLLPRI) { ++ if (!v4l2_event_pending(&opener->fh)) ++ poll_wait(file, &opener->fh.wait, pts); ++ if (v4l2_event_pending(&opener->fh)) { ++ ret_mask |= POLLPRI; ++ if (!(req_events & DEFAULT_POLLMASK)) ++ return ret_mask; ++ } ++ } ++ ++ switch (opener->type) { ++ case WRITER: ++ ret_mask |= POLLOUT | POLLWRNORM; ++ break; ++ case READER: ++ if (!can_read(dev, opener)) { ++ if (ret_mask) ++ return ret_mask; ++ poll_wait(file, &dev->read_event, pts); ++ } ++ if (can_read(dev, opener)) ++ ret_mask |= POLLIN | POLLRDNORM; ++ if (v4l2_event_pending(&opener->fh)) ++ ret_mask |= POLLPRI; ++ break; ++ default: ++ break; ++ } ++ ++ MARK(); ++ return ret_mask; ++} ++ ++/* do not want to limit device opens, it can be as many readers as user want, ++ * writers are limited by means of setting writer field */ ++static int v4l2_loopback_open(struct file *file) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ MARK(); ++ dev = v4l2loopback_getdevice(file); ++ if (dev->open_count.counter >= dev->max_openers) ++ return -EBUSY; ++ /* kfree on close */ ++ opener = kzalloc(sizeof(*opener), GFP_KERNEL); ++ if (opener == NULL) ++ return -ENOMEM; ++ ++ atomic_inc(&dev->open_count); ++ ++ opener->timeout_image_io = dev->timeout_image_io; ++ if (opener->timeout_image_io) { ++ int r = allocate_timeout_image(dev); ++ ++ if (r < 0) { ++ dprintk("timeout image allocation failed\n"); ++ ++ atomic_dec(&dev->open_count); ++ ++ kfree(opener); ++ return r; ++ } ++ } ++ ++ v4l2_fh_init(&opener->fh, video_devdata(file)); ++ file->private_data = &opener->fh; ++ ++ v4l2_fh_add(&opener->fh); ++ dprintk("opened dev:%p with image:%p\n", dev, dev ? 
dev->image : NULL); ++ MARK(); ++ return 0; ++} ++ ++static int v4l2_loopback_close(struct file *file) ++{ ++ struct v4l2_loopback_opener *opener; ++ struct v4l2_loopback_device *dev; ++ int is_writer = 0, is_reader = 0; ++ MARK(); ++ ++ opener = fh_to_opener(file->private_data); ++ dev = v4l2loopback_getdevice(file); ++ ++ if (WRITER == opener->type) ++ is_writer = 1; ++ if (READER == opener->type) ++ is_reader = 1; ++ ++ atomic_dec(&dev->open_count); ++ if (dev->open_count.counter == 0) { ++ del_timer_sync(&dev->sustain_timer); ++ del_timer_sync(&dev->timeout_timer); ++ } ++ try_free_buffers(dev); ++ ++ v4l2_fh_del(&opener->fh); ++ v4l2_fh_exit(&opener->fh); ++ ++ kfree(opener); ++ if (is_writer) ++ dev->ready_for_output = 1; ++ if (is_reader) { ++ dev->active_readers--; ++ client_usage_queue_event(dev->vdev); ++ } ++ MARK(); ++ return 0; ++} ++ ++static ssize_t v4l2_loopback_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ int read_index; ++ struct v4l2_loopback_device *dev; ++ struct v4l2_buffer *b; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ ++ read_index = get_capture_buffer(file); ++ if (read_index < 0) ++ return read_index; ++ if (count > dev->buffer_size) ++ count = dev->buffer_size; ++ b = &dev->buffers[read_index].buffer; ++ if (count > b->bytesused) ++ count = b->bytesused; ++ if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset), ++ count)) { ++ printk(KERN_ERR ++ "v4l2-loopback: failed copy_to_user() in read buf\n"); ++ return -EFAULT; ++ } ++ dprintkrw("leave v4l2_loopback_read()\n"); ++ return count; ++} ++ ++static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct v4l2_loopback_opener *opener; ++ struct v4l2_loopback_device *dev; ++ int write_index; ++ struct v4l2_buffer *b; ++ int err = 0; ++ ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ opener = fh_to_opener(file->private_data); ++ ++ if (UNNEGOTIATED == opener->type) { ++ spin_lock(&dev->lock); ++ ++ if (dev->ready_for_output) { ++ err = vidioc_streamon(file, file->private_data, ++ V4L2_BUF_TYPE_VIDEO_OUTPUT); ++ } ++ ++ spin_unlock(&dev->lock); ++ ++ if (err < 0) ++ return err; ++ } ++ ++ if (WRITER != opener->type) ++ return -EINVAL; ++ ++ if (!dev->ready_for_capture) { ++ int ret = allocate_buffers(dev); ++ if (ret < 0) ++ return ret; ++ dev->ready_for_capture = 1; ++ } ++ dprintkrw("v4l2_loopback_write() trying to write %zu bytes\n", count); ++ if (count > dev->buffer_size) ++ count = dev->buffer_size; ++ ++ write_index = v4l2l_mod64(dev->write_position, dev->used_buffers); ++ b = &dev->buffers[write_index].buffer; ++ ++ if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf, ++ count)) { ++ printk(KERN_ERR ++ "v4l2-loopback: failed copy_from_user() in write buf, could not write %zu\n", ++ count); ++ return -EFAULT; ++ } ++ v4l2l_get_timestamp(b); ++ b->bytesused = count; ++ b->sequence = dev->write_position; ++ buffer_written(dev, &dev->buffers[write_index]); ++ wake_up_all(&dev->read_event); ++ dprintkrw("leave v4l2_loopback_write()\n"); ++ return count; ++} ++ ++/* init functions */ ++/* frees buffers, if already allocated */ ++static void free_buffers(struct v4l2_loopback_device *dev) ++{ ++ MARK(); ++ dprintk("freeing image@%p for dev:%p\n", dev ? 
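/* Editor's aside (illustrative sketch only, not part of the patch above):
 * v4l2_loopback_write() lets a producer feed frames with plain write() after
 * negotiating an output format via vidioc_s_fmt_out(). The device path and
 * the 640x480 YUYV geometry are assumptions (YUYV uses 2 bytes per pixel,
 * and the sketch assumes the driver accepts that format unchanged). */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/videodev2.h>

static int push_gray_frame(const char *path)
{
	struct v4l2_format fmt;
	static unsigned char frame[640 * 480 * 2];
	int fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	memset(&fmt, 0, sizeof(fmt));
	fmt.type = V4L2_BUF_TYPE_VIDEO_OUTPUT;
	fmt.fmt.pix.width = 640;
	fmt.fmt.pix.height = 480;
	fmt.fmt.pix.pixelformat = V4L2_PIX_FMT_YUYV;
	fmt.fmt.pix.field = V4L2_FIELD_NONE;
	if (ioctl(fd, VIDIOC_S_FMT, &fmt) < 0) {   /* handled by vidioc_s_fmt_out() */
		close(fd);
		return -1;
	}

	memset(frame, 0x80, sizeof(frame));        /* mid-gray in YUYV */
	write(fd, frame, sizeof(frame));           /* handled by v4l2_loopback_write() */

	close(fd);
	return 0;
}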
dev->image : NULL, dev); ++ if (!dev) ++ return; ++ if (dev->image) { ++ vfree(dev->image); ++ dev->image = NULL; ++ } ++ if (dev->timeout_image) { ++ vfree(dev->timeout_image); ++ dev->timeout_image = NULL; ++ } ++ dev->imagesize = 0; ++} ++/* frees buffers, if they are no longer needed */ ++static void try_free_buffers(struct v4l2_loopback_device *dev) ++{ ++ MARK(); ++ if (0 == dev->open_count.counter && !dev->keep_format) { ++ free_buffers(dev); ++ dev->ready_for_capture = 0; ++ dev->buffer_size = 0; ++ dev->write_position = 0; ++ } ++} ++/* allocates buffers, if buffer_size is set */ ++static int allocate_buffers(struct v4l2_loopback_device *dev) ++{ ++ int err; ++ ++ MARK(); ++ /* vfree on close file operation in case no open handles left */ ++ ++ if (dev->buffer_size < 1 || dev->buffers_number < 1) ++ return -EINVAL; ++ ++ if ((__LONG_MAX__ / dev->buffer_size) < dev->buffers_number) ++ return -ENOSPC; ++ ++ if (dev->image) { ++ dprintk("allocating buffers again: %ld %ld\n", ++ dev->buffer_size * dev->buffers_number, dev->imagesize); ++ /* FIXME: prevent double allocation more intelligently! */ ++ if (dev->buffer_size * dev->buffers_number == dev->imagesize) ++ return 0; ++ ++ /* check whether the total number of readers/writers is <=1 */ ++ if ((dev->ready_for_capture + dev->active_readers) <= 1) ++ free_buffers(dev); ++ else ++ return -EINVAL; ++ } ++ ++ dev->imagesize = (unsigned long)dev->buffer_size * ++ (unsigned long)dev->buffers_number; ++ ++ dprintk("allocating %ld = %ldx%d\n", dev->imagesize, dev->buffer_size, ++ dev->buffers_number); ++ err = -ENOMEM; ++ ++ if (dev->timeout_jiffies > 0) { ++ err = allocate_timeout_image(dev); ++ if (err < 0) ++ goto error; ++ } ++ ++ dev->image = vmalloc(dev->imagesize); ++ if (dev->image == NULL) ++ goto error; ++ ++ dprintk("vmallocated %ld bytes\n", dev->imagesize); ++ MARK(); ++ ++ init_buffers(dev); ++ return 0; ++ ++error: ++ free_buffers(dev); ++ return err; ++} ++ ++/* init inner buffers, they are capture mode and flags are set as ++ * for capture mod buffers */ ++static void init_buffers(struct v4l2_loopback_device *dev) ++{ ++ int i; ++ int buffer_size; ++ int bytesused; ++ MARK(); ++ ++ buffer_size = dev->buffer_size; ++ bytesused = dev->pix_format.sizeimage; ++ for (i = 0; i < dev->buffers_number; ++i) { ++ struct v4l2_buffer *b = &dev->buffers[i].buffer; ++ b->index = i; ++ b->bytesused = bytesused; ++ b->length = buffer_size; ++ b->field = V4L2_FIELD_NONE; ++ b->flags = 0; ++ b->m.offset = i * buffer_size; ++ b->memory = V4L2_MEMORY_MMAP; ++ b->sequence = 0; ++ b->timestamp.tv_sec = 0; ++ b->timestamp.tv_usec = 0; ++ b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ ++ v4l2l_get_timestamp(b); ++ } ++ dev->timeout_image_buffer = dev->buffers[0]; ++ dev->timeout_image_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size; ++ MARK(); ++} ++ ++static int allocate_timeout_image(struct v4l2_loopback_device *dev) ++{ ++ MARK(); ++ if (dev->buffer_size <= 0) { ++ dev->timeout_image_io = 0; ++ return -EINVAL; ++ } ++ ++ if (dev->timeout_image == NULL) { ++ dev->timeout_image = vzalloc(dev->buffer_size); ++ if (dev->timeout_image == NULL) { ++ dev->timeout_image_io = 0; ++ return -ENOMEM; ++ } ++ } ++ return 0; ++} ++ ++/* fills and register video device */ ++static void init_vdev(struct video_device *vdev, int nr) ++{ ++ MARK(); ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ vdev->tvnorms = V4L2_STD_ALL; ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ vdev->vfl_type = VFL_TYPE_VIDEO; ++ vdev->fops = &v4l2_loopback_fops; ++ vdev->ioctl_ops = 
&v4l2_loopback_ioctl_ops; ++ vdev->release = &video_device_release; ++ vdev->minor = -1; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE | ++ V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE | ++ V4L2_CAP_STREAMING; ++#endif ++ ++ if (debug > 1) ++ vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL | ++ V4L2_DEV_DEBUG_IOCTL_ARG; ++ ++ vdev->vfl_dir = VFL_DIR_M2M; ++ ++ MARK(); ++} ++ ++/* init default capture parameters, only fps may be changed in future */ ++static void init_capture_param(struct v4l2_captureparm *capture_param) ++{ ++ MARK(); ++ capture_param->capability = 0; ++ capture_param->capturemode = 0; ++ capture_param->extendedmode = 0; ++ capture_param->readbuffers = max_buffers; ++ capture_param->timeperframe.numerator = 1; ++ capture_param->timeperframe.denominator = 30; ++} ++ ++static void check_timers(struct v4l2_loopback_device *dev) ++{ ++ if (!dev->ready_for_capture) ++ return; ++ ++ if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer)) ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer)) ++ mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies * 3 / 2); ++} ++#ifdef HAVE_TIMER_SETUP ++static void sustain_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = from_timer(dev, t, sustain_timer); ++#else ++static void sustain_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->sustain_framerate) { ++ dev->reread_count++; ++ dprintkrw("reread: %lld %d\n", (long long)dev->write_position, ++ dev->reread_count); ++ if (dev->reread_count == 1) ++ mod_timer(&dev->sustain_timer, ++ jiffies + max(1UL, dev->frame_jiffies / 2)); ++ else ++ mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++#ifdef HAVE_TIMER_SETUP ++static void timeout_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = from_timer(dev, t, timeout_timer); ++#else ++static void timeout_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->timeout_jiffies > 0) { ++ dev->timeout_happened = 1; ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++ ++/* init loopback main structure */ ++#define DEFAULT_FROM_CONF(confmember, default_condition, default_value) \ ++ ((conf) ? \ ++ ((conf->confmember default_condition) ? 
(default_value) : \ ++ (conf->confmember)) : \ ++ default_value) ++ ++static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_ctrl_handler *hdl; ++ struct v4l2loopback_private *vdev_priv = NULL; ++ ++ int err = -ENOMEM; ++ ++ u32 _width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; ++ u32 _height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; ++ ++ u32 _min_width = DEFAULT_FROM_CONF(min_width, ++ < V4L2LOOPBACK_SIZE_MIN_WIDTH, ++ V4L2LOOPBACK_SIZE_MIN_WIDTH); ++ u32 _min_height = DEFAULT_FROM_CONF(min_height, ++ < V4L2LOOPBACK_SIZE_MIN_HEIGHT, ++ V4L2LOOPBACK_SIZE_MIN_HEIGHT); ++ u32 _max_width = DEFAULT_FROM_CONF(max_width, < _min_width, max_width); ++ u32 _max_height = ++ DEFAULT_FROM_CONF(max_height, < _min_height, max_height); ++ bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ? ++ (conf->announce_all_caps) : ++ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS; ++ int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers); ++ int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers); ++ ++ int nr = -1; ++ ++ _announce_all_caps = (!!_announce_all_caps); ++ ++ if (conf) { ++ const int output_nr = conf->output_nr; ++#ifdef SPLIT_DEVICES ++ const int capture_nr = conf->capture_nr; ++#else ++ const int capture_nr = output_nr; ++#endif ++ if (capture_nr >= 0 && output_nr == capture_nr) { ++ nr = output_nr; ++ } else if (capture_nr < 0 && output_nr < 0) { ++ nr = -1; ++ } else if (capture_nr < 0) { ++ nr = output_nr; ++ } else if (output_nr < 0) { ++ nr = capture_nr; ++ } else { ++ printk(KERN_ERR ++ "split OUTPUT and CAPTURE devices not yet supported."); ++ printk(KERN_INFO ++ "both devices must have the same number (%d != %d).", ++ output_nr, capture_nr); ++ return -EINVAL; ++ } ++ } ++ ++ if (idr_find(&v4l2loopback_index_idr, nr)) ++ return -EEXIST; ++ ++ dprintk("creating v4l2loopback-device #%d\n", nr); ++ dev = kzalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) ++ return -ENOMEM; ++ ++ /* allocate id, if @id >= 0, we're requesting that specific id */ ++ if (nr >= 0) { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1, ++ GFP_KERNEL); ++ if (err == -ENOSPC) ++ err = -EEXIST; ++ } else { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL); ++ } ++ if (err < 0) ++ goto out_free_dev; ++ nr = err; ++ err = -ENOMEM; ++ ++ if (conf && conf->card_label[0]) { ++ snprintf(dev->card_label, sizeof(dev->card_label), "%s", ++ conf->card_label); ++ } else { ++ snprintf(dev->card_label, sizeof(dev->card_label), ++ "Dummy video device (0x%04X)", nr); ++ } ++ snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), ++ "v4l2loopback-%03d", nr); ++ ++ err = v4l2_device_register(NULL, &dev->v4l2_dev); ++ if (err) ++ goto out_free_idr; ++ MARK(); ++ ++ dev->vdev = video_device_alloc(); ++ if (dev->vdev == NULL) { ++ err = -ENOMEM; ++ goto out_unregister; ++ } ++ ++ vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL); ++ if (vdev_priv == NULL) { ++ err = -ENOMEM; ++ goto out_unregister; ++ } ++ ++ video_set_drvdata(dev->vdev, vdev_priv); ++ if (video_get_drvdata(dev->vdev) == NULL) { ++ err = -ENOMEM; ++ goto out_unregister; ++ } ++ ++ MARK(); ++ snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s", ++ dev->card_label); ++ ++ vdev_priv->device_nr = nr; ++ ++ init_vdev(dev->vdev, nr); ++ dev->vdev->v4l2_dev = &dev->v4l2_dev; ++ init_capture_param(&dev->capture_param); ++ err = set_timeperframe(dev, &dev->capture_param.timeperframe); ++ if (err) ++ goto out_unregister; ++ dev->keep_format = 
0; ++ dev->sustain_framerate = 0; ++ ++ dev->announce_all_caps = _announce_all_caps; ++ dev->min_width = _min_width; ++ dev->min_height = _min_height; ++ dev->max_width = _max_width; ++ dev->max_height = _max_height; ++ dev->max_openers = _max_openers; ++ dev->buffers_number = dev->used_buffers = _max_buffers; ++ ++ dev->write_position = 0; ++ ++ MARK(); ++ spin_lock_init(&dev->lock); ++ spin_lock_init(&dev->list_lock); ++ INIT_LIST_HEAD(&dev->outbufs_list); ++ if (list_empty(&dev->outbufs_list)) { ++ int i; ++ ++ for (i = 0; i < dev->used_buffers; ++i) ++ list_add_tail(&dev->buffers[i].list_head, ++ &dev->outbufs_list); ++ } ++ memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index)); ++ atomic_set(&dev->open_count, 0); ++ dev->ready_for_capture = 0; ++ dev->ready_for_output = 1; ++ ++ dev->buffer_size = 0; ++ dev->image = NULL; ++ dev->imagesize = 0; ++#ifdef HAVE_TIMER_SETUP ++ timer_setup(&dev->sustain_timer, sustain_timer_clb, 0); ++ timer_setup(&dev->timeout_timer, timeout_timer_clb, 0); ++#else ++ setup_timer(&dev->sustain_timer, sustain_timer_clb, nr); ++ setup_timer(&dev->timeout_timer, timeout_timer_clb, nr); ++#endif ++ dev->reread_count = 0; ++ dev->timeout_jiffies = 0; ++ dev->timeout_image = NULL; ++ dev->timeout_happened = 0; ++ ++ hdl = &dev->ctrl_handler; ++ err = v4l2_ctrl_handler_init(hdl, 4); ++ if (err) ++ goto out_unregister; ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL); ++ if (hdl->error) { ++ err = hdl->error; ++ goto out_free_handler; ++ } ++ dev->v4l2_dev.ctrl_handler = hdl; ++ ++ err = v4l2_ctrl_handler_setup(hdl); ++ if (err) ++ goto out_free_handler; ++ ++ /* FIXME set buffers to 0 */ ++ ++ /* Set initial format */ ++ if (_width < _min_width) ++ _width = _min_width; ++ if (_width > _max_width) ++ _width = _max_width; ++ if (_height < _min_height) ++ _height = _min_height; ++ if (_height > _max_height) ++ _height = _max_height; ++ ++ dev->pix_format.width = _width; ++ dev->pix_format.height = _height; ++ dev->pix_format.pixelformat = formats[0].fourcc; ++ dev->pix_format.colorspace = ++ V4L2_COLORSPACE_DEFAULT; /* do we need to set this ? 
*/ ++ dev->pix_format.field = V4L2_FIELD_NONE; ++ ++ dev->buffer_size = PAGE_ALIGN(dev->pix_format.sizeimage); ++ dprintk("buffer_size = %ld (=%d)\n", dev->buffer_size, ++ dev->pix_format.sizeimage); ++ ++ if (dev->buffer_size && ((err = allocate_buffers(dev)) < 0)) ++ goto out_free_handler; ++ ++ init_waitqueue_head(&dev->read_event); ++ ++ /* register the device -> it creates /dev/video* */ ++ if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) { ++ printk(KERN_ERR ++ "v4l2loopback: failed video_register_device()\n"); ++ err = -EFAULT; ++ goto out_free_device; ++ } ++ v4l2loopback_create_sysfs(dev->vdev); ++ ++ MARK(); ++ if (ret_nr) ++ *ret_nr = dev->vdev->num; ++ return 0; ++ ++out_free_device: ++ video_device_release(dev->vdev); ++out_free_handler: ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++out_unregister: ++ video_set_drvdata(dev->vdev, NULL); ++ if (vdev_priv != NULL) ++ kfree(vdev_priv); ++ v4l2_device_unregister(&dev->v4l2_dev); ++out_free_idr: ++ idr_remove(&v4l2loopback_index_idr, nr); ++out_free_dev: ++ kfree(dev); ++ return err; ++} ++ ++static void v4l2_loopback_remove(struct v4l2_loopback_device *dev) ++{ ++ free_buffers(dev); ++ v4l2loopback_remove_sysfs(dev->vdev); ++ kfree(video_get_drvdata(dev->vdev)); ++ video_unregister_device(dev->vdev); ++ v4l2_device_unregister(&dev->v4l2_dev); ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++ kfree(dev); ++} ++ ++static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd, ++ unsigned long parm) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_config conf; ++ struct v4l2_loopback_config *confptr = &conf; ++ int device_nr, capture_nr, output_nr; ++ int ret; ++ ++ ret = mutex_lock_killable(&v4l2loopback_ctl_mutex); ++ if (ret) ++ return ret; ++ ++ ret = -EINVAL; ++ switch (cmd) { ++ default: ++ ret = -ENOSYS; ++ break; ++ /* add a v4l2loopback device (pair), based on the user-provided specs */ ++ case V4L2LOOPBACK_CTL_ADD: ++ if (parm) { ++ if ((ret = copy_from_user(&conf, (void *)parm, ++ sizeof(conf))) < 0) ++ break; ++ } else ++ confptr = NULL; ++ ret = v4l2_loopback_add(confptr, &device_nr); ++ if (ret >= 0) ++ ret = device_nr; ++ break; ++ /* remove a v4l2loopback device (both capture and output) */ ++ case V4L2LOOPBACK_CTL_REMOVE: ++ ret = v4l2loopback_lookup((int)parm, &dev); ++ if (ret >= 0 && dev) { ++ int nr = ret; ++ ret = -EBUSY; ++ if (dev->open_count.counter > 0) ++ break; ++ idr_remove(&v4l2loopback_index_idr, nr); ++ v4l2_loopback_remove(dev); ++ ret = 0; ++ }; ++ break; ++ /* get information for a loopback device. ++ * this is mostly about limits (which cannot be queried directly with VIDIOC_G_FMT and friends ++ */ ++ case V4L2LOOPBACK_CTL_QUERY: ++ if (!parm) ++ break; ++ if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) < ++ 0) ++ break; ++ capture_nr = output_nr = conf.output_nr; ++#ifdef SPLIT_DEVICES ++ capture_nr = conf.capture_nr; ++#endif ++ device_nr = (output_nr < 0) ? 
capture_nr : output_nr; ++ MARK(); ++ /* get the device from either capture_nr or output_nr (whatever is valid) */ ++ if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0) ++ break; ++ MARK(); ++ /* if we got the device from output_nr and there is a valid capture_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != capture_nr) && (capture_nr >= 0) && ++ ((ret = v4l2loopback_lookup(capture_nr, 0)) < 0)) ++ break; ++ MARK(); ++ /* if otoh, we got the device from capture_nr and there is a valid output_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != output_nr) && (output_nr >= 0) && ++ ((ret = v4l2loopback_lookup(output_nr, 0)) < 0)) ++ break; ++ MARK(); ++ ++ /* v4l2_loopback_config identified a single device, so fetch the data */ ++ snprintf(conf.card_label, sizeof(conf.card_label), "%s", ++ dev->card_label); ++ MARK(); ++ conf.output_nr = dev->vdev->num; ++#ifdef SPLIT_DEVICES ++ conf.capture_nr = dev->vdev->num; ++#endif ++ conf.min_width = dev->min_width; ++ conf.min_height = dev->min_height; ++ conf.max_width = dev->max_width; ++ conf.max_height = dev->max_height; ++ conf.announce_all_caps = dev->announce_all_caps; ++ conf.max_buffers = dev->buffers_number; ++ conf.max_openers = dev->max_openers; ++ conf.debug = debug; ++ MARK(); ++ if (copy_to_user((void *)parm, &conf, sizeof(conf))) { ++ ret = -EFAULT; ++ break; ++ } ++ MARK(); ++ ret = 0; ++ ; ++ break; ++ } ++ ++ MARK(); ++ mutex_unlock(&v4l2loopback_ctl_mutex); ++ MARK(); ++ return ret; ++} ++ ++/* LINUX KERNEL */ ++ ++static const struct file_operations v4l2loopback_ctl_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = nonseekable_open, ++ .unlocked_ioctl = v4l2loopback_control_ioctl, ++ .compat_ioctl = v4l2loopback_control_ioctl, ++ .llseek = noop_llseek, ++ // clang-format on ++}; ++ ++static struct miscdevice v4l2loopback_misc = { ++ // clang-format off ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "v4l2loopback", ++ .fops = &v4l2loopback_ctl_fops, ++ // clang-format on ++}; ++ ++static const struct v4l2_file_operations v4l2_loopback_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = v4l2_loopback_open, ++ .release = v4l2_loopback_close, ++ .read = v4l2_loopback_read, ++ .write = v4l2_loopback_write, ++ .poll = v4l2_loopback_poll, ++ .mmap = v4l2_loopback_mmap, ++ .unlocked_ioctl = video_ioctl2, ++ // clang-format on ++}; ++ ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = { ++ // clang-format off ++ .vidioc_querycap = &vidioc_querycap, ++ .vidioc_enum_framesizes = &vidioc_enum_framesizes, ++ .vidioc_enum_frameintervals = &vidioc_enum_frameintervals, ++ ++ .vidioc_enum_output = &vidioc_enum_output, ++ .vidioc_g_output = &vidioc_g_output, ++ .vidioc_s_output = &vidioc_s_output, ++ ++ .vidioc_enum_input = &vidioc_enum_input, ++ .vidioc_g_input = &vidioc_g_input, ++ .vidioc_s_input = &vidioc_s_input, ++ ++ .vidioc_enum_fmt_vid_cap = &vidioc_enum_fmt_cap, ++ .vidioc_g_fmt_vid_cap = &vidioc_g_fmt_cap, ++ .vidioc_s_fmt_vid_cap = &vidioc_s_fmt_cap, ++ .vidioc_try_fmt_vid_cap = &vidioc_try_fmt_cap, ++ ++ .vidioc_enum_fmt_vid_out = &vidioc_enum_fmt_out, ++ .vidioc_s_fmt_vid_out = &vidioc_s_fmt_out, ++ .vidioc_g_fmt_vid_out = &vidioc_g_fmt_out, ++ .vidioc_try_fmt_vid_out = &vidioc_try_fmt_out, ++ ++#ifdef V4L2L_OVERLAY ++ .vidioc_s_fmt_vid_overlay = &vidioc_s_fmt_overlay, ++ .vidioc_g_fmt_vid_overlay = &vidioc_g_fmt_overlay, ++#endif ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ .vidioc_s_std = &vidioc_s_std, ++ 
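/* Editor's aside (illustrative sketch only, not part of the patch above):
 * v4l2loopback_control_ioctl() exposes device creation through the
 * /dev/v4l2loopback misc node. The sketch below assumes the v4l2loopback.h
 * header added by this patch is available to userspace; the label and limit
 * values are arbitrary examples. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include "v4l2loopback.h"   /* struct v4l2_loopback_config, V4L2LOOPBACK_CTL_ADD */

static int add_loopback_device(void)
{
	struct v4l2_loopback_config conf;
	int nr;
	int fd = open("/dev/v4l2loopback", O_RDWR);
	if (fd < 0)
		return -1;

	memset(&conf, 0, sizeof(conf));
	conf.output_nr = -1;                  /* negative: let the driver pick a free number */
	snprintf(conf.card_label, sizeof(conf.card_label), "example loopback");
	conf.max_buffers = 0;                 /* <=0: use the driver defaults */
	conf.max_openers = 0;
	conf.announce_all_caps = -1;          /* negative: use the module default */

	nr = ioctl(fd, V4L2LOOPBACK_CTL_ADD, &conf);   /* returns the new device number */
	close(fd);
	return nr;                            /* e.g. 2 for /dev/video2, or <0 on error */
}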
.vidioc_g_std = &vidioc_g_std, ++ .vidioc_querystd = &vidioc_querystd, ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ .vidioc_g_parm = &vidioc_g_parm, ++ .vidioc_s_parm = &vidioc_s_parm, ++ ++ .vidioc_reqbufs = &vidioc_reqbufs, ++ .vidioc_querybuf = &vidioc_querybuf, ++ .vidioc_qbuf = &vidioc_qbuf, ++ .vidioc_dqbuf = &vidioc_dqbuf, ++ ++ .vidioc_streamon = &vidioc_streamon, ++ .vidioc_streamoff = &vidioc_streamoff, ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++ .vidiocgmbuf = &vidiocgmbuf, ++#endif ++ ++ .vidioc_subscribe_event = &vidioc_subscribe_event, ++ .vidioc_unsubscribe_event = &v4l2_event_unsubscribe, ++ // clang-format on ++}; ++ ++static int free_device_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *dev = ptr; ++ v4l2_loopback_remove(dev); ++ return 0; ++} ++static void free_devices(void) ++{ ++ idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL); ++ idr_destroy(&v4l2loopback_index_idr); ++} ++ ++static int __init v4l2loopback_init_module(void) ++{ ++ const u32 min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH; ++ const u32 min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT; ++ int err; ++ int i; ++ MARK(); ++ ++ err = misc_register(&v4l2loopback_misc); ++ if (err < 0) ++ return err; ++ ++ if (devices < 0) { ++ devices = 1; ++ ++ /* try guessing the devices from the "video_nr" parameter */ ++ for (i = MAX_DEVICES - 1; i >= 0; i--) { ++ if (video_nr[i] >= 0) { ++ devices = i + 1; ++ break; ++ } ++ } ++ } ++ ++ if (devices > MAX_DEVICES) { ++ devices = MAX_DEVICES; ++ printk(KERN_INFO ++ "v4l2loopback: number of initial devices is limited to: %d\n", ++ MAX_DEVICES); ++ } ++ ++ if (max_buffers > MAX_BUFFERS) { ++ max_buffers = MAX_BUFFERS; ++ printk(KERN_INFO ++ "v4l2loopback: number of buffers is limited to: %d\n", ++ MAX_BUFFERS); ++ } ++ ++ if (max_openers < 0) { ++ printk(KERN_INFO ++ "v4l2loopback: allowing %d openers rather than %d\n", ++ 2, max_openers); ++ max_openers = 2; ++ } ++ ++ if (max_width < min_width) { ++ max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++ printk(KERN_INFO "v4l2loopback: using max_width %d\n", ++ max_width); ++ } ++ if (max_height < min_height) { ++ max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++ printk(KERN_INFO "v4l2loopback: using max_height %d\n", ++ max_height); ++ } ++ ++ for (i = 0; i < devices; i++) { ++ struct v4l2_loopback_config cfg = { ++ // clang-format off ++ .output_nr = video_nr[i], ++#ifdef SPLIT_DEVICES ++ .capture_nr = video_nr[i], ++#endif ++ .min_width = min_width, ++ .min_height = min_height, ++ .max_width = max_width, ++ .max_height = max_height, ++ .announce_all_caps = (!exclusive_caps[i]), ++ .max_buffers = max_buffers, ++ .max_openers = max_openers, ++ .debug = debug, ++ // clang-format on ++ }; ++ cfg.card_label[0] = 0; ++ if (card_label[i]) ++ snprintf(cfg.card_label, sizeof(cfg.card_label), "%s", ++ card_label[i]); ++ err = v4l2_loopback_add(&cfg, 0); ++ if (err) { ++ free_devices(); ++ goto error; ++ } ++ } ++ ++ dprintk("module installed\n"); ++ ++ printk(KERN_INFO "v4l2loopback driver version %d.%d.%d%s loaded\n", ++ // clang-format off ++ (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE >> 8) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE ) & 0xff, ++#ifdef SNAPSHOT_VERSION ++ " (" __stringify(SNAPSHOT_VERSION) ")" ++#else ++ "" ++#endif ++ ); ++ // clang-format on ++ ++ return 0; ++error: ++ misc_deregister(&v4l2loopback_misc); ++ return err; ++} ++ ++static void v4l2loopback_cleanup_module(void) ++{ ++ MARK(); ++ /* unregister the device -> it deletes /dev/video* */ ++ free_devices(); ++ /* 
and get rid of /dev/v4l2loopback */ ++ misc_deregister(&v4l2loopback_misc); ++ dprintk("module removed\n"); ++} ++ ++MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR); ++ ++module_init(v4l2loopback_init_module); ++module_exit(v4l2loopback_cleanup_module); +diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h +new file mode 100644 +index 000000000000..1bc7e6b747a4 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.h +@@ -0,0 +1,98 @@ ++/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ ++/* ++ * v4l2loopback.h ++ * ++ * Written by IOhannes m zmölnig, 7/1/20. ++ * ++ * Copyright 2020 by IOhannes m zmölnig. Redistribution of this file is ++ * permitted under the GNU General Public License. ++ */ ++#ifndef _V4L2LOOPBACK_H ++#define _V4L2LOOPBACK_H ++ ++#define V4L2LOOPBACK_VERSION_MAJOR 0 ++#define V4L2LOOPBACK_VERSION_MINOR 13 ++#define V4L2LOOPBACK_VERSION_BUGFIX 1 ++ ++/* /dev/v4l2loopback interface */ ++ ++struct v4l2_loopback_config { ++ /** ++ * the device-number (/dev/video) ++ * V4L2LOOPBACK_CTL_ADD: ++ * setting this to a value<0, will allocate an available one ++ * if nr>=0 and the device already exists, the ioctl will EEXIST ++ * if output_nr and capture_nr are the same, only a single device will be created ++ * NOTE: currently split-devices (where output_nr and capture_nr differ) ++ * are not implemented yet. ++ * until then, requesting different device-IDs will result in EINVAL. ++ * ++ * V4L2LOOPBACK_CTL_QUERY: ++ * either both output_nr and capture_nr must refer to the same loopback, ++ * or one (and only one) of them must be -1 ++ * ++ */ ++ int output_nr; ++ int unused; /*capture_nr;*/ ++ ++ /** ++ * a nice name for your device ++ * if (*card_label)==0, an automatic name is assigned ++ */ ++ char card_label[32]; ++ ++ /** ++ * allowed frame size ++ * if too low, default values are used ++ */ ++ unsigned int min_width; ++ unsigned int max_width; ++ unsigned int min_height; ++ unsigned int max_height; ++ ++ /** ++ * number of buffers to allocate for the queue ++ * if set to <=0, default values are used ++ */ ++ int max_buffers; ++ ++ /** ++ * how many consumers are allowed to open this device concurrently ++ * if set to <=0, default values are used ++ */ ++ int max_openers; ++ ++ /** ++ * set the debugging level for this device ++ */ ++ int debug; ++ ++ /** ++ * whether to announce OUTPUT/CAPTURE capabilities exclusively ++ * for this device or not ++ * (!exclusive_caps) ++ * NOTE: this is going to be removed once separate output/capture ++ * devices are implemented ++ */ ++ int announce_all_caps; ++}; ++ ++/* a pointer to a (struct v4l2_loopback_config) that has all values you wish to impose on the ++ * to-be-created device set. ++ * if the ptr is NULL, a new device is created with default values at the driver's discretion. 
++ * ++ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY, ++ * to get more information on the device) ++ */ ++#define V4L2LOOPBACK_CTL_ADD 0x4C80 ++ ++/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set ++ * (the two values must either refer to video-devices associated with the same loopback device ++ * or exactly one of them must be <0 ++ */ ++#define V4L2LOOPBACK_CTL_QUERY 0x4C82 ++ ++/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */ ++#define V4L2LOOPBACK_CTL_REMOVE 0x4C81 ++ ++#endif /* _V4L2LOOPBACK_H */ +diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h +new file mode 100644 +index 000000000000..d855a3796554 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback_formats.h +@@ -0,0 +1,445 @@ ++static const struct v4l2l_format formats[] = { ++#ifndef V4L2_PIX_FMT_VP9 ++#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0') ++#endif ++#ifndef V4L2_PIX_FMT_HEVC ++#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C') ++#endif ++ ++ /* here come the packed formats */ ++ { ++ .name = "32 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "32 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR24, ++ .depth = 24, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB24, ++ .depth = 24, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_ABGR32 ++ { ++ .name = "32 bpp RGBA, le", ++ .fourcc = V4L2_PIX_FMT_ABGR32, ++ .depth = 32, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_RGBA32 ++ { ++ .name = "32 bpp RGBA", ++ .fourcc = V4L2_PIX_FMT_RGBA32, ++ .depth = 32, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_RGB332 ++ { ++ .name = "8 bpp RGB-3-3-2", ++ .fourcc = V4L2_PIX_FMT_RGB332, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB332 */ ++#ifdef V4L2_PIX_FMT_RGB444 ++ { ++ .name = "16 bpp RGB (xxxxrrrr ggggbbbb)", ++ .fourcc = V4L2_PIX_FMT_RGB444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB444 */ ++#ifdef V4L2_PIX_FMT_RGB555 ++ { ++ .name = "16 bpp RGB-5-5-5", ++ .fourcc = V4L2_PIX_FMT_RGB555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555 */ ++#ifdef V4L2_PIX_FMT_RGB565 ++ { ++ .name = "16 bpp RGB-5-6-5", ++ .fourcc = V4L2_PIX_FMT_RGB565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565 */ ++#ifdef V4L2_PIX_FMT_RGB555X ++ { ++ .name = "16 bpp RGB-5-5-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB555X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555X */ ++#ifdef V4L2_PIX_FMT_RGB565X ++ { ++ .name = "16 bpp RGB-5-6-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB565X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565X */ ++#ifdef V4L2_PIX_FMT_BGR666 ++ { ++ .name = "18 bpp BGR-6-6-6", ++ .fourcc = V4L2_PIX_FMT_BGR666, ++ .depth = 18, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_BGR666 */ ++ { ++ .name = "4:2:2, packed, YUYV", ++ .fourcc = V4L2_PIX_FMT_YUYV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "4:2:2, packed, UYVY", ++ .fourcc = V4L2_PIX_FMT_UYVY, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YVYU ++ { ++ .name = "4:2:2, packed YVYU", ++ .fourcc = V4L2_PIX_FMT_YVYU, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_VYUY ++ { ++ .name = "4:2:2, packed VYUY", ++ .fourcc = V4L2_PIX_FMT_VYUY, ++ .depth 
= 16, ++ .flags = 0, ++ }, ++#endif ++ { ++ .name = "4:2:2, packed YYUV", ++ .fourcc = V4L2_PIX_FMT_YYUV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "YUV-8-8-8-8", ++ .fourcc = V4L2_PIX_FMT_YUV32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "8 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_GREY, ++ .depth = 8, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_Y4 ++ { ++ .name = "4 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y4, ++ .depth = 4, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y4 */ ++#ifdef V4L2_PIX_FMT_Y6 ++ { ++ .name = "6 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y6, ++ .depth = 6, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y6 */ ++#ifdef V4L2_PIX_FMT_Y10 ++ { ++ .name = "10 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y10, ++ .depth = 10, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y10 */ ++#ifdef V4L2_PIX_FMT_Y12 ++ { ++ .name = "12 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y12, ++ .depth = 12, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y12 */ ++ { ++ .name = "16 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y16, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YUV444 ++ { ++ .name = "16 bpp xxxxyyyy uuuuvvvv", ++ .fourcc = V4L2_PIX_FMT_YUV444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV444 */ ++#ifdef V4L2_PIX_FMT_YUV555 ++ { ++ .name = "16 bpp YUV-5-5-5", ++ .fourcc = V4L2_PIX_FMT_YUV555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV555 */ ++#ifdef V4L2_PIX_FMT_YUV565 ++ { ++ .name = "16 bpp YUV-5-6-5", ++ .fourcc = V4L2_PIX_FMT_YUV565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV565 */ ++ ++/* bayer formats */ ++#ifdef V4L2_PIX_FMT_SRGGB8 ++ { ++ .name = "Bayer RGGB 8bit", ++ .fourcc = V4L2_PIX_FMT_SRGGB8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SRGGB8 */ ++#ifdef V4L2_PIX_FMT_SGRBG8 ++ { ++ .name = "Bayer GRBG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGRBG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGRBG8 */ ++#ifdef V4L2_PIX_FMT_SGBRG8 ++ { ++ .name = "Bayer GBRG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGBRG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGBRG8 */ ++#ifdef V4L2_PIX_FMT_SBGGR8 ++ { ++ .name = "Bayer BA81 8bit", ++ .fourcc = V4L2_PIX_FMT_SBGGR8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SBGGR8 */ ++ ++ /* here come the planar formats */ ++ { ++ .name = "4:1:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:1:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#ifdef V4L2_PIX_FMT_YUV422P ++ { ++ .name = "16 bpp YVU422 planar", ++ .fourcc = V4L2_PIX_FMT_YUV422P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV422P */ ++#ifdef V4L2_PIX_FMT_YUV411P ++ { ++ .name = "16 bpp YVU411 planar", ++ .fourcc = V4L2_PIX_FMT_YUV411P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV411P */ ++#ifdef V4L2_PIX_FMT_Y41P ++ { ++ .name = "12 bpp YUV 4:1:1", ++ .fourcc = V4L2_PIX_FMT_Y41P, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_Y41P */ ++#ifdef V4L2_PIX_FMT_NV12 ++ { ++ .name = "12 bpp Y/CbCr 4:2:0 ", ++ .fourcc = V4L2_PIX_FMT_NV12, ++ 
.depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_NV12 */ ++ ++/* here come the compressed formats */ ++ ++#ifdef V4L2_PIX_FMT_MJPEG ++ { ++ .name = "Motion-JPEG", ++ .fourcc = V4L2_PIX_FMT_MJPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MJPEG */ ++#ifdef V4L2_PIX_FMT_JPEG ++ { ++ .name = "JFIF JPEG", ++ .fourcc = V4L2_PIX_FMT_JPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_JPEG */ ++#ifdef V4L2_PIX_FMT_DV ++ { ++ .name = "DV1394", ++ .fourcc = V4L2_PIX_FMT_DV, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_DV */ ++#ifdef V4L2_PIX_FMT_MPEG ++ { ++ .name = "MPEG-1/2/4 Multiplexed", ++ .fourcc = V4L2_PIX_FMT_MPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG */ ++#ifdef V4L2_PIX_FMT_H264 ++ { ++ .name = "H264 with start codes", ++ .fourcc = V4L2_PIX_FMT_H264, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264 */ ++#ifdef V4L2_PIX_FMT_H264_NO_SC ++ { ++ .name = "H264 without start codes", ++ .fourcc = V4L2_PIX_FMT_H264_NO_SC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_NO_SC */ ++#ifdef V4L2_PIX_FMT_H264_MVC ++ { ++ .name = "H264 MVC", ++ .fourcc = V4L2_PIX_FMT_H264_MVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_MVC */ ++#ifdef V4L2_PIX_FMT_H263 ++ { ++ .name = "H263", ++ .fourcc = V4L2_PIX_FMT_H263, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H263 */ ++#ifdef V4L2_PIX_FMT_MPEG1 ++ { ++ .name = "MPEG-1 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG1, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG1 */ ++#ifdef V4L2_PIX_FMT_MPEG2 ++ { ++ .name = "MPEG-2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG2, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG2 */ ++#ifdef V4L2_PIX_FMT_MPEG4 ++ { ++ .name = "MPEG-4 part 2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG4, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG4 */ ++#ifdef V4L2_PIX_FMT_XVID ++ { ++ .name = "Xvid", ++ .fourcc = V4L2_PIX_FMT_XVID, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_XVID */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_G ++ { ++ .name = "SMPTE 421M Annex G compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_G, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_L ++ { ++ .name = "SMPTE 421M Annex L compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_L, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */ ++#ifdef V4L2_PIX_FMT_VP8 ++ { ++ .name = "VP8", ++ .fourcc = V4L2_PIX_FMT_VP8, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP8 */ ++#ifdef V4L2_PIX_FMT_VP9 ++ { ++ .name = "VP9", ++ .fourcc = V4L2_PIX_FMT_VP9, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP9 */ ++#ifdef V4L2_PIX_FMT_HEVC ++ { ++ .name = "HEVC", ++ .fourcc = V4L2_PIX_FMT_HEVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_HEVC */ ++}; diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile index f2b19e6174af..4fef4b174321 100644 --- a/drivers/pci/controller/Makefile @@ -9046,7 +11294,7 @@ index 000000000000..e105e6f5cc91 
+MODULE_AUTHOR("Daniel Drake "); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c -index eff7f5df08e2..cfb099dbeb5f 100644 +index 568410e64ce6..192d0557fb05 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -3732,6 +3732,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) @@ -9164,600 +11412,11 @@ index eff7f5df08e2..cfb099dbeb5f 100644 { 0 } }; -diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig -index bdd302274b9a..0afc3e4c1880 100644 ---- a/drivers/platform/x86/Kconfig -+++ b/drivers/platform/x86/Kconfig -@@ -1127,6 +1127,20 @@ config SEL3350_PLATFORM - To compile this driver as a module, choose M here: the module - will be called sel3350-platform. - -+config STEAMDECK -+ tristate "Valve Steam Deck platform driver" -+ depends on X86_64 -+ help -+ Driver exposing various bits and pieces of functionality -+ provided by Steam Deck specific VLV0100 device presented by -+ EC firmware. This includes but not limited to: -+ - CPU/device's fan control -+ - Read-only access to DDIC registers -+ - Battery tempreature measurements -+ - Various display related control knobs -+ - USB Type-C connector event notification -+ Say N unless you are running on a Steam Deck. -+ - endif # X86_PLATFORM_DEVICES - - config P2SB -diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile -index 1de432e8861e..59bfbd2649eb 100644 ---- a/drivers/platform/x86/Makefile -+++ b/drivers/platform/x86/Makefile -@@ -144,3 +144,6 @@ obj-$(CONFIG_WINMATE_FM07_KEYS) += winmate-fm07-keys.o - - # SEL - obj-$(CONFIG_SEL3350_PLATFORM) += sel3350-platform.o -+ -+# Steam Deck -+obj-$(CONFIG_STEAMDECK) += steamdeck.o -diff --git a/drivers/platform/x86/steamdeck.c b/drivers/platform/x86/steamdeck.c -new file mode 100644 -index 000000000000..77a6677ec19e ---- /dev/null -+++ b/drivers/platform/x86/steamdeck.c -@@ -0,0 +1,523 @@ -+// SPDX-License-Identifier: GPL-2.0+ -+ -+/* -+ * Steam Deck ACPI platform driver -+ * -+ * Copyright (C) 2021-2022 Valve Corporation -+ * -+ */ -+#include -+#include -+#include -+#include -+#include -+ -+#define ACPI_STEAMDECK_NOTIFY_STATUS 0x80 -+ -+/* 0 - port connected, 1 -port disconnected */ -+#define ACPI_STEAMDECK_PORT_CONNECT BIT(0) -+/* 0 - Upstream Facing Port, 1 - Downdstream Facing Port */ -+#define ACPI_STEAMDECK_CUR_DATA_ROLE BIT(3) -+/* -+ * Debouncing delay to allow negotiation process to settle. 2s value -+ * was arrived at via trial and error. 
-+ */ -+#define STEAMDECK_ROLE_SWITCH_DELAY (msecs_to_jiffies(2000)) -+ -+struct steamdeck { -+ struct acpi_device *adev; -+ struct device *hwmon; -+ void *regmap; -+ long fan_target; -+ struct delayed_work role_work; -+ struct extcon_dev *edev; -+ struct device *dev; -+}; -+ -+static ssize_t -+steamdeck_simple_store(struct device *dev, const char *buf, size_t count, -+ const char *method, -+ unsigned long upper_limit) -+{ -+ struct steamdeck *fan = dev_get_drvdata(dev); -+ unsigned long value; -+ -+ if (kstrtoul(buf, 10, &value) || value >= upper_limit) -+ return -EINVAL; -+ -+ if (ACPI_FAILURE(acpi_execute_simple_method(fan->adev->handle, -+ (char *)method, value))) -+ return -EIO; -+ -+ return count; -+} -+ -+#define STEAMDECK_ATTR_WO(_name, _method, _upper_limit) \ -+ static ssize_t _name##_store(struct device *dev, \ -+ struct device_attribute *attr, \ -+ const char *buf, size_t count) \ -+ { \ -+ return steamdeck_simple_store(dev, buf, count, \ -+ _method, \ -+ _upper_limit); \ -+ } \ -+ static DEVICE_ATTR_WO(_name) -+ -+STEAMDECK_ATTR_WO(target_cpu_temp, "STCT", U8_MAX / 2); -+STEAMDECK_ATTR_WO(gain, "SGAN", U16_MAX); -+STEAMDECK_ATTR_WO(ramp_rate, "SFRR", U8_MAX); -+STEAMDECK_ATTR_WO(hysteresis, "SHTS", U16_MAX); -+STEAMDECK_ATTR_WO(maximum_battery_charge_rate, "CHGR", U16_MAX); -+STEAMDECK_ATTR_WO(recalculate, "SCHG", U16_MAX); -+ -+STEAMDECK_ATTR_WO(led_brightness, "CHBV", U8_MAX); -+STEAMDECK_ATTR_WO(content_adaptive_brightness, "CABC", U8_MAX); -+STEAMDECK_ATTR_WO(gamma_set, "GAMA", U8_MAX); -+STEAMDECK_ATTR_WO(display_brightness, "WDBV", U8_MAX); -+STEAMDECK_ATTR_WO(ctrl_display, "WCDV", U8_MAX); -+STEAMDECK_ATTR_WO(cabc_minimum_brightness, "WCMB", U8_MAX); -+STEAMDECK_ATTR_WO(memory_data_access_control, "MDAC", U8_MAX); -+ -+#define STEAMDECK_ATTR_WO_NOARG(_name, _method) \ -+ static ssize_t _name##_store(struct device *dev, \ -+ struct device_attribute *attr, \ -+ const char *buf, size_t count) \ -+ { \ -+ struct steamdeck *fan = dev_get_drvdata(dev); \ -+ \ -+ if (ACPI_FAILURE(acpi_evaluate_object(fan->adev->handle, \ -+ _method, NULL, NULL))) \ -+ return -EIO; \ -+ \ -+ return count; \ -+ } \ -+ static DEVICE_ATTR_WO(_name) -+ -+STEAMDECK_ATTR_WO_NOARG(power_cycle_display, "DPCY"); -+STEAMDECK_ATTR_WO_NOARG(display_normal_mode_on, "NORO"); -+STEAMDECK_ATTR_WO_NOARG(display_inversion_off, "INOF"); -+STEAMDECK_ATTR_WO_NOARG(display_inversion_on, "INON"); -+STEAMDECK_ATTR_WO_NOARG(idle_mode_on, "WRNE"); -+ -+#define STEAMDECK_ATTR_RO(_name, _method) \ -+ static ssize_t _name##_show(struct device *dev, \ -+ struct device_attribute *attr, \ -+ char *buf) \ -+ { \ -+ struct steamdeck *jup = dev_get_drvdata(dev); \ -+ unsigned long long val; \ -+ \ -+ if (ACPI_FAILURE(acpi_evaluate_integer( \ -+ jup->adev->handle, \ -+ _method, NULL, &val))) \ -+ return -EIO; \ -+ \ -+ return sprintf(buf, "%llu\n", val); \ -+ } \ -+ static DEVICE_ATTR_RO(_name) -+ -+STEAMDECK_ATTR_RO(firmware_version, "PDFW"); -+STEAMDECK_ATTR_RO(board_id, "BOID"); -+STEAMDECK_ATTR_RO(pdcs, "PDCS"); -+ -+static umode_t -+steamdeck_is_visible(struct kobject *kobj, struct attribute *attr, int index) -+{ -+ return attr->mode; -+} -+ -+static struct attribute *steamdeck_attributes[] = { -+ &dev_attr_target_cpu_temp.attr, -+ &dev_attr_gain.attr, -+ &dev_attr_ramp_rate.attr, -+ &dev_attr_hysteresis.attr, -+ &dev_attr_maximum_battery_charge_rate.attr, -+ &dev_attr_recalculate.attr, -+ &dev_attr_power_cycle_display.attr, -+ -+ &dev_attr_led_brightness.attr, -+ &dev_attr_content_adaptive_brightness.attr, -+ 
&dev_attr_gamma_set.attr, -+ &dev_attr_display_brightness.attr, -+ &dev_attr_ctrl_display.attr, -+ &dev_attr_cabc_minimum_brightness.attr, -+ &dev_attr_memory_data_access_control.attr, -+ -+ &dev_attr_display_normal_mode_on.attr, -+ &dev_attr_display_inversion_off.attr, -+ &dev_attr_display_inversion_on.attr, -+ &dev_attr_idle_mode_on.attr, -+ -+ &dev_attr_firmware_version.attr, -+ &dev_attr_board_id.attr, -+ &dev_attr_pdcs.attr, -+ -+ NULL -+}; -+ -+static const struct attribute_group steamdeck_group = { -+ .attrs = steamdeck_attributes, -+ .is_visible = steamdeck_is_visible, -+}; -+ -+static const struct attribute_group *steamdeck_groups[] = { -+ &steamdeck_group, -+ NULL -+}; -+ -+static int steamdeck_read_fan_speed(struct steamdeck *jup, long *speed) -+{ -+ unsigned long long val; -+ -+ if (ACPI_FAILURE(acpi_evaluate_integer(jup->adev->handle, -+ "FANR", NULL, &val))) -+ return -EIO; -+ -+ *speed = val; -+ return 0; -+} -+ -+static int -+steamdeck_hwmon_read(struct device *dev, enum hwmon_sensor_types type, -+ u32 attr, int channel, long *out) -+{ -+ struct steamdeck *sd = dev_get_drvdata(dev); -+ unsigned long long val; -+ -+ switch (type) { -+ case hwmon_temp: -+ if (attr != hwmon_temp_input) -+ return -EOPNOTSUPP; -+ -+ if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle, -+ "BATT", NULL, &val))) -+ return -EIO; -+ /* -+ * Assuming BATT returns deg C we need to mutiply it -+ * by 1000 to convert to mC -+ */ -+ *out = val * 1000; -+ break; -+ case hwmon_fan: -+ switch (attr) { -+ case hwmon_fan_input: -+ return steamdeck_read_fan_speed(sd, out); -+ case hwmon_fan_target: -+ *out = sd->fan_target; -+ break; -+ case hwmon_fan_fault: -+ if (ACPI_FAILURE(acpi_evaluate_integer( -+ sd->adev->handle, -+ "FANC", NULL, &val))) -+ return -EIO; -+ /* -+ * FANC (Fan check): -+ * 0: Abnormal -+ * 1: Normal -+ */ -+ *out = !val; -+ break; -+ default: -+ return -EOPNOTSUPP; -+ } -+ break; -+ default: -+ return -EOPNOTSUPP; -+ } -+ -+ return 0; -+} -+ -+static int -+steamdeck_hwmon_read_string(struct device *dev, enum hwmon_sensor_types type, -+ u32 attr, int channel, const char **str) -+{ -+ switch (type) { -+ case hwmon_temp: -+ *str = "Battery Temp"; -+ break; -+ case hwmon_fan: -+ *str = "System Fan"; -+ break; -+ default: -+ return -EOPNOTSUPP; -+ } -+ -+ return 0; -+} -+ -+static int -+steamdeck_hwmon_write(struct device *dev, enum hwmon_sensor_types type, -+ u32 attr, int channel, long val) -+{ -+ struct steamdeck *sd = dev_get_drvdata(dev); -+ -+ if (type != hwmon_fan || -+ attr != hwmon_fan_target) -+ return -EOPNOTSUPP; -+ -+ if (val > U16_MAX) -+ return -EINVAL; -+ -+ sd->fan_target = val; -+ -+ if (ACPI_FAILURE(acpi_execute_simple_method(sd->adev->handle, -+ "FANS", val))) -+ return -EIO; -+ -+ return 0; -+} -+ -+static umode_t -+steamdeck_hwmon_is_visible(const void *data, enum hwmon_sensor_types type, -+ u32 attr, int channel) -+{ -+ if (type == hwmon_fan && -+ attr == hwmon_fan_target) -+ return 0644; -+ -+ return 0444; -+} -+ -+static const struct hwmon_channel_info *steamdeck_info[] = { -+ HWMON_CHANNEL_INFO(temp, -+ HWMON_T_INPUT | HWMON_T_LABEL), -+ HWMON_CHANNEL_INFO(fan, -+ HWMON_F_INPUT | HWMON_F_LABEL | -+ HWMON_F_TARGET | HWMON_F_FAULT), -+ NULL -+}; -+ -+static const struct hwmon_ops steamdeck_hwmon_ops = { -+ .is_visible = steamdeck_hwmon_is_visible, -+ .read = steamdeck_hwmon_read, -+ .read_string = steamdeck_hwmon_read_string, -+ .write = steamdeck_hwmon_write, -+}; -+ -+static const struct hwmon_chip_info steamdeck_chip_info = { -+ .ops = &steamdeck_hwmon_ops, -+ 
.info = steamdeck_info, -+}; -+ -+#define STEAMDECK_STA_OK \ -+ (ACPI_STA_DEVICE_ENABLED | \ -+ ACPI_STA_DEVICE_PRESENT | \ -+ ACPI_STA_DEVICE_FUNCTIONING) -+ -+static int -+steamdeck_ddic_reg_read(void *context, unsigned int reg, unsigned int *val) -+{ -+ union acpi_object obj = { .type = ACPI_TYPE_INTEGER }; -+ struct acpi_object_list arg_list = { .count = 1, .pointer = &obj, }; -+ struct steamdeck *sd = context; -+ unsigned long long _val; -+ -+ obj.integer.value = reg; -+ -+ if (ACPI_FAILURE(acpi_evaluate_integer(sd->adev->handle, -+ "RDDI", &arg_list, &_val))) -+ return -EIO; -+ -+ *val = _val; -+ return 0; -+} -+ -+static int steamdeck_read_pdcs(struct steamdeck *sd, unsigned long long *pdcs) -+{ -+ acpi_status status; -+ -+ status = acpi_evaluate_integer(sd->adev->handle, "PDCS", NULL, pdcs); -+ if (ACPI_FAILURE(status)) { -+ dev_err(sd->dev, "PDCS evaluation failed: %s\n", -+ acpi_format_exception(status)); -+ return -EIO; -+ } -+ -+ return 0; -+} -+ -+static void steamdeck_usb_role_work(struct work_struct *work) -+{ -+ struct steamdeck *sd = -+ container_of(work, struct steamdeck, role_work.work); -+ unsigned long long pdcs; -+ bool usb_host; -+ -+ if (steamdeck_read_pdcs(sd, &pdcs)) -+ return; -+ -+ /* -+ * We only care about these two -+ */ -+ pdcs &= ACPI_STEAMDECK_PORT_CONNECT | ACPI_STEAMDECK_CUR_DATA_ROLE; -+ -+ /* -+ * For "connect" events our role is determined by a bit in -+ * PDCS, for "disconnect" we switch to being a gadget -+ * unconditionally. The thinking for the latter is we don't -+ * want to start acting as a USB host until we get -+ * confirmation from the firmware that we are a USB host -+ */ -+ usb_host = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ? -+ pdcs & ACPI_STEAMDECK_CUR_DATA_ROLE : false; -+ -+ WARN_ON(extcon_set_state_sync(sd->edev, EXTCON_USB_HOST, -+ usb_host)); -+ dev_dbg(sd->dev, "USB role is %s\n", usb_host ? "host" : "device"); -+} -+ -+static void steamdeck_notify(acpi_handle handle, u32 event, void *context) -+{ -+ struct device *dev = context; -+ struct steamdeck *sd = dev_get_drvdata(dev); -+ unsigned long long pdcs; -+ unsigned long delay; -+ -+ switch (event) { -+ case ACPI_STEAMDECK_NOTIFY_STATUS: -+ if (steamdeck_read_pdcs(sd, &pdcs)) -+ return; -+ /* -+ * We process "disconnect" events immediately and -+ * "connect" events with a delay to give the HW time -+ * to settle. For example attaching USB hub (at least -+ * for HW used for testing) will generate intermediary -+ * event with "host" bit not set, followed by the one -+ * that does have it set. -+ */ -+ delay = (pdcs & ACPI_STEAMDECK_PORT_CONNECT) ? 
-+ STEAMDECK_ROLE_SWITCH_DELAY : 0; -+ -+ queue_delayed_work(system_long_wq, &sd->role_work, delay); -+ break; -+ default: -+ dev_err(dev, "Unsupported event [0x%x]\n", event); -+ } -+} -+ -+static void steamdeck_remove_notify_handler(void *data) -+{ -+ struct steamdeck *sd = data; -+ -+ acpi_remove_notify_handler(sd->adev->handle, ACPI_DEVICE_NOTIFY, -+ steamdeck_notify); -+ cancel_delayed_work_sync(&sd->role_work); -+} -+ -+static const unsigned int steamdeck_extcon_cable[] = { -+ EXTCON_USB, -+ EXTCON_USB_HOST, -+ EXTCON_CHG_USB_SDP, -+ EXTCON_CHG_USB_CDP, -+ EXTCON_CHG_USB_DCP, -+ EXTCON_CHG_USB_ACA, -+ EXTCON_NONE, -+}; -+ -+static int steamdeck_probe(struct platform_device *pdev) -+{ -+ struct device *dev = &pdev->dev; -+ struct steamdeck *sd; -+ acpi_status status; -+ unsigned long long sta; -+ int ret; -+ -+ static const struct regmap_config regmap_config = { -+ .reg_bits = 8, -+ .val_bits = 8, -+ .max_register = 255, -+ .cache_type = REGCACHE_NONE, -+ .reg_read = steamdeck_ddic_reg_read, -+ }; -+ -+ sd = devm_kzalloc(dev, sizeof(*sd), GFP_KERNEL); -+ if (!sd) -+ return -ENOMEM; -+ sd->adev = ACPI_COMPANION(&pdev->dev); -+ sd->dev = dev; -+ platform_set_drvdata(pdev, sd); -+ INIT_DELAYED_WORK(&sd->role_work, steamdeck_usb_role_work); -+ -+ status = acpi_evaluate_integer(sd->adev->handle, "_STA", -+ NULL, &sta); -+ if (ACPI_FAILURE(status)) { -+ dev_err(dev, "Status check failed (0x%x)\n", status); -+ return -EINVAL; -+ } -+ -+ if ((sta & STEAMDECK_STA_OK) != STEAMDECK_STA_OK) { -+ dev_err(dev, "Device is not ready\n"); -+ return -EINVAL; -+ } -+ -+ /* -+ * Our ACPI interface doesn't expose a method to read current -+ * fan target, so we use current fan speed as an -+ * approximation. -+ */ -+ if (steamdeck_read_fan_speed(sd, &sd->fan_target)) -+ dev_warn(dev, "Failed to read fan speed"); -+ -+ sd->hwmon = devm_hwmon_device_register_with_info(dev, -+ "steamdeck", -+ sd, -+ &steamdeck_chip_info, -+ steamdeck_groups); -+ if (IS_ERR(sd->hwmon)) { -+ dev_err(dev, "Failed to register HWMON device"); -+ return PTR_ERR(sd->hwmon); -+ } -+ -+ sd->regmap = devm_regmap_init(dev, NULL, sd, ®map_config); -+ if (IS_ERR(sd->regmap)) -+ dev_err(dev, "Failed to register REGMAP"); -+ -+ sd->edev = devm_extcon_dev_allocate(dev, steamdeck_extcon_cable); -+ if (IS_ERR(sd->edev)) -+ return -ENOMEM; -+ -+ ret = devm_extcon_dev_register(dev, sd->edev); -+ if (ret < 0) { -+ dev_err(dev, "Failed to register extcon device: %d\n", ret); -+ return ret; -+ } -+ -+ /* -+ * Set initial role value -+ */ -+ queue_delayed_work(system_long_wq, &sd->role_work, 0); -+ flush_delayed_work(&sd->role_work); -+ -+ status = acpi_install_notify_handler(sd->adev->handle, -+ ACPI_DEVICE_NOTIFY, -+ steamdeck_notify, -+ dev); -+ if (ACPI_FAILURE(status)) { -+ dev_err(dev, "Error installing ACPI notify handler\n"); -+ return -EIO; -+ } -+ -+ ret = devm_add_action_or_reset(dev, steamdeck_remove_notify_handler, -+ sd); -+ return ret; -+} -+ -+static const struct acpi_device_id steamdeck_device_ids[] = { -+ { "VLV0100", 0 }, -+ { "", 0 }, -+}; -+MODULE_DEVICE_TABLE(acpi, steamdeck_device_ids); -+ -+static struct platform_driver steamdeck_driver = { -+ .probe = steamdeck_probe, -+ .driver = { -+ .name = "steamdeck", -+ .acpi_match_table = steamdeck_device_ids, -+ }, -+}; -+module_platform_driver(steamdeck_driver); -+ -+MODULE_AUTHOR("Andrey Smirnov "); -+MODULE_DESCRIPTION("Steam Deck ACPI platform driver"); -+MODULE_LICENSE("GPL"); -diff --git a/include/linux/mm.h b/include/linux/mm.h -index f5a97dec5169..397ad6f1ac39 100644 ---- 
a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -191,10 +191,18 @@ static inline void __mm_zero_struct_page(struct page *page) - * that. - */ - #define MAPCOUNT_ELF_CORE_MARGIN (5) --#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -+#define DEFAULT_MAX_MAP_COUNT (INT_MAX - MAPCOUNT_ELF_CORE_MARGIN) - - extern int sysctl_max_map_count; - -+extern bool sysctl_workingset_protection; -+extern u8 sysctl_anon_min_ratio; -+extern u8 sysctl_clean_low_ratio; -+extern u8 sysctl_clean_min_ratio; -+int vm_workingset_protection_update_handler( -+ struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos); -+ - extern unsigned long sysctl_user_reserve_kbytes; - extern unsigned long sysctl_admin_reserve_kbytes; - diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h -index 2df35e65557d..a52bd9f4b632 100644 +index a0a026d2d244..8bece21a8998 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h -@@ -1268,7 +1268,7 @@ struct readahead_control { +@@ -1281,7 +1281,7 @@ struct readahead_control { ._index = i, \ } @@ -9789,7 +11448,7 @@ index 6030a8235617..60b7fe5fa74a 100644 { return &init_user_ns; diff --git a/init/Kconfig b/init/Kconfig -index bee58f7468c3..9ea39297f149 100644 +index febdea2afc3b..3ba6142f2f42 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -132,6 +132,10 @@ config THREAD_INFO_IN_TASK @@ -9803,7 +11462,7 @@ index bee58f7468c3..9ea39297f149 100644 config BROKEN bool -@@ -1247,6 +1251,22 @@ config USER_NS +@@ -1251,6 +1255,22 @@ config USER_NS If unsure, say N. @@ -9826,7 +11485,7 @@ index bee58f7468c3..9ea39297f149 100644 config PID_NS bool "PID Namespaces" default y -@@ -1389,6 +1409,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE +@@ -1393,6 +1413,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE with the "-O2" compiler flag for best performance and most helpful compile-time warnings. @@ -9882,12 +11541,12 @@ index 38ef6d06888e..0f78364efd4f 100644 config SCHED_HRTICK diff --git a/kernel/fork.c b/kernel/fork.c -index 0d944e92a43f..5449c990a91a 100644 +index 99076dbe27d8..18750b83c564 100644 --- a/kernel/fork.c +++ b/kernel/fork.c -@@ -102,6 +102,10 @@ - #include - #include +@@ -104,6 +104,10 @@ + #include + #include +#ifdef CONFIG_USER_NS +#include @@ -9896,7 +11555,7 @@ index 0d944e92a43f..5449c990a91a 100644 #include #include #include -@@ -2260,6 +2264,10 @@ __latent_entropy struct task_struct *copy_process( +@@ -2154,6 +2158,10 @@ __latent_entropy struct task_struct *copy_process( if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -9907,7 +11566,7 @@ index 0d944e92a43f..5449c990a91a 100644 /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. 
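As a quick aside on the copy_process() gate added above: a minimal userspace sketch (not part of the patch) of the expected behaviour once the kernel.unprivileged_userns_clone sysctl introduced by this series is set to 0 — an unprivileged unshare(CLONE_NEWUSER) should come back with EPERM, while a value of 1 keeps the stock behaviour.

#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Try to create a new user namespace as the calling (unprivileged) user. */
	if (unshare(CLONE_NEWUSER) == 0)
		puts("user namespace created (sysctl is 1 or caller is privileged)");
	else if (errno == EPERM)
		puts("EPERM: unprivileged user namespace creation is disabled");
	else
		printf("unshare(CLONE_NEWUSER) failed: %s\n", strerror(errno));
	return 0;
}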
-@@ -3406,6 +3414,12 @@ int ksys_unshare(unsigned long unshare_flags) +@@ -3301,6 +3309,12 @@ int ksys_unshare(unsigned long unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -9921,7 +11580,7 @@ index 0d944e92a43f..5449c990a91a 100644 if (err) goto bad_unshare_out; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index e2b4e0396af8..97983b041e9d 100644 +index 24dda708b699..c2bb8eb1d6ba 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -73,10 +73,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; @@ -9942,9 +11601,9 @@ index e2b4e0396af8..97983b041e9d 100644 const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +#endif - int sched_thermal_decay_shift; static int __init setup_sched_thermal_decay_shift(char *str) -@@ -127,8 +136,12 @@ int __weak arch_asym_cpu_priority(int cpu) + { +@@ -121,8 +130,12 @@ int __weak arch_asym_cpu_priority(int cpu) * * (default: 5 msec, units: microseconds) */ @@ -9958,10 +11617,10 @@ index e2b4e0396af8..97983b041e9d 100644 #ifdef CONFIG_NUMA_BALANCING /* Restrict the NUMA promotion throughput (MB/s) for each target node. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 001fe047bd5d..ed5c758c7368 100644 +index ef20c61004eb..10c1caff5e06 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h -@@ -2542,7 +2542,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); +@@ -2544,7 +2544,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); @@ -9971,19 +11630,19 @@ index 001fe047bd5d..ed5c758c7368 100644 #else #define SCHED_NR_MIGRATE_BREAK 32 diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index 157f7ce2942d..aa55ebba2ec3 100644 +index e0b917328cf9..e70ae9c11dea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c -@@ -95,6 +95,9 @@ EXPORT_SYMBOL_GPL(sysctl_long_vals); - #ifdef CONFIG_PERF_EVENTS - static const int six_hundred_forty_kb = 640 * 1024; +@@ -80,6 +80,9 @@ + #ifdef CONFIG_RT_MUTEXES + #include #endif +#ifdef CONFIG_USER_NS +#include +#endif - - static const int ngroups_max = NGROUPS_MAX; + /* shared constants to be used in various sysctls */ + const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; @@ -1623,6 +1626,15 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, @@ -10000,49 +11659,8 @@ index 157f7ce2942d..aa55ebba2ec3 100644 #ifdef CONFIG_PROC_SYSCTL { .procname = "tainted", -@@ -2204,6 +2216,40 @@ static struct ctl_table vm_table[] = { - .extra1 = SYSCTL_ZERO, - }, - #endif -+ { -+ .procname = "workingset_protection", -+ .data = &sysctl_workingset_protection, -+ .maxlen = sizeof(bool), -+ .mode = 0644, -+ .proc_handler = &proc_dobool, -+ }, -+ { -+ .procname = "anon_min_ratio", -+ .data = &sysctl_anon_min_ratio, -+ .maxlen = sizeof(u8), -+ .mode = 0644, -+ .proc_handler = &vm_workingset_protection_update_handler, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE_HUNDRED, -+ }, -+ { -+ .procname = "clean_low_ratio", -+ .data = &sysctl_clean_low_ratio, -+ .maxlen = sizeof(u8), -+ .mode = 0644, -+ .proc_handler = &vm_workingset_protection_update_handler, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE_HUNDRED, -+ }, -+ { -+ .procname = "clean_min_ratio", -+ .data = &sysctl_clean_min_ratio, -+ .maxlen = sizeof(u8), -+ .mode = 0644, -+ .proc_handler = &vm_workingset_protection_update_handler, -+ .extra1 = SYSCTL_ZERO, -+ .extra2 = SYSCTL_ONE_HUNDRED, 
-+ }, - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c -index ce4d99df5f0e..8272e2e359f1 100644 +index 0b0b95418b16..c4b835b91fc0 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,6 +22,13 @@ @@ -10060,80 +11678,10 @@ index ce4d99df5f0e..8272e2e359f1 100644 static DEFINE_MUTEX(userns_state_mutex); diff --git a/mm/Kconfig b/mm/Kconfig -index ffc3a2ba3a8c..002f48b655de 100644 +index b4cb45255a54..8635b3b24739 100644 --- a/mm/Kconfig +++ b/mm/Kconfig -@@ -486,6 +486,69 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP - config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP - bool - -+config ANON_MIN_RATIO -+ int "Default value for vm.anon_min_ratio" -+ depends on SYSCTL -+ range 0 100 -+ default 15 -+ help -+ This option sets the default value for vm.anon_min_ratio sysctl knob. -+ -+ The vm.anon_min_ratio sysctl knob provides *hard* protection of -+ anonymous pages. The anonymous pages on the current node won't be -+ reclaimed under any conditions when their amount is below -+ vm.anon_min_ratio. This knob may be used to prevent excessive swap -+ thrashing when anonymous memory is low (for example, when memory is -+ going to be overfilled by compressed data of zram module). -+ -+ Setting this value too high (close to MemTotal) can result in -+ inability to swap and can lead to early OOM under memory pressure. -+ -+config CLEAN_LOW_RATIO -+ int "Default value for vm.clean_low_ratio" -+ depends on SYSCTL -+ range 0 100 -+ default 0 -+ help -+ This option sets the default value for vm.clean_low_ratio sysctl knob. -+ -+ The vm.clean_low_ratio sysctl knob provides *best-effort* -+ protection of clean file pages. The file pages on the current node -+ won't be reclaimed under memory pressure when the amount of clean file -+ pages is below vm.clean_low_ratio *unless* we threaten to OOM. -+ Protection of clean file pages using this knob may be used when -+ swapping is still possible to -+ - prevent disk I/O thrashing under memory pressure; -+ - improve performance in disk cache-bound tasks under memory -+ pressure. -+ -+ Setting it to a high value may result in a early eviction of anonymous -+ pages into the swap space by attempting to hold the protected amount -+ of clean file pages in memory. -+ -+config CLEAN_MIN_RATIO -+ int "Default value for vm.clean_min_ratio" -+ depends on SYSCTL -+ range 0 100 -+ default 15 -+ help -+ This option sets the default value for vm.clean_min_ratio sysctl knob. -+ -+ The vm.clean_min_ratio sysctl knob provides *hard* protection of -+ clean file pages. The file pages on the current node won't be -+ reclaimed under memory pressure when the amount of clean file pages is -+ below vm.clean_min_ratio. Hard protection of clean file pages using -+ this knob may be used to -+ - prevent disk I/O thrashing under memory pressure even with no free -+ swap space; -+ - improve performance in disk cache-bound tasks under memory -+ pressure; -+ - avoid high latency and prevent livelock in near-OOM conditions. -+ -+ Setting it to a high value may result in a early out-of-memory condition -+ due to the inability to reclaim the protected amount of clean file pages -+ when other types of pages cannot be reclaimed. 
-+ - config HAVE_MEMBLOCK_PHYS_MAP - bool - -@@ -630,7 +693,7 @@ config COMPACTION +@@ -613,7 +613,7 @@ config COMPACTION config COMPACT_UNEVICTABLE_DEFAULT int depends on COMPACTION @@ -10143,10 +11691,10 @@ index ffc3a2ba3a8c..002f48b655de 100644 # diff --git a/mm/compaction.c b/mm/compaction.c -index b961db601df4..91d627e8a93d 100644 +index 739b1bf3d637..3a4269c02fb2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c -@@ -1830,7 +1830,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE +@@ -1950,7 +1950,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE * aggressively the kernel should compact memory in the * background. It takes values in the range [0, 100]. */ @@ -10159,10 +11707,10 @@ index b961db601df4..91d627e8a93d 100644 static int __read_mostly sysctl_compact_memory; diff --git a/mm/huge_memory.c b/mm/huge_memory.c -index 94c958f7ebb5..2f9974f305ee 100644 +index 2120f7478e55..765ea6197e1e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c -@@ -62,7 +62,11 @@ unsigned long transparent_hugepage_flags __read_mostly = +@@ -63,7 +63,11 @@ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE (1<lock, flags); - for (i = 0; i < count; ++i) { -@@ -2134,6 +2139,18 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - if (unlikely(page == NULL)) - break; - -+ /* Reschedule and ease the contention on the lock if needed */ -+ if (i + 1 < count && ((can_resched && need_resched()) || -+ spin_needbreak(&zone->lock))) { -+ __mod_zone_page_state(zone, NR_FREE_PAGES, -+ -((i + 1 - last_mod) << order)); -+ last_mod = i + 1; -+ spin_unlock_irqrestore(&zone->lock, flags); -+ if (can_resched) -+ cond_resched(); -+ spin_lock_irqsave(&zone->lock, flags); -+ } -+ - /* - * Split buddy pages returned by expand() are received here in - * physical page order. 
The page is added to the tail of -@@ -2150,7 +2167,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, - -(1 << order)); - } - -- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); -+ __mod_zone_page_state(zone, NR_FREE_PAGES, -((i - last_mod) << order)); - spin_unlock_irqrestore(&zone->lock, flags); - - return i; diff --git a/mm/swap.c b/mm/swap.c -index cd8f0150ba3a..42c405a4f114 100644 +index 67786cb77130..6a91db6f3302 100644 --- a/mm/swap.c +++ b/mm/swap.c -@@ -1090,6 +1090,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) +@@ -1111,6 +1111,10 @@ void folio_batch_remove_exceptionals(struct folio_batch *fbatch) */ void __init swap_setup(void) { @@ -10295,7 +11781,7 @@ index cd8f0150ba3a..42c405a4f114 100644 unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT); /* Use a smaller cluster for small-memory machines */ -@@ -1101,4 +1105,5 @@ void __init swap_setup(void) +@@ -1122,4 +1126,5 @@ void __init swap_setup(void) * Right now other parts of the system means that we * _really_ don't want to cluster much more */ @@ -10318,38 +11804,10 @@ index bd5183dfd879..3a410f53a07c 100644 /* diff --git a/mm/vmscan.c b/mm/vmscan.c -index 4255619a1a31..62f42e92964f 100644 +index 2e34de9cd0d4..be9e40acc93b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c -@@ -133,6 +133,15 @@ struct scan_control { - /* The file folios on the current node are dangerously low */ - unsigned int file_is_tiny:1; - -+ /* The anonymous pages on the current node are below vm.anon_min_ratio */ -+ unsigned int anon_below_min:1; -+ -+ /* The clean file pages on the current node are below vm.clean_low_ratio */ -+ unsigned int clean_below_low:1; -+ -+ /* The clean file pages on the current node are below vm.clean_min_ratio */ -+ unsigned int clean_below_min:1; -+ - /* Always discard instead of demoting to lower tier memory */ - unsigned int no_demotion:1; - -@@ -182,10 +191,23 @@ struct scan_control { - #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) - #endif - -+bool sysctl_workingset_protection __read_mostly = true; -+u8 sysctl_anon_min_ratio __read_mostly = CONFIG_ANON_MIN_RATIO; -+u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO; -+u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO; -+static u64 sysctl_anon_min_ratio_kb __read_mostly = 0; -+static u64 sysctl_clean_low_ratio_kb __read_mostly = 0; -+static u64 sysctl_clean_min_ratio_kb __read_mostly = 0; -+static u64 workingset_protection_prev_totalram __read_mostly = 0; -+ +@@ -191,7 +191,11 @@ struct scan_control { /* * From 0 .. 200. Higher means more swappy. */ @@ -10361,56 +11819,7 @@ index 4255619a1a31..62f42e92964f 100644 #ifdef CONFIG_MEMCG -@@ -1052,6 +1074,9 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, - folio_mapped(folio) && folio_test_referenced(folio)) - goto keep_locked; - -+ if (folio_is_file_lru(folio) ? sc->clean_below_min : sc->anon_below_min) -+ goto keep_locked; -+ - /* - * The number of dirty pages determines if a node is marked - * reclaim_congested. kswapd will stall and start writing -@@ -2353,6 +2378,23 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - goto out; - } - -+ /* -+ * Force-scan the other type if anon/clean pages is -+ * under vm.{anon,clean}_{low,min}_ratio, respectively. 
-+ */ -+ if (sc->clean_below_min) { -+ scan_balance = SCAN_ANON; -+ goto out; -+ } -+ if (sc->anon_below_min) { -+ scan_balance = SCAN_FILE; -+ goto out; -+ } -+ if (sc->clean_below_low) { -+ scan_balance = SCAN_ANON; -+ goto out; -+ } -+ - /* - * Do not apply any pressure balancing cleverness when the - * system is close to OOM, scan both anon and file equally -@@ -2515,6 +2557,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - BUG(); - } - -+ /* -+ * Hard protection of the working set. -+ * Don't reclaim anon/file pages when the amount is -+ * below the watermark of the same type. -+ */ -+ if (file ? sc->clean_below_min : sc->anon_below_min) -+ scan = 0; -+ - nr[lru] = scan; - } - } -@@ -3922,7 +3972,28 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc +@@ -3949,7 +3953,11 @@ static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc } /* to protect the working set of the last N jiffies */ @@ -10419,824 +11828,8216 @@ index 4255619a1a31..62f42e92964f 100644 +#else static unsigned long lru_gen_min_ttl __read_mostly; +#endif -+ -+static void do_invoke_oom(struct scan_control *sc, bool try_memcg) { -+ struct oom_control oc = { -+ .gfp_mask = sc->gfp_mask, -+ .order = sc->order, -+ }; -+ -+ if (try_memcg && mem_cgroup_oom_synchronize(true)) -+ return; -+ -+ if (!mutex_trylock(&oom_lock)) -+ return; -+ out_of_memory(&oc); -+ mutex_unlock(&oom_lock); -+} -+#define invoke_oom(sc) do_invoke_oom(sc, true) -+#define invoke_oom_nomemcg(sc) do_invoke_oom(sc, false) static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) { -@@ -3952,14 +4023,96 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - * younger than min_ttl. However, another possibility is all memcgs are - * either too small or below min. - */ -- if (mutex_trylock(&oom_lock)) { -- struct oom_control oc = { -- .gfp_mask = sc->gfp_mask, -- }; -+ invoke_oom_nomemcg(sc); -+} -+ -+int vm_workingset_protection_update_handler(struct ctl_table *table, int write, -+ void __user *buffer, size_t *lenp, loff_t *ppos) -+{ -+ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); -+ if (ret || !write) -+ return ret; -+ -+ workingset_protection_prev_totalram = 0; -+ -+ return 0; -+} -+ -+static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) -+{ -+ unsigned long node_mem_total; -+ struct sysinfo i; -+ -+ if (!(sysctl_workingset_protection)) { -+ sc->anon_below_min = 0; -+ sc->clean_below_low = 0; -+ sc->clean_below_min = 0; -+ return; -+ } -+ -+ if (likely(sysctl_anon_min_ratio || -+ sysctl_clean_low_ratio || -+ sysctl_clean_min_ratio)) { -+#ifdef CONFIG_NUMA -+ si_meminfo_node(&i, pgdat->node_id); -+#else //CONFIG_NUMA -+ si_meminfo(&i); -+#endif //CONFIG_NUMA -+ node_mem_total = i.totalram; -+ -+ if (unlikely(workingset_protection_prev_totalram != node_mem_total)) { -+ sysctl_anon_min_ratio_kb = -+ node_mem_total * sysctl_anon_min_ratio / 100; -+ sysctl_clean_low_ratio_kb = -+ node_mem_total * sysctl_clean_low_ratio / 100; -+ sysctl_clean_min_ratio_kb = -+ node_mem_total * sysctl_clean_min_ratio / 100; -+ workingset_protection_prev_totalram = node_mem_total; -+ } -+ } - -- out_of_memory(&oc); -+ /* -+ * Check the number of anonymous pages to protect them from -+ * reclaiming if their amount is below the specified. 
-+ */ -+ if (sysctl_anon_min_ratio) { -+ unsigned long reclaimable_anon; -+ -+ reclaimable_anon = -+ node_page_state(pgdat, NR_ACTIVE_ANON) + -+ node_page_state(pgdat, NR_INACTIVE_ANON) + -+ node_page_state(pgdat, NR_ISOLATED_ANON); -+ -+ sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb; -+ } else -+ sc->anon_below_min = 0; -+ -+ /* -+ * Check the number of clean file pages to protect them from -+ * reclaiming if their amount is below the specified. -+ */ -+ if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) { -+ unsigned long reclaimable_file, dirty, clean; -+ -+ reclaimable_file = -+ node_page_state(pgdat, NR_ACTIVE_FILE) + -+ node_page_state(pgdat, NR_INACTIVE_FILE) + -+ node_page_state(pgdat, NR_ISOLATED_FILE); -+ dirty = node_page_state(pgdat, NR_FILE_DIRTY); -+ /* -+ * node_page_state() sum can go out of sync since -+ * all the values are not read at once. -+ */ -+ if (likely(reclaimable_file > dirty)) -+ clean = reclaimable_file - dirty; -+ else -+ clean = 0; - -- mutex_unlock(&oom_lock); -+ sc->clean_below_low = clean < sysctl_clean_low_ratio_kb; -+ sc->clean_below_min = clean < sysctl_clean_min_ratio_kb; -+ } else { -+ sc->clean_below_low = 0; -+ sc->clean_below_min = 0; - } - } - -@@ -4462,6 +4615,12 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw - */ - if (!swappiness) - type = LRU_GEN_FILE; -+ else if (sc->clean_below_min) -+ type = LRU_GEN_ANON; -+ else if (sc->anon_below_min) -+ type = LRU_GEN_FILE; -+ else if (sc->clean_below_low) -+ type = LRU_GEN_ANON; - else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) - type = LRU_GEN_ANON; - else if (swappiness == 1) -@@ -4471,7 +4630,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw - else - type = get_type_to_scan(lruvec, swappiness, &tier); - -- for (i = !swappiness; i < ANON_AND_FILE; i++) { -+ for (i = 0; i < ANON_AND_FILE; i++) { - if (tier < 0) - tier = get_tier_idx(lruvec, type); - -@@ -4749,6 +4908,7 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - -+ prepare_workingset_protection(pgdat, sc); - mem_cgroup_calculate_protection(NULL, memcg); - - if (mem_cgroup_below_min(NULL, memcg)) -@@ -5899,6 +6059,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - - prepare_scan_control(pgdat, sc); - -+ prepare_workingset_protection(pgdat, sc); -+ - shrink_node_memcgs(pgdat, sc); - - flush_reclaim_state(sc); -@@ -5987,6 +6149,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - */ - if (reclaimable) - pgdat->kswapd_failures = 0; -+ else if (sc->clean_below_min && !sc->priority) -+ invoke_oom(sc); - } - - /* -- -2.44.0 +2.46.0.rc1 -From 3719b448ce6ae6e6df7f49a99ef30eeb0bf2117d Mon Sep 17 00:00:00 2001 +From e91af07ae5c96cff206bbbe52c16edb871050bc9 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 3 Apr 2024 17:43:37 +0200 -Subject: [PATCH] Revert "le9uo" +Date: Mon, 15 Jul 2024 13:24:26 +0200 +Subject: [PATCH 05/11] crypto -This reverts commit 9bb31a68ef456524c4370323e1c19b07fc0633df. 
+Signed-off-by: Peter Jung --- - Documentation/admin-guide/sysctl/vm.rst | 72 ---------- - include/linux/mm.h | 8 -- - kernel/sysctl.c | 34 ----- - mm/Kconfig | 63 --------- - mm/mm_init.c | 1 - - mm/vmscan.c | 170 +----------------------- - 6 files changed, 7 insertions(+), 341 deletions(-) + arch/x86/crypto/Kconfig | 1 + + arch/x86/crypto/Makefile | 8 +- + arch/x86/crypto/aes-gcm-aesni-x86_64.S | 1128 +++++++++ + arch/x86/crypto/aes-gcm-avx10-x86_64.S | 1222 ++++++++++ + arch/x86/crypto/aesni-intel_asm.S | 1503 +----------- + arch/x86/crypto/aesni-intel_avx-x86_64.S | 2804 ---------------------- + arch/x86/crypto/aesni-intel_glue.c | 1269 ++++++---- + 7 files changed, 3125 insertions(+), 4810 deletions(-) + create mode 100644 arch/x86/crypto/aes-gcm-aesni-x86_64.S + create mode 100644 arch/x86/crypto/aes-gcm-avx10-x86_64.S + delete mode 100644 arch/x86/crypto/aesni-intel_avx-x86_64.S -diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst -index 468ae7dec1e1..c59889de122b 100644 ---- a/Documentation/admin-guide/sysctl/vm.rst -+++ b/Documentation/admin-guide/sysctl/vm.rst -@@ -25,9 +25,6 @@ files can be found in mm/swap.c. - Currently, these files are in /proc/sys/vm: +diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig +index c9e59589a1ce..24875e6295f2 100644 +--- a/arch/x86/crypto/Kconfig ++++ b/arch/x86/crypto/Kconfig +@@ -18,6 +18,7 @@ config CRYPTO_AES_NI_INTEL + depends on X86 + select CRYPTO_AEAD + select CRYPTO_LIB_AES ++ select CRYPTO_LIB_GF128MUL + select CRYPTO_ALGAPI + select CRYPTO_SKCIPHER + select CRYPTO_SIMD +diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile +index 9c5ce5613738..53b4a277809e 100644 +--- a/arch/x86/crypto/Makefile ++++ b/arch/x86/crypto/Makefile +@@ -48,8 +48,12 @@ chacha-x86_64-$(CONFIG_AS_AVX512) += chacha-avx512vl-x86_64.o - - admin_reserve_kbytes --- anon_min_ratio --- clean_low_ratio --- clean_min_ratio - - compact_memory - - compaction_proactiveness - - compact_unevictable_allowed -@@ -109,67 +106,6 @@ On x86_64 this is about 128MB. - Changing this takes effect whenever an application requests memory. + obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o + aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o +-aesni-intel-$(CONFIG_64BIT) += aesni-intel_avx-x86_64.o \ +- aes_ctrby8_avx-x86_64.o aes-xts-avx-x86_64.o ++aesni-intel-$(CONFIG_64BIT) += aes_ctrby8_avx-x86_64.o \ ++ aes-gcm-aesni-x86_64.o \ ++ aes-xts-avx-x86_64.o ++ifeq ($(CONFIG_AS_VAES)$(CONFIG_AS_VPCLMULQDQ),yy) ++aesni-intel-$(CONFIG_64BIT) += aes-gcm-avx10-x86_64.o ++endif + obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o + sha1-ssse3-y := sha1_avx2_x86_64_asm.o sha1_ssse3_asm.o sha1_ssse3_glue.o +diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S +new file mode 100644 +index 000000000000..45940e2883a0 +--- /dev/null ++++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S +@@ -0,0 +1,1128 @@ ++/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ ++// ++// AES-NI optimized AES-GCM for x86_64 ++// ++// Copyright 2024 Google LLC ++// ++// Author: Eric Biggers ++// ++//------------------------------------------------------------------------------ ++// ++// This file is dual-licensed, meaning that you can use it under your choice of ++// either of the following two licenses: ++// ++// Licensed under the Apache License 2.0 (the "License"). 
You may obtain a copy ++// of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++// ++// or ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions are met: ++// ++// 1. Redistributions of source code must retain the above copyright notice, ++// this list of conditions and the following disclaimer. ++// ++// 2. Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE ++// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++// POSSIBILITY OF SUCH DAMAGE. ++// ++//------------------------------------------------------------------------------ ++// ++// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that ++// support the original set of AES instructions, i.e. AES-NI. Two ++// implementations are provided, one that uses AVX and one that doesn't. They ++// are very similar, being generated by the same macros. The only difference is ++// that the AVX implementation takes advantage of VEX-coded instructions in some ++// places to avoid some 'movdqu' and 'movdqa' instructions. The AVX ++// implementation does *not* use 256-bit vectors, as AES is not supported on ++// 256-bit vectors until the VAES feature (which this file doesn't target). ++// ++// The specific CPU feature prerequisites are AES-NI and PCLMULQDQ, plus SSE4.1 ++// for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems ++// there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) ++// ++// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is ++// more thoroughly commented. This file has the following notable changes: ++// ++// - The vector length is fixed at 128-bit, i.e. xmm registers. This means ++// there is only one AES block (and GHASH block) per register. ++// ++// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of ++// 32. We work around this by being much more careful about using ++// registers, relying heavily on loads to load values as they are needed. ++// ++// - Masking is not available either. We work around this by implementing ++// partial block loads and stores using overlapping scalar loads and stores ++// combined with shifts and SSE4.1 insertion and extraction instructions. 
++// ++// - The main loop is organized differently due to the different design ++// constraints. First, with just one AES block per SIMD register, on some ++// CPUs 4 registers don't saturate the 'aesenc' throughput. We therefore ++// do an 8-register wide loop. Considering that and the fact that we have ++// just 16 SIMD registers to work with, it's not feasible to cache AES ++// round keys and GHASH key powers in registers across loop iterations. ++// That's not ideal, but also not actually that bad, since loads can run in ++// parallel with other instructions. Significantly, this also makes it ++// possible to roll up the inner loops, relying on hardware loop unrolling ++// instead of software loop unrolling, greatly reducing code size. ++// ++// - We implement the GHASH multiplications in the main loop using Karatsuba ++// multiplication instead of schoolbook multiplication. This saves one ++// pclmulqdq instruction per block, at the cost of one 64-bit load, one ++// pshufd, and 0.25 pxors per block. (This is without the three-argument ++// XOR support that would be provided by AVX512 / AVX10, which would be ++// more beneficial to schoolbook than Karatsuba.) ++// ++// As a rough approximation, we can assume that Karatsuba multiplication is ++// faster than schoolbook multiplication in this context if one pshufd and ++// 0.25 pxors are cheaper than a pclmulqdq. (We assume that the 64-bit ++// load is "free" due to running in parallel with arithmetic instructions.) ++// This is true on AMD CPUs, including all that support pclmulqdq up to at ++// least Zen 3. It's also true on older Intel CPUs: Westmere through ++// Haswell on the Core side, and Silvermont through Goldmont Plus on the ++// low-power side. On some of these CPUs, pclmulqdq is quite slow, and the ++// benefit of Karatsuba should be substantial. On newer Intel CPUs, ++// schoolbook multiplication should be faster, but only marginally. ++// ++// Not all these CPUs were available to be tested. However, benchmarks on ++// available CPUs suggest that this approximation is plausible. Switching ++// to Karatsuba showed negligible change (< 1%) on Intel Broadwell, ++// Skylake, and Cascade Lake, but it improved AMD Zen 1-3 by 6-7%. ++// Considering that and the fact that Karatsuba should be even more ++// beneficial on older Intel CPUs, it seems like the right choice here. ++// ++// An additional 0.25 pclmulqdq per block (2 per 8 blocks) could be ++// saved by using a multiplication-less reduction method. We don't do that ++// because it would require a large number of shift and xor instructions, ++// making it less worthwhile and likely harmful on newer CPUs. ++// ++// It does make sense to sometimes use a different reduction optimization ++// that saves a pclmulqdq, though: precompute the hash key times x^64, and ++// multiply the low half of the data block by the hash key with the extra ++// factor of x^64. This eliminates one step of the reduction. However, ++// this is incompatible with Karatsuba multiplication. Therefore, for ++// multi-block processing we use Karatsuba multiplication with a regular ++// reduction. For single-block processing, we use the x^64 optimization. 
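To make the Karatsuba trade-off discussed above concrete, here is a small standalone C sketch (not part of the patch) of the identity being relied on: in carry-less arithmetic addition is XOR, so the middle term of the 128x128-bit product can be recovered from three multiplications instead of four. clmul64() is only a bit-by-bit stand-in for pclmulqdq, and the real code accumulates the three products into LO/MI/HI and reduces later; this sketch just checks the unreduced identity.

#include <assert.h>
#include <stdint.h>

/* Carry-less 64x64 -> 128-bit multiply, a plain-C stand-in for pclmulqdq. */
static __uint128_t clmul64(uint64_t a, uint64_t b)
{
	__uint128_t acc = 0;

	for (int i = 0; i < 64; i++)
		if ((b >> i) & 1)
			acc ^= (__uint128_t)a << i;
	return acc;
}

int main(void)
{
	uint64_t a_lo = 0x0123456789abcdefULL, a_hi = 0xfedcba9876543210ULL;
	uint64_t b_lo = 0x0f1e2d3c4b5a6978ULL, b_hi = 0x8796a5b4c3d2e1f0ULL;

	/* Schoolbook: the middle term needs two extra multiplies. */
	__uint128_t lo = clmul64(a_lo, b_lo);
	__uint128_t hi = clmul64(a_hi, b_hi);
	__uint128_t mi_schoolbook = clmul64(a_lo, b_hi) ^ clmul64(a_hi, b_lo);

	/* Karatsuba: one multiply plus XORs, since addition carries nothing. */
	__uint128_t mi_karatsuba = clmul64(a_lo ^ a_hi, b_lo ^ b_hi) ^ lo ^ hi;

	assert(mi_schoolbook == mi_karatsuba);
	return 0;
}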
++ ++#include ++ ++.section .rodata ++.p2align 4 ++.Lbswap_mask: ++ .octa 0x000102030405060708090a0b0c0d0e0f ++.Lgfpoly: ++ .quad 0xc200000000000000 ++.Lone: ++ .quad 1 ++.Lgfpoly_and_internal_carrybit: ++ .octa 0xc2000000000000010000000000000001 ++ // Loading 16 bytes from '.Lzeropad_mask + 16 - len' produces a mask of ++ // 'len' 0xff bytes and the rest zeroes. ++.Lzeropad_mask: ++ .octa 0xffffffffffffffffffffffffffffffff ++ .octa 0 ++ ++// Offsets in struct aes_gcm_key_aesni ++#define OFFSETOF_AESKEYLEN 480 ++#define OFFSETOF_H_POWERS 496 ++#define OFFSETOF_H_POWERS_XORED 624 ++#define OFFSETOF_H_TIMES_X64 688 ++ ++.text ++ ++// Do a vpclmulqdq, or fall back to a movdqa and a pclmulqdq. The fallback ++// assumes that all operands are distinct and that any mem operand is aligned. ++.macro _vpclmulqdq imm, src1, src2, dst ++.if USE_AVX ++ vpclmulqdq \imm, \src1, \src2, \dst ++.else ++ movdqa \src2, \dst ++ pclmulqdq \imm, \src1, \dst ++.endif ++.endm ++ ++// Do a vpshufb, or fall back to a movdqa and a pshufb. The fallback assumes ++// that all operands are distinct and that any mem operand is aligned. ++.macro _vpshufb src1, src2, dst ++.if USE_AVX ++ vpshufb \src1, \src2, \dst ++.else ++ movdqa \src2, \dst ++ pshufb \src1, \dst ++.endif ++.endm ++ ++// Do a vpand, or fall back to a movdqu and a pand. The fallback assumes that ++// all operands are distinct. ++.macro _vpand src1, src2, dst ++.if USE_AVX ++ vpand \src1, \src2, \dst ++.else ++ movdqu \src1, \dst ++ pand \src2, \dst ++.endif ++.endm ++ ++// XOR the unaligned memory operand \mem into the xmm register \reg. \tmp must ++// be a temporary xmm register. ++.macro _xor_mem_to_reg mem, reg, tmp ++.if USE_AVX ++ vpxor \mem, \reg, \reg ++.else ++ movdqu \mem, \tmp ++ pxor \tmp, \reg ++.endif ++.endm ++ ++// Test the unaligned memory operand \mem against the xmm register \reg. \tmp ++// must be a temporary xmm register. ++.macro _test_mem mem, reg, tmp ++.if USE_AVX ++ vptest \mem, \reg ++.else ++ movdqu \mem, \tmp ++ ptest \tmp, \reg ++.endif ++.endm ++ ++// Load 1 <= %ecx <= 15 bytes from the pointer \src into the xmm register \dst ++// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. ++.macro _load_partial_block src, dst, tmp64, tmp32 ++ sub $8, %ecx // LEN - 8 ++ jle .Lle8\@ ++ ++ // Load 9 <= LEN <= 15 bytes. ++ movq (\src), \dst // Load first 8 bytes ++ mov (\src, %rcx), %rax // Load last 8 bytes ++ neg %ecx ++ shl $3, %ecx ++ shr %cl, %rax // Discard overlapping bytes ++ pinsrq $1, %rax, \dst ++ jmp .Ldone\@ ++ ++.Lle8\@: ++ add $4, %ecx // LEN - 4 ++ jl .Llt4\@ ++ ++ // Load 4 <= LEN <= 8 bytes. ++ mov (\src), %eax // Load first 4 bytes ++ mov (\src, %rcx), \tmp32 // Load last 4 bytes ++ jmp .Lcombine\@ ++ ++.Llt4\@: ++ // Load 1 <= LEN <= 3 bytes. ++ add $2, %ecx // LEN - 2 ++ movzbl (\src), %eax // Load first byte ++ jl .Lmovq\@ ++ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes ++.Lcombine\@: ++ shl $3, %ecx ++ shl %cl, \tmp64 ++ or \tmp64, %rax // Combine the two parts ++.Lmovq\@: ++ movq %rax, \dst ++.Ldone\@: ++.endm ++ ++// Store 1 <= %ecx <= 15 bytes from the xmm register \src to the pointer \dst. ++// Clobbers %rax, %rcx, and %rsi. ++.macro _store_partial_block src, dst ++ sub $8, %ecx // LEN - 8 ++ jl .Llt8\@ ++ ++ // Store 8 <= LEN <= 15 bytes. 
++ pextrq $1, \src, %rax ++ mov %ecx, %esi ++ shl $3, %ecx ++ ror %cl, %rax ++ mov %rax, (\dst, %rsi) // Store last LEN - 8 bytes ++ movq \src, (\dst) // Store first 8 bytes ++ jmp .Ldone\@ ++ ++.Llt8\@: ++ add $4, %ecx // LEN - 4 ++ jl .Llt4\@ ++ ++ // Store 4 <= LEN <= 7 bytes. ++ pextrd $1, \src, %eax ++ mov %ecx, %esi ++ shl $3, %ecx ++ ror %cl, %eax ++ mov %eax, (\dst, %rsi) // Store last LEN - 4 bytes ++ movd \src, (\dst) // Store first 4 bytes ++ jmp .Ldone\@ ++ ++.Llt4\@: ++ // Store 1 <= LEN <= 3 bytes. ++ pextrb $0, \src, 0(\dst) ++ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? ++ jl .Ldone\@ ++ pextrb $1, \src, 1(\dst) ++ je .Ldone\@ ++ pextrb $2, \src, 2(\dst) ++.Ldone\@: ++.endm ++ ++// Do one step of GHASH-multiplying \a by \b and storing the reduced product in ++// \b. To complete all steps, this must be invoked with \i=0 through \i=9. ++// \a_times_x64 must contain \a * x^64 in reduced form, \gfpoly must contain the ++// .Lgfpoly constant, and \t0-\t1 must be temporary registers. ++.macro _ghash_mul_step i, a, a_times_x64, b, gfpoly, t0, t1 ++ ++ // MI = (a_L * b_H) + ((a*x^64)_L * b_L) ++.if \i == 0 ++ _vpclmulqdq $0x01, \a, \b, \t0 ++.elseif \i == 1 ++ _vpclmulqdq $0x00, \a_times_x64, \b, \t1 ++.elseif \i == 2 ++ pxor \t1, \t0 ++ ++ // HI = (a_H * b_H) + ((a*x^64)_H * b_L) ++.elseif \i == 3 ++ _vpclmulqdq $0x11, \a, \b, \t1 ++.elseif \i == 4 ++ pclmulqdq $0x10, \a_times_x64, \b ++.elseif \i == 5 ++ pxor \t1, \b ++.elseif \i == 6 ++ ++ // Fold MI into HI. ++ pshufd $0x4e, \t0, \t1 // Swap halves of MI ++.elseif \i == 7 ++ pclmulqdq $0x00, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) ++.elseif \i == 8 ++ pxor \t1, \b ++.elseif \i == 9 ++ pxor \t0, \b ++.endif ++.endm ++ ++// GHASH-multiply \a by \b and store the reduced product in \b. ++// See _ghash_mul_step for details. ++.macro _ghash_mul a, a_times_x64, b, gfpoly, t0, t1 ++.irp i, 0,1,2,3,4,5,6,7,8,9 ++ _ghash_mul_step \i, \a, \a_times_x64, \b, \gfpoly, \t0, \t1 ++.endr ++.endm ++ ++// GHASH-multiply \a by \b and add the unreduced product to \lo, \mi, and \hi. ++// This does Karatsuba multiplication and must be paired with _ghash_reduce. On ++// the first call, \lo, \mi, and \hi must be zero. \a_xored must contain the ++// two halves of \a XOR'd together, i.e. a_L + a_H. \b is clobbered. ++.macro _ghash_mul_noreduce a, a_xored, b, lo, mi, hi, t0 ++ ++ // LO += a_L * b_L ++ _vpclmulqdq $0x00, \a, \b, \t0 ++ pxor \t0, \lo ++ ++ // b_L + b_H ++ pshufd $0x4e, \b, \t0 ++ pxor \b, \t0 ++ ++ // HI += a_H * b_H ++ pclmulqdq $0x11, \a, \b ++ pxor \b, \hi ++ ++ // MI += (a_L + a_H) * (b_L + b_H) ++ pclmulqdq $0x00, \a_xored, \t0 ++ pxor \t0, \mi ++.endm ++ ++// Reduce the product from \lo, \mi, and \hi, and store the result in \dst. ++// This assumes that _ghash_mul_noreduce was used. ++.macro _ghash_reduce lo, mi, hi, dst, t0 ++ ++ movq .Lgfpoly(%rip), \t0 ++ ++ // MI += LO + HI (needed because we used Karatsuba multiplication) ++ pxor \lo, \mi ++ pxor \hi, \mi ++ ++ // Fold LO into MI. ++ pshufd $0x4e, \lo, \dst ++ pclmulqdq $0x00, \t0, \lo ++ pxor \dst, \mi ++ pxor \lo, \mi ++ ++ // Fold MI into HI. ++ pshufd $0x4e, \mi, \dst ++ pclmulqdq $0x00, \t0, \mi ++ pxor \hi, \dst ++ pxor \mi, \dst ++.endm ++ ++// Do the first step of the GHASH update of a set of 8 ciphertext blocks. 
++// ++// The whole GHASH update does: ++// ++// GHASH_ACC = (blk0+GHASH_ACC)*H^8 + blk1*H^7 + blk2*H^6 + blk3*H^5 + ++// blk4*H^4 + blk5*H^3 + blk6*H^2 + blk7*H^1 ++// ++// This macro just does the first step: it does the unreduced multiplication ++// (blk0+GHASH_ACC)*H^8 and starts gathering the unreduced product in the xmm ++// registers LO, MI, and GHASH_ACC a.k.a. HI. It also zero-initializes the ++// inner block counter in %rax, which is a value that counts up by 8 for each ++// block in the set of 8 and is used later to index by 8*blknum and 16*blknum. ++// ++// To reduce the number of pclmulqdq instructions required, both this macro and ++// _ghash_update_continue_8x use Karatsuba multiplication instead of schoolbook ++// multiplication. See the file comment for more details about this choice. ++// ++// Both macros expect the ciphertext blocks blk[0-7] to be available at DST if ++// encrypting, or SRC if decrypting. They also expect the precomputed hash key ++// powers H^i and their XOR'd-together halves to be available in the struct ++// pointed to by KEY. Both macros clobber TMP[0-2]. ++.macro _ghash_update_begin_8x enc ++ ++ // Initialize the inner block counter. ++ xor %eax, %eax ++ ++ // Load the highest hash key power, H^8. ++ movdqa OFFSETOF_H_POWERS(KEY), TMP0 ++ ++ // Load the first ciphertext block and byte-reflect it. ++.if \enc ++ movdqu (DST), TMP1 ++.else ++ movdqu (SRC), TMP1 ++.endif ++ pshufb BSWAP_MASK, TMP1 ++ ++ // Add the GHASH accumulator to the ciphertext block to get the block ++ // 'b' that needs to be multiplied with the hash key power 'a'. ++ pxor TMP1, GHASH_ACC ++ ++ // b_L + b_H ++ pshufd $0x4e, GHASH_ACC, MI ++ pxor GHASH_ACC, MI ++ ++ // LO = a_L * b_L ++ _vpclmulqdq $0x00, TMP0, GHASH_ACC, LO ++ ++ // HI = a_H * b_H ++ pclmulqdq $0x11, TMP0, GHASH_ACC ++ ++ // MI = (a_L + a_H) * (b_L + b_H) ++ pclmulqdq $0x00, OFFSETOF_H_POWERS_XORED(KEY), MI ++.endm ++ ++// Continue the GHASH update of 8 ciphertext blocks as described above by doing ++// an unreduced multiplication of the next ciphertext block by the next lowest ++// key power and accumulating the result into LO, MI, and GHASH_ACC a.k.a. HI. ++.macro _ghash_update_continue_8x enc ++ add $8, %eax ++ ++ // Load the next lowest key power. ++ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), TMP0 ++ ++ // Load the next ciphertext block and byte-reflect it. ++.if \enc ++ movdqu (DST,%rax,2), TMP1 ++.else ++ movdqu (SRC,%rax,2), TMP1 ++.endif ++ pshufb BSWAP_MASK, TMP1 ++ ++ // LO += a_L * b_L ++ _vpclmulqdq $0x00, TMP0, TMP1, TMP2 ++ pxor TMP2, LO ++ ++ // b_L + b_H ++ pshufd $0x4e, TMP1, TMP2 ++ pxor TMP1, TMP2 ++ ++ // HI += a_H * b_H ++ pclmulqdq $0x11, TMP0, TMP1 ++ pxor TMP1, GHASH_ACC ++ ++ // MI += (a_L + a_H) * (b_L + b_H) ++ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), TMP1 ++ pclmulqdq $0x00, TMP1, TMP2 ++ pxor TMP2, MI ++.endm ++ ++// Reduce LO, MI, and GHASH_ACC a.k.a. HI into GHASH_ACC. This is similar to ++// _ghash_reduce, but it's hardcoded to use the registers of the main loop and ++// it uses the same register for HI and the destination. It's also divided into ++// two steps. TMP1 must be preserved across steps. ++// ++// One pshufd could be saved by shuffling MI and XOR'ing LO into it, instead of ++// shuffling LO, XOR'ing LO into MI, and shuffling MI. However, this would ++// increase the critical path length, and it seems to slightly hurt performance. 
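The 8-block update formula documented above is just the one-block-at-a-time GHASH fold with the multiplications by H deferred and batched against precomputed key powers. A small Python check of that equivalence (an editorial sketch, not part of the patch; gf128_mul() is the usual bit-reflected reference multiply from the GCM spec, and the representation details do not matter for this identity):

    import random

    def gf128_mul(x, y):
        # GHASH multiplication in GF(2^128), GCM spec bit order.
        R = 0xE1000000000000000000000000000000
        z, v = 0, x
        for i in range(128):
            if (y >> (127 - i)) & 1:
                z ^= v
            v = (v >> 1) ^ R if v & 1 else v >> 1
        return z

    h = random.getrandbits(128)
    acc = random.getrandbits(128)
    blocks = [random.getrandbits(128) for _ in range(8)]

    # One block at a time: acc = (acc + blk) * H for each block.
    folded = acc
    for blk in blocks:
        folded = gf128_mul(folded ^ blk, h)

    # Batched form: (blk0 + acc)*H^8 + blk1*H^7 + ... + blk7*H^1.
    h_pow = [None] * 9
    h_pow[1] = h
    for i in range(2, 9):
        h_pow[i] = gf128_mul(h_pow[i - 1], h)
    batched = gf128_mul(blocks[0] ^ acc, h_pow[8])
    for i in range(1, 8):
        batched ^= gf128_mul(blocks[i], h_pow[8 - i])

    assert folded == batched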
++.macro _ghash_update_end_8x_step i ++.if \i == 0 ++ movq .Lgfpoly(%rip), TMP1 ++ pxor LO, MI ++ pxor GHASH_ACC, MI ++ pshufd $0x4e, LO, TMP2 ++ pclmulqdq $0x00, TMP1, LO ++ pxor TMP2, MI ++ pxor LO, MI ++.elseif \i == 1 ++ pshufd $0x4e, MI, TMP2 ++ pclmulqdq $0x00, TMP1, MI ++ pxor TMP2, GHASH_ACC ++ pxor MI, GHASH_ACC ++.endif ++.endm ++ ++// void aes_gcm_precompute_##suffix(struct aes_gcm_key_aesni *key); ++// ++// Given the expanded AES key, derive the GHASH subkey and initialize the GHASH ++// related fields in the key struct. ++.macro _aes_gcm_precompute ++ ++ // Function arguments ++ .set KEY, %rdi ++ ++ // Additional local variables. ++ // %xmm0-%xmm1 and %rax are used as temporaries. ++ .set RNDKEYLAST_PTR, %rsi ++ .set H_CUR, %xmm2 ++ .set H_POW1, %xmm3 // H^1 ++ .set H_POW1_X64, %xmm4 // H^1 * x^64 ++ .set GFPOLY, %xmm5 ++ ++ // Encrypt an all-zeroes block to get the raw hash subkey. ++ movl OFFSETOF_AESKEYLEN(KEY), %eax ++ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR ++ movdqa (KEY), H_POW1 // Zero-th round key XOR all-zeroes block ++ lea 16(KEY), %rax ++1: ++ aesenc (%rax), H_POW1 ++ add $16, %rax ++ cmp %rax, RNDKEYLAST_PTR ++ jne 1b ++ aesenclast (RNDKEYLAST_PTR), H_POW1 ++ ++ // Preprocess the raw hash subkey as needed to operate on GHASH's ++ // bit-reflected values directly: reflect its bytes, then multiply it by ++ // x^-1 (using the backwards interpretation of polynomial coefficients ++ // from the GCM spec) or equivalently x^1 (using the alternative, ++ // natural interpretation of polynomial coefficients). ++ pshufb .Lbswap_mask(%rip), H_POW1 ++ movdqa H_POW1, %xmm0 ++ pshufd $0xd3, %xmm0, %xmm0 ++ psrad $31, %xmm0 ++ paddq H_POW1, H_POW1 ++ pand .Lgfpoly_and_internal_carrybit(%rip), %xmm0 ++ pxor %xmm0, H_POW1 ++ ++ // Store H^1. ++ movdqa H_POW1, OFFSETOF_H_POWERS+7*16(KEY) ++ ++ // Compute and store H^1 * x^64. ++ movq .Lgfpoly(%rip), GFPOLY ++ pshufd $0x4e, H_POW1, %xmm0 ++ _vpclmulqdq $0x00, H_POW1, GFPOLY, H_POW1_X64 ++ pxor %xmm0, H_POW1_X64 ++ movdqa H_POW1_X64, OFFSETOF_H_TIMES_X64(KEY) ++ ++ // Compute and store the halves of H^1 XOR'd together. ++ pxor H_POW1, %xmm0 ++ movq %xmm0, OFFSETOF_H_POWERS_XORED+7*8(KEY) ++ ++ // Compute and store the remaining key powers H^2 through H^8. ++ movdqa H_POW1, H_CUR ++ mov $6*8, %eax ++.Lprecompute_next\@: ++ // Compute H^i = H^{i-1} * H^1. ++ _ghash_mul H_POW1, H_POW1_X64, H_CUR, GFPOLY, %xmm0, %xmm1 ++ // Store H^i. ++ movdqa H_CUR, OFFSETOF_H_POWERS(KEY,%rax,2) ++ // Compute and store the halves of H^i XOR'd together. ++ pshufd $0x4e, H_CUR, %xmm0 ++ pxor H_CUR, %xmm0 ++ movq %xmm0, OFFSETOF_H_POWERS_XORED(KEY,%rax) ++ sub $8, %eax ++ jge .Lprecompute_next\@ ++ ++ RET ++.endm ++ ++// void aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, ++// u8 ghash_acc[16], const u8 *aad, int aadlen); ++// ++// This function processes the AAD (Additional Authenticated Data) in GCM. ++// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the ++// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all ++// zeroes. |aadlen| must be a multiple of 16, except on the last call where it ++// can be any length. The caller must do any buffering needed to ensure this. ++.macro _aes_gcm_aad_update ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set GHASH_ACC_PTR, %rsi ++ .set AAD, %rdx ++ .set AADLEN, %ecx ++ // Note: _load_partial_block relies on AADLEN being in %ecx. ++ ++ // Additional local variables. ++ // %rax, %r10, and %xmm0-%xmm1 are used as temporary registers. 
++ .set BSWAP_MASK, %xmm2 ++ .set GHASH_ACC, %xmm3 ++ .set H_POW1, %xmm4 // H^1 ++ .set H_POW1_X64, %xmm5 // H^1 * x^64 ++ .set GFPOLY, %xmm6 ++ ++ movdqa .Lbswap_mask(%rip), BSWAP_MASK ++ movdqu (GHASH_ACC_PTR), GHASH_ACC ++ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 ++ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 ++ movq .Lgfpoly(%rip), GFPOLY ++ ++ // Process the AAD one full block at a time. ++ sub $16, AADLEN ++ jl .Laad_loop_1x_done\@ ++.Laad_loop_1x\@: ++ movdqu (AAD), %xmm0 ++ pshufb BSWAP_MASK, %xmm0 ++ pxor %xmm0, GHASH_ACC ++ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 ++ add $16, AAD ++ sub $16, AADLEN ++ jge .Laad_loop_1x\@ ++.Laad_loop_1x_done\@: ++ // Check whether there is a partial block at the end. ++ add $16, AADLEN ++ jz .Laad_done\@ ++ ++ // Process a partial block of length 1 <= AADLEN <= 15. ++ // _load_partial_block assumes that %ecx contains AADLEN. ++ _load_partial_block AAD, %xmm0, %r10, %r10d ++ pshufb BSWAP_MASK, %xmm0 ++ pxor %xmm0, GHASH_ACC ++ _ghash_mul H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm0, %xmm1 ++ ++.Laad_done\@: ++ movdqu GHASH_ACC, (GHASH_ACC_PTR) ++ RET ++.endm ++ ++// Increment LE_CTR eight times to generate eight little-endian counter blocks, ++// swap each to big-endian, and store them in AESDATA[0-7]. Also XOR them with ++// the zero-th AES round key. Clobbers TMP0 and TMP1. ++.macro _ctr_begin_8x ++ movq .Lone(%rip), TMP0 ++ movdqa (KEY), TMP1 // zero-th round key ++.irp i, 0,1,2,3,4,5,6,7 ++ _vpshufb BSWAP_MASK, LE_CTR, AESDATA\i ++ pxor TMP1, AESDATA\i ++ paddd TMP0, LE_CTR ++.endr ++.endm ++ ++// Do a non-last round of AES on AESDATA[0-7] using \round_key. ++.macro _aesenc_8x round_key ++.irp i, 0,1,2,3,4,5,6,7 ++ aesenc \round_key, AESDATA\i ++.endr ++.endm ++ ++// Do the last round of AES on AESDATA[0-7] using \round_key. ++.macro _aesenclast_8x round_key ++.irp i, 0,1,2,3,4,5,6,7 ++ aesenclast \round_key, AESDATA\i ++.endr ++.endm ++ ++// XOR eight blocks from SRC with the keystream blocks in AESDATA[0-7], and ++// store the result to DST. Clobbers TMP0. ++.macro _xor_data_8x ++.irp i, 0,1,2,3,4,5,6,7 ++ _xor_mem_to_reg \i*16(SRC), AESDATA\i, tmp=TMP0 ++.endr ++.irp i, 0,1,2,3,4,5,6,7 ++ movdqu AESDATA\i, \i*16(DST) ++.endr ++.endm ++ ++// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_aesni *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// const u8 *src, u8 *dst, int datalen); ++// ++// This macro generates a GCM encryption or decryption update function with the ++// above prototype (with \enc selecting which one). ++// ++// This function computes the next portion of the CTR keystream, XOR's it with ++// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted ++// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the ++// next |datalen| ciphertext bytes. ++// ++// |datalen| must be a multiple of 16, except on the last call where it can be ++// any length. The caller must do any buffering needed to ensure this. Both ++// in-place and out-of-place en/decryption are supported. ++// ++// |le_ctr| must give the current counter in little-endian format. For a new ++// message, the low word of the counter must be 2. This function loads the ++// counter from |le_ctr| and increments the loaded counter as needed, but it ++// does *not* store the updated counter back to |le_ctr|. The caller must ++// update |le_ctr| if any more data segments follow. Internally, only the low ++// 32-bit word of the counter is incremented, following the GCM standard. 
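As a reference for the counter handling described above: the counter is kept as four little-endian 32-bit words so it can be advanced with plain vector adds, only the low word is incremented (mod 2^32, per the GCM standard), and the byte reflection into the big-endian block fed to AES is what the pshufb with .Lbswap_mask accomplishes. A rough Python model (an editorial sketch, not part of the patch; the non-counter words below are arbitrary example values):

    import struct

    def next_le_ctr(le_ctr):
        # Advance the counter: only the low 32-bit word is incremented.
        w = list(le_ctr)
        w[0] = (w[0] + 1) & 0xffffffff
        return w

    def counter_block(le_ctr):
        # Byte-reflect the 16-byte little-endian counter into the big-endian
        # block that gets AES-encrypted (the pshufb with .Lbswap_mask).
        le_bytes = struct.pack("<4I", *le_ctr)
        return le_bytes[::-1]

    le_ctr = [2, 0xdeadbeef, 0x01234567, 0x89abcdef]  # low word is 2 for a new message
    blk = counter_block(le_ctr)
    assert blk[-4:] == struct.pack(">I", 2)  # counter sits in the last 4 bytes, big-endian
    le_ctr = next_le_ctr(le_ctr)
    assert le_ctr[0] == 3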
++.macro _aes_gcm_update enc ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set LE_CTR_PTR, %rsi // Note: overlaps with usage as temp reg ++ .set GHASH_ACC_PTR, %rdx ++ .set SRC, %rcx ++ .set DST, %r8 ++ .set DATALEN, %r9d ++ .set DATALEN64, %r9 // Zero-extend DATALEN before using! ++ // Note: the code setting up for _load_partial_block assumes that SRC is ++ // in %rcx (and that DATALEN is *not* in %rcx). ++ ++ // Additional local variables ++ ++ // %rax and %rsi are used as temporary registers. Note: %rsi overlaps ++ // with LE_CTR_PTR, which is used only at the beginning. ++ ++ .set AESKEYLEN, %r10d // AES key length in bytes ++ .set AESKEYLEN64, %r10 ++ .set RNDKEYLAST_PTR, %r11 // Pointer to last AES round key ++ ++ // Put the most frequently used values in %xmm0-%xmm7 to reduce code ++ // size. (%xmm0-%xmm7 take fewer bytes to encode than %xmm8-%xmm15.) ++ .set TMP0, %xmm0 ++ .set TMP1, %xmm1 ++ .set TMP2, %xmm2 ++ .set LO, %xmm3 // Low part of unreduced product ++ .set MI, %xmm4 // Middle part of unreduced product ++ .set GHASH_ACC, %xmm5 // GHASH accumulator; in main loop also ++ // the high part of unreduced product ++ .set BSWAP_MASK, %xmm6 // Shuffle mask for reflecting bytes ++ .set LE_CTR, %xmm7 // Little-endian counter value ++ .set AESDATA0, %xmm8 ++ .set AESDATA1, %xmm9 ++ .set AESDATA2, %xmm10 ++ .set AESDATA3, %xmm11 ++ .set AESDATA4, %xmm12 ++ .set AESDATA5, %xmm13 ++ .set AESDATA6, %xmm14 ++ .set AESDATA7, %xmm15 ++ ++ movdqa .Lbswap_mask(%rip), BSWAP_MASK ++ movdqu (GHASH_ACC_PTR), GHASH_ACC ++ movdqu (LE_CTR_PTR), LE_CTR ++ ++ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN ++ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR ++ ++ // If there are at least 8*16 bytes of data, then continue into the main ++ // loop, which processes 8*16 bytes of data per iteration. ++ // ++ // The main loop interleaves AES and GHASH to improve performance on ++ // CPUs that can execute these instructions in parallel. When ++ // decrypting, the GHASH input (the ciphertext) is immediately ++ // available. When encrypting, we instead encrypt a set of 8 blocks ++ // first and then GHASH those blocks while encrypting the next set of 8, ++ // repeat that as needed, and finally GHASH the last set of 8 blocks. ++ // ++ // Code size optimization: Prefer adding or subtracting -8*16 over 8*16, ++ // as this makes the immediate fit in a signed byte, saving 3 bytes. ++ add $-8*16, DATALEN ++ jl .Lcrypt_loop_8x_done\@ ++.if \enc ++ // Encrypt the first 8 plaintext blocks. ++ _ctr_begin_8x ++ lea 16(KEY), %rsi ++ .p2align 4 ++1: ++ movdqa (%rsi), TMP0 ++ _aesenc_8x TMP0 ++ add $16, %rsi ++ cmp %rsi, RNDKEYLAST_PTR ++ jne 1b ++ movdqa (%rsi), TMP0 ++ _aesenclast_8x TMP0 ++ _xor_data_8x ++ // Don't increment DST until the ciphertext blocks have been hashed. ++ sub $-8*16, SRC ++ add $-8*16, DATALEN ++ jl .Lghash_last_ciphertext_8x\@ ++.endif ++ ++ .p2align 4 ++.Lcrypt_loop_8x\@: ++ ++ // Generate the next set of 8 counter blocks and start encrypting them. ++ _ctr_begin_8x ++ lea 16(KEY), %rsi ++ ++ // Do a round of AES, and start the GHASH update of 8 ciphertext blocks ++ // by doing the unreduced multiplication for the first ciphertext block. ++ movdqa (%rsi), TMP0 ++ add $16, %rsi ++ _aesenc_8x TMP0 ++ _ghash_update_begin_8x \enc ++ ++ // Do 7 more rounds of AES, and continue the GHASH update by doing the ++ // unreduced multiplication for the remaining ciphertext blocks. 
++ .p2align 4 ++1: ++ movdqa (%rsi), TMP0 ++ add $16, %rsi ++ _aesenc_8x TMP0 ++ _ghash_update_continue_8x \enc ++ cmp $7*8, %eax ++ jne 1b ++ ++ // Do the remaining AES rounds. ++ .p2align 4 ++1: ++ movdqa (%rsi), TMP0 ++ add $16, %rsi ++ _aesenc_8x TMP0 ++ cmp %rsi, RNDKEYLAST_PTR ++ jne 1b ++ ++ // Do the GHASH reduction and the last round of AES. ++ movdqa (RNDKEYLAST_PTR), TMP0 ++ _ghash_update_end_8x_step 0 ++ _aesenclast_8x TMP0 ++ _ghash_update_end_8x_step 1 ++ ++ // XOR the data with the AES-CTR keystream blocks. ++.if \enc ++ sub $-8*16, DST ++.endif ++ _xor_data_8x ++ sub $-8*16, SRC ++.if !\enc ++ sub $-8*16, DST ++.endif ++ add $-8*16, DATALEN ++ jge .Lcrypt_loop_8x\@ ++ ++.if \enc ++.Lghash_last_ciphertext_8x\@: ++ // Update GHASH with the last set of 8 ciphertext blocks. ++ _ghash_update_begin_8x \enc ++ .p2align 4 ++1: ++ _ghash_update_continue_8x \enc ++ cmp $7*8, %eax ++ jne 1b ++ _ghash_update_end_8x_step 0 ++ _ghash_update_end_8x_step 1 ++ sub $-8*16, DST ++.endif ++ ++.Lcrypt_loop_8x_done\@: ++ ++ sub $-8*16, DATALEN ++ jz .Ldone\@ ++ ++ // Handle the remainder of length 1 <= DATALEN < 8*16 bytes. We keep ++ // things simple and keep the code size down by just going one block at ++ // a time, again taking advantage of hardware loop unrolling. Since ++ // there are enough key powers available for all remaining data, we do ++ // the GHASH multiplications unreduced, and only reduce at the very end. ++ ++ .set HI, TMP2 ++ .set H_POW, AESDATA0 ++ .set H_POW_XORED, AESDATA1 ++ .set ONE, AESDATA2 ++ ++ movq .Lone(%rip), ONE ++ ++ // Start collecting the unreduced GHASH intermediate value LO, MI, HI. ++ pxor LO, LO ++ pxor MI, MI ++ pxor HI, HI ++ ++ // Set up a block counter %rax to contain 8*(8-n), where n is the number ++ // of blocks that remain, counting any partial block. This will be used ++ // to access the key powers H^n through H^1. ++ mov DATALEN, %eax ++ neg %eax ++ and $~15, %eax ++ sar $1, %eax ++ add $64, %eax ++ ++ sub $16, DATALEN ++ jl .Lcrypt_loop_1x_done\@ ++ ++ // Process the data one full block at a time. ++.Lcrypt_loop_1x\@: ++ ++ // Encrypt the next counter block. ++ _vpshufb BSWAP_MASK, LE_CTR, TMP0 ++ paddd ONE, LE_CTR ++ pxor (KEY), TMP0 ++ lea -6*16(RNDKEYLAST_PTR), %rsi // Reduce code size ++ cmp $24, AESKEYLEN ++ jl 128f // AES-128? ++ je 192f // AES-192? ++ // AES-256 ++ aesenc -7*16(%rsi), TMP0 ++ aesenc -6*16(%rsi), TMP0 ++192: ++ aesenc -5*16(%rsi), TMP0 ++ aesenc -4*16(%rsi), TMP0 ++128: ++.irp i, -3,-2,-1,0,1,2,3,4,5 ++ aesenc \i*16(%rsi), TMP0 ++.endr ++ aesenclast (RNDKEYLAST_PTR), TMP0 ++ ++ // Load the next key power H^i. ++ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW ++ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED ++ ++ // XOR the keystream block that was just generated in TMP0 with the next ++ // source data block and store the resulting en/decrypted data to DST. ++.if \enc ++ _xor_mem_to_reg (SRC), TMP0, tmp=TMP1 ++ movdqu TMP0, (DST) ++.else ++ movdqu (SRC), TMP1 ++ pxor TMP1, TMP0 ++ movdqu TMP0, (DST) ++.endif ++ ++ // Update GHASH with the ciphertext block. ++.if \enc ++ pshufb BSWAP_MASK, TMP0 ++ pxor TMP0, GHASH_ACC ++.else ++ pshufb BSWAP_MASK, TMP1 ++ pxor TMP1, GHASH_ACC ++.endif ++ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 ++ pxor GHASH_ACC, GHASH_ACC ++ ++ add $8, %eax ++ add $16, SRC ++ add $16, DST ++ sub $16, DATALEN ++ jge .Lcrypt_loop_1x\@ ++.Lcrypt_loop_1x_done\@: ++ // Check whether there is a partial block at the end. 
++ add $16, DATALEN ++ jz .Lghash_reduce\@ ++ ++ // Process a partial block of length 1 <= DATALEN <= 15. ++ ++ // Encrypt a counter block for the last time. ++ pshufb BSWAP_MASK, LE_CTR ++ pxor (KEY), LE_CTR ++ lea 16(KEY), %rsi ++1: ++ aesenc (%rsi), LE_CTR ++ add $16, %rsi ++ cmp %rsi, RNDKEYLAST_PTR ++ jne 1b ++ aesenclast (RNDKEYLAST_PTR), LE_CTR ++ ++ // Load the lowest key power, H^1. ++ movdqa OFFSETOF_H_POWERS(KEY,%rax,2), H_POW ++ movq OFFSETOF_H_POWERS_XORED(KEY,%rax), H_POW_XORED ++ ++ // Load and zero-pad 1 <= DATALEN <= 15 bytes of data from SRC. SRC is ++ // in %rcx, but _load_partial_block needs DATALEN in %rcx instead. ++ // RNDKEYLAST_PTR is no longer needed, so reuse it for SRC. ++ mov SRC, RNDKEYLAST_PTR ++ mov DATALEN, %ecx ++ _load_partial_block RNDKEYLAST_PTR, TMP0, %rsi, %esi ++ ++ // XOR the keystream block that was just generated in LE_CTR with the ++ // source data block and store the resulting en/decrypted data to DST. ++ pxor TMP0, LE_CTR ++ mov DATALEN, %ecx ++ _store_partial_block LE_CTR, DST ++ ++ // If encrypting, zero-pad the final ciphertext block for GHASH. (If ++ // decrypting, this was already done by _load_partial_block.) ++.if \enc ++ lea .Lzeropad_mask+16(%rip), %rax ++ sub DATALEN64, %rax ++ _vpand (%rax), LE_CTR, TMP0 ++.endif ++ ++ // Update GHASH with the final ciphertext block. ++ pshufb BSWAP_MASK, TMP0 ++ pxor TMP0, GHASH_ACC ++ _ghash_mul_noreduce H_POW, H_POW_XORED, GHASH_ACC, LO, MI, HI, TMP0 ++ ++.Lghash_reduce\@: ++ // Finally, do the GHASH reduction. ++ _ghash_reduce LO, MI, HI, GHASH_ACC, TMP0 ++ ++.Ldone\@: ++ // Store the updated GHASH accumulator back to memory. ++ movdqu GHASH_ACC, (GHASH_ACC_PTR) ++ ++ RET ++.endm ++ ++// void aes_gcm_enc_final_##suffix(const struct aes_gcm_key_aesni *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen); ++// bool aes_gcm_dec_final_##suffix(const struct aes_gcm_key_aesni *key, ++// const u32 le_ctr[4], const u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen, ++// const u8 tag[16], int taglen); ++// ++// This macro generates one of the above two functions (with \enc selecting ++// which one). Both functions finish computing the GCM authentication tag by ++// updating GHASH with the lengths block and encrypting the GHASH accumulator. ++// |total_aadlen| and |total_datalen| must be the total length of the additional ++// authenticated data and the en/decrypted data in bytes, respectively. ++// ++// The encryption function then stores the full-length (16-byte) computed ++// authentication tag to |ghash_acc|. The decryption function instead loads the ++// expected authentication tag (the one that was transmitted) from the 16-byte ++// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the ++// computed tag in constant time, and returns true if and only if they match. ++.macro _aes_gcm_final enc ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set LE_CTR_PTR, %rsi ++ .set GHASH_ACC_PTR, %rdx ++ .set TOTAL_AADLEN, %rcx ++ .set TOTAL_DATALEN, %r8 ++ .set TAG, %r9 ++ .set TAGLEN, %r10d // Originally at 8(%rsp) ++ .set TAGLEN64, %r10 ++ ++ // Additional local variables. ++ // %rax and %xmm0-%xmm2 are used as temporary registers. 
++ .set AESKEYLEN, %r11d ++ .set AESKEYLEN64, %r11 ++ .set BSWAP_MASK, %xmm3 ++ .set GHASH_ACC, %xmm4 ++ .set H_POW1, %xmm5 // H^1 ++ .set H_POW1_X64, %xmm6 // H^1 * x^64 ++ .set GFPOLY, %xmm7 ++ ++ movdqa .Lbswap_mask(%rip), BSWAP_MASK ++ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN ++ ++ // Set up a counter block with 1 in the low 32-bit word. This is the ++ // counter that produces the ciphertext needed to encrypt the auth tag. ++ movdqu (LE_CTR_PTR), %xmm0 ++ mov $1, %eax ++ pinsrd $0, %eax, %xmm0 ++ ++ // Build the lengths block and XOR it into the GHASH accumulator. ++ movq TOTAL_DATALEN, GHASH_ACC ++ pinsrq $1, TOTAL_AADLEN, GHASH_ACC ++ psllq $3, GHASH_ACC // Bytes to bits ++ _xor_mem_to_reg (GHASH_ACC_PTR), GHASH_ACC, %xmm1 ++ ++ movdqa OFFSETOF_H_POWERS+7*16(KEY), H_POW1 ++ movdqa OFFSETOF_H_TIMES_X64(KEY), H_POW1_X64 ++ movq .Lgfpoly(%rip), GFPOLY ++ ++ // Make %rax point to the 6th from last AES round key. (Using signed ++ // byte offsets -7*16 through 6*16 decreases code size.) ++ lea (KEY,AESKEYLEN64,4), %rax ++ ++ // AES-encrypt the counter block and also multiply GHASH_ACC by H^1. ++ // Interleave the AES and GHASH instructions to improve performance. ++ pshufb BSWAP_MASK, %xmm0 ++ pxor (KEY), %xmm0 ++ cmp $24, AESKEYLEN ++ jl 128f // AES-128? ++ je 192f // AES-192? ++ // AES-256 ++ aesenc -7*16(%rax), %xmm0 ++ aesenc -6*16(%rax), %xmm0 ++192: ++ aesenc -5*16(%rax), %xmm0 ++ aesenc -4*16(%rax), %xmm0 ++128: ++.irp i, 0,1,2,3,4,5,6,7,8 ++ aesenc (\i-3)*16(%rax), %xmm0 ++ _ghash_mul_step \i, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 ++.endr ++ aesenclast 6*16(%rax), %xmm0 ++ _ghash_mul_step 9, H_POW1, H_POW1_X64, GHASH_ACC, GFPOLY, %xmm1, %xmm2 ++ ++ // Undo the byte reflection of the GHASH accumulator. ++ pshufb BSWAP_MASK, GHASH_ACC ++ ++ // Encrypt the GHASH accumulator. ++ pxor %xmm0, GHASH_ACC ++ ++.if \enc ++ // Return the computed auth tag. ++ movdqu GHASH_ACC, (GHASH_ACC_PTR) ++.else ++ .set ZEROPAD_MASK_PTR, TOTAL_AADLEN // Reusing TOTAL_AADLEN! ++ ++ // Verify the auth tag in constant time by XOR'ing the transmitted and ++ // computed auth tags together and using the ptest instruction to check ++ // whether the first TAGLEN bytes of the result are zero. 
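The masked constant-time comparison that the code below implements with XOR and ptest against the .Lzeropad_mask window can be modeled in a few lines of Python (an editorial sketch, not part of the patch):

    def tags_match(computed, transmitted, taglen):
        # Constant-time check that the first 'taglen' bytes of two 16-byte
        # tags agree: XOR the tags, mask off bytes beyond taglen, test zero.
        diff = bytes(a ^ b for a, b in zip(computed, transmitted))
        mask = bytes([0xff] * taglen + [0x00] * (16 - taglen))
        acc = 0
        for d, m in zip(diff, mask):
            acc |= d & m
        return acc == 0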
++ _xor_mem_to_reg (TAG), GHASH_ACC, tmp=%xmm0 ++ movl 8(%rsp), TAGLEN ++ lea .Lzeropad_mask+16(%rip), ZEROPAD_MASK_PTR ++ sub TAGLEN64, ZEROPAD_MASK_PTR ++ xor %eax, %eax ++ _test_mem (ZEROPAD_MASK_PTR), GHASH_ACC, tmp=%xmm0 ++ sete %al ++.endif ++ RET ++.endm ++ ++.set USE_AVX, 0 ++SYM_FUNC_START(aes_gcm_precompute_aesni) ++ _aes_gcm_precompute ++SYM_FUNC_END(aes_gcm_precompute_aesni) ++SYM_FUNC_START(aes_gcm_aad_update_aesni) ++ _aes_gcm_aad_update ++SYM_FUNC_END(aes_gcm_aad_update_aesni) ++SYM_FUNC_START(aes_gcm_enc_update_aesni) ++ _aes_gcm_update 1 ++SYM_FUNC_END(aes_gcm_enc_update_aesni) ++SYM_FUNC_START(aes_gcm_dec_update_aesni) ++ _aes_gcm_update 0 ++SYM_FUNC_END(aes_gcm_dec_update_aesni) ++SYM_FUNC_START(aes_gcm_enc_final_aesni) ++ _aes_gcm_final 1 ++SYM_FUNC_END(aes_gcm_enc_final_aesni) ++SYM_FUNC_START(aes_gcm_dec_final_aesni) ++ _aes_gcm_final 0 ++SYM_FUNC_END(aes_gcm_dec_final_aesni) ++ ++.set USE_AVX, 1 ++SYM_FUNC_START(aes_gcm_precompute_aesni_avx) ++ _aes_gcm_precompute ++SYM_FUNC_END(aes_gcm_precompute_aesni_avx) ++SYM_FUNC_START(aes_gcm_aad_update_aesni_avx) ++ _aes_gcm_aad_update ++SYM_FUNC_END(aes_gcm_aad_update_aesni_avx) ++SYM_FUNC_START(aes_gcm_enc_update_aesni_avx) ++ _aes_gcm_update 1 ++SYM_FUNC_END(aes_gcm_enc_update_aesni_avx) ++SYM_FUNC_START(aes_gcm_dec_update_aesni_avx) ++ _aes_gcm_update 0 ++SYM_FUNC_END(aes_gcm_dec_update_aesni_avx) ++SYM_FUNC_START(aes_gcm_enc_final_aesni_avx) ++ _aes_gcm_final 1 ++SYM_FUNC_END(aes_gcm_enc_final_aesni_avx) ++SYM_FUNC_START(aes_gcm_dec_final_aesni_avx) ++ _aes_gcm_final 0 ++SYM_FUNC_END(aes_gcm_dec_final_aesni_avx) +diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S +new file mode 100644 +index 000000000000..97e0ee515fc5 +--- /dev/null ++++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S +@@ -0,0 +1,1222 @@ ++/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ ++// ++// VAES and VPCLMULQDQ optimized AES-GCM for x86_64 ++// ++// Copyright 2024 Google LLC ++// ++// Author: Eric Biggers ++// ++//------------------------------------------------------------------------------ ++// ++// This file is dual-licensed, meaning that you can use it under your choice of ++// either of the following two licenses: ++// ++// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy ++// of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++// ++// or ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions are met: ++// ++// 1. Redistributions of source code must retain the above copyright notice, ++// this list of conditions and the following disclaimer. ++// ++// 2. Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE ++// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++// POSSIBILITY OF SUCH DAMAGE. ++// ++//------------------------------------------------------------------------------ ++// ++// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that ++// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and ++// either AVX512 or AVX10. Some of the functions, notably the encryption and ++// decryption update functions which are the most performance-critical, are ++// provided in two variants generated from a macro: one using 256-bit vectors ++// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The ++// other, "shared" functions (vaes_avx10) use at most 256-bit vectors. ++// ++// The functions that use 512-bit vectors are intended for CPUs that support ++// 512-bit vectors *and* where using them doesn't cause significant ++// downclocking. They require the following CPU features: ++// ++// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) ++// ++// The other functions require the following CPU features: ++// ++// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) ++// ++// All functions use the "System V" ABI. The Windows ABI is not supported. ++// ++// Note that we use "avx10" in the names of the functions as a shorthand to ++// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's ++// introduction of AVX512 and then its replacement by AVX10, there doesn't seem ++// to be a simple way to name things that makes sense on all CPUs. ++// ++// Note that the macros that support both 256-bit and 512-bit vectors could ++// fairly easily be changed to support 128-bit too. However, this would *not* ++// be sufficient to allow the code to run on CPUs without AVX512 or AVX10, ++// because the code heavily uses several features of these extensions other than ++// the vector length: the increase in the number of SIMD registers from 16 to ++// 32, masking support, and new instructions such as vpternlogd (which can do a ++// three-argument XOR). These features are very useful for AES-GCM. ++ ++#include ++ ++.section .rodata ++.p2align 6 ++ ++ // A shuffle mask that reflects the bytes of 16-byte blocks ++.Lbswap_mask: ++ .octa 0x000102030405060708090a0b0c0d0e0f ++ ++ // This is the GHASH reducing polynomial without its constant term, i.e. ++ // x^128 + x^7 + x^2 + x, represented using the backwards mapping ++ // between bits and polynomial coefficients. ++ // ++ // Alternatively, it can be interpreted as the naturally-ordered ++ // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the ++ // "reversed" GHASH reducing polynomial without its x^128 term. ++.Lgfpoly: ++ .octa 0xc2000000000000000000000000000001 ++ ++ // Same as above, but with the (1 << 64) bit set. ++.Lgfpoly_and_internal_carrybit: ++ .octa 0xc2000000000000010000000000000001 ++ ++ // The below constants are used for incrementing the counter blocks. ++ // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. 
++ // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and ++ // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. ++.Lctr_pattern: ++ .octa 0 ++ .octa 1 ++.Linc_2blocks: ++ .octa 2 ++ .octa 3 ++.Linc_4blocks: ++ .octa 4 ++ ++// Number of powers of the hash key stored in the key struct. The powers are ++// stored from highest (H^NUM_H_POWERS) to lowest (H^1). ++#define NUM_H_POWERS 16 ++ ++// Offset to AES key length (in bytes) in the key struct ++#define OFFSETOF_AESKEYLEN 480 ++ ++// Offset to start of hash key powers array in the key struct ++#define OFFSETOF_H_POWERS 512 ++ ++// Offset to end of hash key powers array in the key struct. ++// ++// This is immediately followed by three zeroized padding blocks, which are ++// included so that partial vectors can be handled more easily. E.g. if VL=64 ++// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most ++// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. ++#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) ++ ++.text ++ ++// Set the vector length in bytes. This sets the VL variable and defines ++// register aliases V0-V31 that map to the ymm or zmm registers. ++.macro _set_veclen vl ++ .set VL, \vl ++.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ ++ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 ++.if VL == 32 ++ .set V\i, %ymm\i ++.elseif VL == 64 ++ .set V\i, %zmm\i ++.else ++ .error "Unsupported vector length" ++.endif ++.endr ++.endm ++ ++// The _ghash_mul_step macro does one step of GHASH multiplication of the ++// 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the ++// reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the ++// same size as \a and \b. To complete all steps, this must invoked with \i=0 ++// through \i=9. The division into steps allows users of this macro to ++// optionally interleave the computation with other instructions. Users of this ++// macro must preserve the parameter registers across steps. ++// ++// The multiplications are done in GHASH's representation of the finite field ++// GF(2^128). Elements of GF(2^128) are represented as binary polynomials ++// (i.e. polynomials whose coefficients are bits) modulo a reducing polynomial ++// G. The GCM specification uses G = x^128 + x^7 + x^2 + x + 1. Addition is ++// just XOR, while multiplication is more complex and has two parts: (a) do ++// carryless multiplication of two 128-bit input polynomials to get a 256-bit ++// intermediate product polynomial, and (b) reduce the intermediate product to ++// 128 bits by adding multiples of G that cancel out terms in it. (Adding ++// multiples of G doesn't change which field element the polynomial represents.) ++// ++// Unfortunately, the GCM specification maps bits to/from polynomial ++// coefficients backwards from the natural order. In each byte it specifies the ++// highest bit to be the lowest order polynomial coefficient, *not* the highest! ++// This makes it nontrivial to work with the GHASH polynomials. We could ++// reflect the bits, but x86 doesn't have an instruction that does that. ++// ++// Instead, we operate on the values without bit-reflecting them. This *mostly* ++// just works, since XOR and carryless multiplication are symmetric with respect ++// to bit order, but it has some consequences. First, due to GHASH's byte ++// order, by skipping bit reflection, *byte* reflection becomes necessary to ++// give the polynomial terms a consistent order. 
E.g., considering an N-bit ++// value interpreted using the G = x^128 + x^7 + x^2 + x + 1 convention, bits 0 ++// through N-1 of the byte-reflected value represent the coefficients of x^(N-1) ++// through x^0, whereas bits 0 through N-1 of the non-byte-reflected value ++// represent x^7...x^0, x^15...x^8, ..., x^(N-1)...x^(N-8) which can't be worked ++// with. Fortunately, x86's vpshufb instruction can do byte reflection. ++// ++// Second, forgoing the bit reflection causes an extra multiple of x (still ++// using the G = x^128 + x^7 + x^2 + x + 1 convention) to be introduced by each ++// multiplication. This is because an M-bit by N-bit carryless multiplication ++// really produces a (M+N-1)-bit product, but in practice it's zero-extended to ++// M+N bits. In the G = x^128 + x^7 + x^2 + x + 1 convention, which maps bits ++// to polynomial coefficients backwards, this zero-extension actually changes ++// the product by introducing an extra factor of x. Therefore, users of this ++// macro must ensure that one of the inputs has an extra factor of x^-1, i.e. ++// the multiplicative inverse of x, to cancel out the extra x. ++// ++// Third, the backwards coefficients convention is just confusing to work with, ++// since it makes "low" and "high" in the polynomial math mean the opposite of ++// their normal meaning in computer programming. This can be solved by using an ++// alternative interpretation: the polynomial coefficients are understood to be ++// in the natural order, and the multiplication is actually \a * \b * x^-128 mod ++// x^128 + x^127 + x^126 + x^121 + 1. This doesn't change the inputs, outputs, ++// or the implementation at all; it just changes the mathematical interpretation ++// of what each instruction is doing. Starting from here, we'll use this ++// alternative interpretation, as it's easier to understand the code that way. ++// ++// Moving onto the implementation, the vpclmulqdq instruction does 64 x 64 => ++// 128-bit carryless multiplication, so we break the 128 x 128 multiplication ++// into parts as follows (the _L and _H suffixes denote low and high 64 bits): ++// ++// LO = a_L * b_L ++// MI = (a_L * b_H) + (a_H * b_L) ++// HI = a_H * b_H ++// ++// The 256-bit product is x^128*HI + x^64*MI + LO. LO, MI, and HI are 128-bit. ++// Note that MI "overlaps" with LO and HI. We don't consolidate MI into LO and ++// HI right away, since the way the reduction works makes that unnecessary. ++// ++// For the reduction, we cancel out the low 128 bits by adding multiples of G = ++// x^128 + x^127 + x^126 + x^121 + 1. This is done by two iterations, each of ++// which cancels out the next lowest 64 bits. Consider a value x^64*A + B, ++// where A and B are 128-bit. Adding B_L*G to that value gives: ++// ++// x^64*A + B + B_L*G ++// = x^64*A + x^64*B_H + B_L + B_L*(x^128 + x^127 + x^126 + x^121 + 1) ++// = x^64*A + x^64*B_H + B_L + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L ++// = x^64*A + x^64*B_H + x^128*B_L + x^64*B_L*(x^63 + x^62 + x^57) + B_L + B_L ++// = x^64*(A + B_H + x^64*B_L + B_L*(x^63 + x^62 + x^57)) ++// ++// So: if we sum A, B with its halves swapped, and the low half of B times x^63 ++// + x^62 + x^57, we get a 128-bit value C where x^64*C is congruent to the ++// original value x^64*A + B. I.e., the low 64 bits got canceled out. ++// ++// We just need to apply this twice: first to fold LO into MI, and second to ++// fold the updated MI into HI. 
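A quick Python check of the folding identity just derived, with GF(2) polynomials represented as integers in the natural bit order (an editorial sketch, not part of the patch):

    import random

    def clmul(a, b):
        # Carryless (GF(2)) polynomial multiplication on arbitrary-size ints.
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    G = (1 << 128) | (1 << 127) | (1 << 126) | (1 << 121) | 1  # x^128 + x^127 + x^126 + x^121 + 1

    A = random.getrandbits(128)
    B = random.getrandbits(128)
    B_L, B_H = B & (2**64 - 1), B >> 64

    V = (A << 64) ^ B             # the value x^64*A + B
    folded = V ^ clmul(B_L, G)    # add the multiple B_L*G of the reducing polynomial

    assert folded % (1 << 64) == 0  # the low 64 bits have been canceled
    C = folded >> 64
    # C = A + B with its halves swapped + B_L*(x^63 + x^62 + x^57)
    assert C == A ^ B_H ^ (B_L << 64) ^ clmul(B_L, (1 << 63) | (1 << 62) | (1 << 57))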
++// ++// The needed three-argument XORs are done using the vpternlogd instruction with ++// immediate 0x96, since this is faster than two vpxord instructions. ++// ++// A potential optimization, assuming that b is fixed per-key (if a is fixed ++// per-key it would work the other way around), is to use one iteration of the ++// reduction described above to precompute a value c such that x^64*c = b mod G, ++// and then multiply a_L by c (and implicitly by x^64) instead of by b: ++// ++// MI = (a_L * c_L) + (a_H * b_L) ++// HI = (a_L * c_H) + (a_H * b_H) ++// ++// This would eliminate the LO part of the intermediate product, which would ++// eliminate the need to fold LO into MI. This would save two instructions, ++// including a vpclmulqdq. However, we currently don't use this optimization ++// because it would require twice as many per-key precomputed values. ++// ++// Using Karatsuba multiplication instead of "schoolbook" multiplication ++// similarly would save a vpclmulqdq but does not seem to be worth it. ++.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 ++.if \i == 0 ++ vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L ++ vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H ++.elseif \i == 1 ++ vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L ++.elseif \i == 2 ++ vpxord \t2, \t1, \t1 // MI = MI_0 + MI_1 ++.elseif \i == 3 ++ vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) ++.elseif \i == 4 ++ vpshufd $0x4e, \t0, \t0 // Swap halves of LO ++.elseif \i == 5 ++ vpternlogd $0x96, \t2, \t0, \t1 // Fold LO into MI ++.elseif \i == 6 ++ vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H ++.elseif \i == 7 ++ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) ++.elseif \i == 8 ++ vpshufd $0x4e, \t1, \t1 // Swap halves of MI ++.elseif \i == 9 ++ vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI ++.endif ++.endm ++ ++// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store ++// the reduced products in \dst. See _ghash_mul_step for full explanation. ++.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 ++.irp i, 0,1,2,3,4,5,6,7,8,9 ++ _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 ++.endr ++.endm ++ ++// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the ++// *unreduced* products to \lo, \mi, and \hi. ++.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0, t1, t2, t3 ++ vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L ++ vpclmulqdq $0x01, \a, \b, \t1 // a_L * b_H ++ vpclmulqdq $0x10, \a, \b, \t2 // a_H * b_L ++ vpclmulqdq $0x11, \a, \b, \t3 // a_H * b_H ++ vpxord \t0, \lo, \lo ++ vpternlogd $0x96, \t2, \t1, \mi ++ vpxord \t3, \hi, \hi ++.endm ++ ++// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit ++// reduced products in \hi. See _ghash_mul_step for explanation of reduction. ++.macro _ghash_reduce lo, mi, hi, gfpoly, t0 ++ vpclmulqdq $0x01, \lo, \gfpoly, \t0 ++ vpshufd $0x4e, \lo, \lo ++ vpternlogd $0x96, \t0, \lo, \mi ++ vpclmulqdq $0x01, \mi, \gfpoly, \t0 ++ vpshufd $0x4e, \mi, \mi ++ vpternlogd $0x96, \t0, \mi, \hi ++.endm ++ ++// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); ++// ++// Given the expanded AES key |key->aes_key|, this function derives the GHASH ++// subkey and initializes |key->ghash_key_powers| with powers of it. ++// ++// The number of key powers initialized is NUM_H_POWERS, and they are stored in ++// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key ++// powers themselves are also initialized. 
++// ++// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked ++// with the desired length. In the VL=32 case, the function computes twice as ++// many key powers than are actually used by the VL=32 GCM update functions. ++// This is done to keep the key format the same regardless of vector length. ++.macro _aes_gcm_precompute ++ ++ // Function arguments ++ .set KEY, %rdi ++ ++ // Additional local variables. V0-V2 and %rax are used as temporaries. ++ .set POWERS_PTR, %rsi ++ .set RNDKEYLAST_PTR, %rdx ++ .set H_CUR, V3 ++ .set H_CUR_YMM, %ymm3 ++ .set H_CUR_XMM, %xmm3 ++ .set H_INC, V4 ++ .set H_INC_YMM, %ymm4 ++ .set H_INC_XMM, %xmm4 ++ .set GFPOLY, V5 ++ .set GFPOLY_YMM, %ymm5 ++ .set GFPOLY_XMM, %xmm5 ++ ++ // Get pointer to lowest set of key powers (located at end of array). ++ lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR ++ ++ // Encrypt an all-zeroes block to get the raw hash subkey. ++ movl OFFSETOF_AESKEYLEN(KEY), %eax ++ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR ++ vmovdqu (KEY), %xmm0 // Zero-th round key XOR all-zeroes block ++ add $16, KEY ++1: ++ vaesenc (KEY), %xmm0, %xmm0 ++ add $16, KEY ++ cmp KEY, RNDKEYLAST_PTR ++ jne 1b ++ vaesenclast (RNDKEYLAST_PTR), %xmm0, %xmm0 ++ ++ // Reflect the bytes of the raw hash subkey. ++ vpshufb .Lbswap_mask(%rip), %xmm0, H_CUR_XMM ++ ++ // Zeroize the padding blocks. ++ vpxor %xmm0, %xmm0, %xmm0 ++ vmovdqu %ymm0, VL(POWERS_PTR) ++ vmovdqu %xmm0, VL+2*16(POWERS_PTR) ++ ++ // Finish preprocessing the first key power, H^1. Since this GHASH ++ // implementation operates directly on values with the backwards bit ++ // order specified by the GCM standard, it's necessary to preprocess the ++ // raw key as follows. First, reflect its bytes. Second, multiply it ++ // by x^-1 mod x^128 + x^7 + x^2 + x + 1 (if using the backwards ++ // interpretation of polynomial coefficients), which can also be ++ // interpreted as multiplication by x mod x^128 + x^127 + x^126 + x^121 ++ // + 1 using the alternative, natural interpretation of polynomial ++ // coefficients. For details, see the comment above _ghash_mul_step. ++ // ++ // Either way, for the multiplication the concrete operation performed ++ // is a left shift of the 128-bit value by 1 bit, then an XOR with (0xc2 ++ // << 120) | 1 if a 1 bit was carried out. However, there's no 128-bit ++ // wide shift instruction, so instead double each of the two 64-bit ++ // halves and incorporate the internal carry bit into the value XOR'd. ++ vpshufd $0xd3, H_CUR_XMM, %xmm0 ++ vpsrad $31, %xmm0, %xmm0 ++ vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM ++ vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0 ++ vpxor %xmm0, H_CUR_XMM, H_CUR_XMM ++ ++ // Load the gfpoly constant. ++ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY ++ ++ // Square H^1 to get H^2. ++ // ++ // Note that as with H^1, all higher key powers also need an extra ++ // factor of x^-1 (or x using the natural interpretation). Nothing ++ // special needs to be done to make this happen, though: H^1 * H^1 would ++ // end up with two factors of x^-1, but the multiplication consumes one. ++ // So the product H^2 ends up with the desired one factor of x^-1. ++ _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ ++ %xmm0, %xmm1, %xmm2 ++ ++ // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. ++ vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM ++ vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM ++ ++.if VL == 64 ++ // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. 
++ _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ ++ %ymm0, %ymm1, %ymm2 ++ vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR ++ vshufi64x2 $0, H_INC, H_INC, H_INC ++.endif ++ ++ // Store the lowest set of key powers. ++ vmovdqu8 H_CUR, (POWERS_PTR) ++ ++ // Compute and store the remaining key powers. With VL=32, repeatedly ++ // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. ++ // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by ++ // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. ++ mov $(NUM_H_POWERS*16/VL) - 1, %eax ++.Lprecompute_next\@: ++ sub $VL, POWERS_PTR ++ _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 ++ vmovdqu8 H_CUR, (POWERS_PTR) ++ dec %eax ++ jnz .Lprecompute_next\@ ++ ++ vzeroupper // This is needed after using ymm or zmm registers. ++ RET ++.endm ++ ++// XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store ++// the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. ++.macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm ++ vextracti32x4 $1, \src, \t0_xmm ++.if VL == 32 ++ vpxord \t0_xmm, \src_xmm, \dst_xmm ++.elseif VL == 64 ++ vextracti32x4 $2, \src, \t1_xmm ++ vextracti32x4 $3, \src, \t2_xmm ++ vpxord \t0_xmm, \src_xmm, \dst_xmm ++ vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm ++.else ++ .error "Unsupported vector length" ++.endif ++.endm ++ ++// Do one step of the GHASH update of the data blocks given in the vector ++// registers GHASHDATA[0-3]. \i specifies the step to do, 0 through 9. The ++// division into steps allows users of this macro to optionally interleave the ++// computation with other instructions. This macro uses the vector register ++// GHASH_ACC as input/output; GHASHDATA[0-3] as inputs that are clobbered; ++// H_POW[4-1], GFPOLY, and BSWAP_MASK as inputs that aren't clobbered; and ++// GHASHTMP[0-2] as temporaries. This macro handles the byte-reflection of the ++// data blocks. The parameter registers must be preserved across steps. ++// ++// The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + ++// H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the ++// operations are vectorized operations on vectors of 16-byte blocks. E.g., ++// with VL=32 there are 2 blocks per vector and the vectorized terms correspond ++// to the following non-vectorized terms: ++// ++// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) ++// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 ++// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 ++// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 ++// ++// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. ++// ++// More concretely, this code does: ++// - Do vectorized "schoolbook" multiplications to compute the intermediate ++// 256-bit product of each block and its corresponding hash key power. ++// There are 4*VL/16 of these intermediate products. ++// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves ++// VL/16 256-bit intermediate values. ++// - Do a vectorized reduction of these 256-bit intermediate values to ++// 128-bits each. This leaves VL/16 128-bit intermediate values. ++// - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. ++// ++// See _ghash_mul_step for the full explanation of the operations performed for ++// each individual finite field multiplication and reduction. 
++.macro _ghash_step_4x i ++.if \i == 0 ++ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 ++ vpxord GHASH_ACC, GHASHDATA0, GHASHDATA0 ++ vpshufb BSWAP_MASK, GHASHDATA1, GHASHDATA1 ++ vpshufb BSWAP_MASK, GHASHDATA2, GHASHDATA2 ++.elseif \i == 1 ++ vpshufb BSWAP_MASK, GHASHDATA3, GHASHDATA3 ++ vpclmulqdq $0x00, H_POW4, GHASHDATA0, GHASH_ACC // LO_0 ++ vpclmulqdq $0x00, H_POW3, GHASHDATA1, GHASHTMP0 // LO_1 ++ vpclmulqdq $0x00, H_POW2, GHASHDATA2, GHASHTMP1 // LO_2 ++.elseif \i == 2 ++ vpxord GHASHTMP0, GHASH_ACC, GHASH_ACC // sum(LO_{1,0}) ++ vpclmulqdq $0x00, H_POW1, GHASHDATA3, GHASHTMP2 // LO_3 ++ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASH_ACC // LO = sum(LO_{3,2,1,0}) ++ vpclmulqdq $0x01, H_POW4, GHASHDATA0, GHASHTMP0 // MI_0 ++.elseif \i == 3 ++ vpclmulqdq $0x01, H_POW3, GHASHDATA1, GHASHTMP1 // MI_1 ++ vpclmulqdq $0x01, H_POW2, GHASHDATA2, GHASHTMP2 // MI_2 ++ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{2,1,0}) ++ vpclmulqdq $0x01, H_POW1, GHASHDATA3, GHASHTMP1 // MI_3 ++.elseif \i == 4 ++ vpclmulqdq $0x10, H_POW4, GHASHDATA0, GHASHTMP2 // MI_4 ++ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{4,3,2,1,0}) ++ vpclmulqdq $0x10, H_POW3, GHASHDATA1, GHASHTMP1 // MI_5 ++ vpclmulqdq $0x10, H_POW2, GHASHDATA2, GHASHTMP2 // MI_6 ++.elseif \i == 5 ++ vpternlogd $0x96, GHASHTMP2, GHASHTMP1, GHASHTMP0 // sum(MI_{6,5,4,3,2,1,0}) ++ vpclmulqdq $0x01, GHASH_ACC, GFPOLY, GHASHTMP2 // LO_L*(x^63 + x^62 + x^57) ++ vpclmulqdq $0x10, H_POW1, GHASHDATA3, GHASHTMP1 // MI_7 ++ vpxord GHASHTMP1, GHASHTMP0, GHASHTMP0 // MI = sum(MI_{7,6,5,4,3,2,1,0}) ++.elseif \i == 6 ++ vpshufd $0x4e, GHASH_ACC, GHASH_ACC // Swap halves of LO ++ vpclmulqdq $0x11, H_POW4, GHASHDATA0, GHASHDATA0 // HI_0 ++ vpclmulqdq $0x11, H_POW3, GHASHDATA1, GHASHDATA1 // HI_1 ++ vpclmulqdq $0x11, H_POW2, GHASHDATA2, GHASHDATA2 // HI_2 ++.elseif \i == 7 ++ vpternlogd $0x96, GHASHTMP2, GHASH_ACC, GHASHTMP0 // Fold LO into MI ++ vpclmulqdq $0x11, H_POW1, GHASHDATA3, GHASHDATA3 // HI_3 ++ vpternlogd $0x96, GHASHDATA2, GHASHDATA1, GHASHDATA0 // sum(HI_{2,1,0}) ++ vpclmulqdq $0x01, GHASHTMP0, GFPOLY, GHASHTMP1 // MI_L*(x^63 + x^62 + x^57) ++.elseif \i == 8 ++ vpxord GHASHDATA3, GHASHDATA0, GHASH_ACC // HI = sum(HI_{3,2,1,0}) ++ vpshufd $0x4e, GHASHTMP0, GHASHTMP0 // Swap halves of MI ++ vpternlogd $0x96, GHASHTMP1, GHASHTMP0, GHASH_ACC // Fold MI into HI ++.elseif \i == 9 ++ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ ++ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM ++.endif ++.endm ++ ++// Do one non-last round of AES encryption on the counter blocks in V0-V3 using ++// the round key that has been broadcast to all 128-bit lanes of \round_key. ++.macro _vaesenc_4x round_key ++ vaesenc \round_key, V0, V0 ++ vaesenc \round_key, V1, V1 ++ vaesenc \round_key, V2, V2 ++ vaesenc \round_key, V3, V3 ++.endm ++ ++// Start the AES encryption of four vectors of counter blocks. ++.macro _ctr_begin_4x ++ ++ // Increment LE_CTR four times to generate four vectors of little-endian ++ // counter blocks, swap each to big-endian, and store them in V0-V3. ++ vpshufb BSWAP_MASK, LE_CTR, V0 ++ vpaddd LE_CTR_INC, LE_CTR, LE_CTR ++ vpshufb BSWAP_MASK, LE_CTR, V1 ++ vpaddd LE_CTR_INC, LE_CTR, LE_CTR ++ vpshufb BSWAP_MASK, LE_CTR, V2 ++ vpaddd LE_CTR_INC, LE_CTR, LE_CTR ++ vpshufb BSWAP_MASK, LE_CTR, V3 ++ vpaddd LE_CTR_INC, LE_CTR, LE_CTR ++ ++ // AES "round zero": XOR in the zero-th round key. 
++ vpxord RNDKEY0, V0, V0 ++ vpxord RNDKEY0, V1, V1 ++ vpxord RNDKEY0, V2, V2 ++ vpxord RNDKEY0, V3, V3 ++.endm ++ ++// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// const u8 *src, u8 *dst, int datalen); ++// ++// This macro generates a GCM encryption or decryption update function with the ++// above prototype (with \enc selecting which one). This macro supports both ++// VL=32 and VL=64. _set_veclen must have been invoked with the desired length. ++// ++// This function computes the next portion of the CTR keystream, XOR's it with ++// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted ++// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the ++// next |datalen| ciphertext bytes. ++// ++// |datalen| must be a multiple of 16, except on the last call where it can be ++// any length. The caller must do any buffering needed to ensure this. Both ++// in-place and out-of-place en/decryption are supported. ++// ++// |le_ctr| must give the current counter in little-endian format. For a new ++// message, the low word of the counter must be 2. This function loads the ++// counter from |le_ctr| and increments the loaded counter as needed, but it ++// does *not* store the updated counter back to |le_ctr|. The caller must ++// update |le_ctr| if any more data segments follow. Internally, only the low ++// 32-bit word of the counter is incremented, following the GCM standard. ++.macro _aes_gcm_update enc ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set LE_CTR_PTR, %rsi ++ .set GHASH_ACC_PTR, %rdx ++ .set SRC, %rcx ++ .set DST, %r8 ++ .set DATALEN, %r9d ++ .set DATALEN64, %r9 // Zero-extend DATALEN before using! ++ ++ // Additional local variables ++ ++ // %rax and %k1 are used as temporary registers. LE_CTR_PTR is also ++ // available as a temporary register after the counter is loaded. ++ ++ // AES key length in bytes ++ .set AESKEYLEN, %r10d ++ .set AESKEYLEN64, %r10 ++ ++ // Pointer to the last AES round key for the chosen AES variant ++ .set RNDKEYLAST_PTR, %r11 ++ ++ // In the main loop, V0-V3 are used as AES input and output. Elsewhere ++ // they are used as temporary registers. ++ ++ // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. ++ .set GHASHDATA0, V4 ++ .set GHASHDATA0_XMM, %xmm4 ++ .set GHASHDATA1, V5 ++ .set GHASHDATA1_XMM, %xmm5 ++ .set GHASHDATA2, V6 ++ .set GHASHDATA2_XMM, %xmm6 ++ .set GHASHDATA3, V7 ++ ++ // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values ++ // using vpshufb, copied to all 128-bit lanes. ++ .set BSWAP_MASK, V8 ++ ++ // RNDKEY temporarily holds the next AES round key. ++ .set RNDKEY, V9 ++ ++ // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, ++ // only the lowest 128-bit lane can be nonzero. When not fully reduced, ++ // more than one lane may be used, and they need to be XOR'd together. ++ .set GHASH_ACC, V10 ++ .set GHASH_ACC_XMM, %xmm10 ++ ++ // LE_CTR_INC is the vector of 32-bit words that need to be added to a ++ // vector of little-endian counter blocks to advance it forwards. ++ .set LE_CTR_INC, V11 ++ ++ // LE_CTR contains the next set of little-endian counter blocks. ++ .set LE_CTR, V12 ++ ++ // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys, ++ // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, ++ // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. 
++ .set RNDKEY0, V13 ++ .set RNDKEYLAST, V14 ++ .set RNDKEY_M9, V15 ++ .set RNDKEY_M8, V16 ++ .set RNDKEY_M7, V17 ++ .set RNDKEY_M6, V18 ++ .set RNDKEY_M5, V19 ++ ++ // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with ++ // the corresponding block of source data. This is useful because ++ // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can ++ // be computed in parallel with the AES rounds. ++ .set RNDKEYLAST0, V20 ++ .set RNDKEYLAST1, V21 ++ .set RNDKEYLAST2, V22 ++ .set RNDKEYLAST3, V23 ++ ++ // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These ++ // cannot coincide with anything used for AES encryption, since for ++ // performance reasons GHASH and AES encryption are interleaved. ++ .set GHASHTMP0, V24 ++ .set GHASHTMP1, V25 ++ .set GHASHTMP2, V26 ++ ++ // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The ++ // descending numbering reflects the order of the key powers. ++ .set H_POW4, V27 ++ .set H_POW3, V28 ++ .set H_POW2, V29 ++ .set H_POW1, V30 ++ ++ // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. ++ .set GFPOLY, V31 ++ ++ // Load some constants. ++ vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK ++ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY ++ ++ // Load the GHASH accumulator and the starting counter. ++ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM ++ vbroadcasti32x4 (LE_CTR_PTR), LE_CTR ++ ++ // Load the AES key length in bytes. ++ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN ++ ++ // Make RNDKEYLAST_PTR point to the last AES round key. This is the ++ // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 ++ // respectively. Then load the zero-th and last round keys. ++ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR ++ vbroadcasti32x4 (KEY), RNDKEY0 ++ vbroadcasti32x4 (RNDKEYLAST_PTR), RNDKEYLAST ++ ++ // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. ++ vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR ++ ++ // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. ++.if VL == 32 ++ vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC ++.elseif VL == 64 ++ vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC ++.else ++ .error "Unsupported vector length" ++.endif ++ ++ // If there are at least 4*VL bytes of data, then continue into the loop ++ // that processes 4*VL bytes of data at a time. Otherwise skip it. ++ // ++ // Pre-subtracting 4*VL from DATALEN saves an instruction from the main ++ // loop and also ensures that at least one write always occurs to ++ // DATALEN, zero-extending it and allowing DATALEN64 to be used later. ++ sub $4*VL, DATALEN ++ jl .Lcrypt_loop_4x_done\@ ++ ++ // Load powers of the hash key. ++ vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 ++ vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 ++ vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 ++ vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 ++ ++ // Main loop: en/decrypt and hash 4 vectors at a time. ++ // ++ // When possible, interleave the AES encryption of the counter blocks ++ // with the GHASH update of the ciphertext blocks. This improves ++ // performance on many CPUs because the execution ports used by the VAES ++ // instructions often differ from those used by vpclmulqdq and other ++ // instructions used in GHASH. For example, many Intel CPUs dispatch ++ // vaesenc to ports 0 and 1 and vpclmulqdq to port 5. ++ // ++ // The interleaving is easiest to do during decryption, since during ++ // decryption the ciphertext blocks are immediately available. 
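
Stepping back to the RNDKEYLAST_PTR setup above: the "lea 6*16(KEY,AESKEYLEN64,4)" works because the last round key sits 16*rounds bytes after the first one (assuming, as the lea implies, that the expanded round keys start at the beginning of the key struct), and 96 + 4*keylen happens to equal 16*rounds for all three AES variants. A one-line sanity check:

    # rounds = 10/12/14 for AES-128/192/256, i.e. key lengths of 16/24/32 bytes
    for keylen, rounds in ((16, 10), (24, 12), (32, 14)):
        assert 6 * 16 + 4 * keylen == 16 * rounds
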
For ++ // encryption, instead encrypt the first set of blocks, then hash those ++ // blocks while encrypting the next set of blocks, repeat that as ++ // needed, and finally hash the last set of blocks. ++ ++.if \enc ++ // Encrypt the first 4 vectors of plaintext blocks. Leave the resulting ++ // ciphertext in GHASHDATA[0-3] for GHASH. ++ _ctr_begin_4x ++ lea 16(KEY), %rax ++1: ++ vbroadcasti32x4 (%rax), RNDKEY ++ _vaesenc_4x RNDKEY ++ add $16, %rax ++ cmp %rax, RNDKEYLAST_PTR ++ jne 1b ++ vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 ++ vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 ++ vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 ++ vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 ++ vaesenclast RNDKEYLAST0, V0, GHASHDATA0 ++ vaesenclast RNDKEYLAST1, V1, GHASHDATA1 ++ vaesenclast RNDKEYLAST2, V2, GHASHDATA2 ++ vaesenclast RNDKEYLAST3, V3, GHASHDATA3 ++ vmovdqu8 GHASHDATA0, 0*VL(DST) ++ vmovdqu8 GHASHDATA1, 1*VL(DST) ++ vmovdqu8 GHASHDATA2, 2*VL(DST) ++ vmovdqu8 GHASHDATA3, 3*VL(DST) ++ add $4*VL, SRC ++ add $4*VL, DST ++ sub $4*VL, DATALEN ++ jl .Lghash_last_ciphertext_4x\@ ++.endif ++ ++ // Cache as many additional AES round keys as possible. ++.irp i, 9,8,7,6,5 ++ vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i ++.endr ++ ++.Lcrypt_loop_4x\@: ++ ++ // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. If ++ // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. ++.if !\enc ++ vmovdqu8 0*VL(SRC), GHASHDATA0 ++ vmovdqu8 1*VL(SRC), GHASHDATA1 ++ vmovdqu8 2*VL(SRC), GHASHDATA2 ++ vmovdqu8 3*VL(SRC), GHASHDATA3 ++.endif ++ ++ // Start the AES encryption of the counter blocks. ++ _ctr_begin_4x ++ cmp $24, AESKEYLEN ++ jl 128f // AES-128? ++ je 192f // AES-192? ++ // AES-256 ++ vbroadcasti32x4 -13*16(RNDKEYLAST_PTR), RNDKEY ++ _vaesenc_4x RNDKEY ++ vbroadcasti32x4 -12*16(RNDKEYLAST_PTR), RNDKEY ++ _vaesenc_4x RNDKEY ++192: ++ vbroadcasti32x4 -11*16(RNDKEYLAST_PTR), RNDKEY ++ _vaesenc_4x RNDKEY ++ vbroadcasti32x4 -10*16(RNDKEYLAST_PTR), RNDKEY ++ _vaesenc_4x RNDKEY ++128: ++ ++ // XOR the source data with the last round key, saving the result in ++ // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the ++ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). ++.if \enc ++ vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 ++ vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 ++ vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 ++ vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 ++.else ++ vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0 ++ vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1 ++ vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2 ++ vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3 ++.endif ++ ++ // Finish the AES encryption of the counter blocks in V0-V3, interleaved ++ // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. ++.irp i, 9,8,7,6,5 ++ _vaesenc_4x RNDKEY_M\i ++ _ghash_step_4x (9 - \i) ++.endr ++.irp i, 4,3,2,1 ++ vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY ++ _vaesenc_4x RNDKEY ++ _ghash_step_4x (9 - \i) ++.endr ++ _ghash_step_4x 9 ++ ++ // Do the last AES round. This handles the XOR with the source data ++ // too, as per the optimization described above. ++ vaesenclast RNDKEYLAST0, V0, GHASHDATA0 ++ vaesenclast RNDKEYLAST1, V1, GHASHDATA1 ++ vaesenclast RNDKEYLAST2, V2, GHASHDATA2 ++ vaesenclast RNDKEYLAST3, V3, GHASHDATA3 ++ ++ // Store the en/decrypted data to DST. 
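
The RNDKEYLAST0-3 trick used above leans on vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), which holds because the last AES round is "some fixed function of the state, then XOR with the round key". A toy Python check, not kernel code: the stand-in fixed function below is arbitrary (the real instruction performs ShiftRows and SubBytes there), but the algebra is the same for any fixed function:

    import secrets

    def toy_aesenclast(round_key, state):
        fixed = bytes(b ^ 0x63 for b in state)        # stand-in for ShiftRows + SubBytes
        return bytes(x ^ k for x, k in zip(fixed, round_key))

    key, a, b = (secrets.token_bytes(16) for _ in range(3))
    lhs = bytes(x ^ y for x, y in zip(toy_aesenclast(key, a), b))
    rhs = toy_aesenclast(bytes(x ^ y for x, y in zip(key, b)), a)
    assert lhs == rhs      # vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a)
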
++ vmovdqu8 GHASHDATA0, 0*VL(DST) ++ vmovdqu8 GHASHDATA1, 1*VL(DST) ++ vmovdqu8 GHASHDATA2, 2*VL(DST) ++ vmovdqu8 GHASHDATA3, 3*VL(DST) ++ ++ add $4*VL, SRC ++ add $4*VL, DST ++ sub $4*VL, DATALEN ++ jge .Lcrypt_loop_4x\@ ++ ++.if \enc ++.Lghash_last_ciphertext_4x\@: ++ // Update GHASH with the last set of ciphertext blocks. ++.irp i, 0,1,2,3,4,5,6,7,8,9 ++ _ghash_step_4x \i ++.endr ++.endif ++ ++.Lcrypt_loop_4x_done\@: ++ ++ // Undo the extra subtraction by 4*VL and check whether data remains. ++ add $4*VL, DATALEN ++ jz .Ldone\@ ++ ++ // The data length isn't a multiple of 4*VL. Process the remaining data ++ // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. ++ // Going one vector at a time may seem inefficient compared to having ++ // separate code paths for each possible number of vectors remaining. ++ // However, using a loop keeps the code size down, and it performs ++ // surprising well; modern CPUs will start executing the next iteration ++ // before the previous one finishes and also predict the number of loop ++ // iterations. For a similar reason, we roll up the AES rounds. ++ // ++ // On the last iteration, the remaining length may be less than VL. ++ // Handle this using masking. ++ // ++ // Since there are enough key powers available for all remaining data, ++ // there is no need to do a GHASH reduction after each iteration. ++ // Instead, multiply each remaining block by its own key power, and only ++ // do a GHASH reduction at the very end. ++ ++ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N ++ // is the number of blocks that remain. ++ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused. ++ mov DATALEN, %eax ++ neg %rax ++ and $~15, %rax // -round_up(DATALEN, 16) ++ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR ++ ++ // Start collecting the unreduced GHASH intermediate value LO, MI, HI. ++ .set LO, GHASHDATA0 ++ .set LO_XMM, GHASHDATA0_XMM ++ .set MI, GHASHDATA1 ++ .set MI_XMM, GHASHDATA1_XMM ++ .set HI, GHASHDATA2 ++ .set HI_XMM, GHASHDATA2_XMM ++ vpxor LO_XMM, LO_XMM, LO_XMM ++ vpxor MI_XMM, MI_XMM, MI_XMM ++ vpxor HI_XMM, HI_XMM, HI_XMM ++ ++.Lcrypt_loop_1x\@: ++ ++ // Select the appropriate mask for this iteration: all 1's if ++ // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the ++ // bzhi instruction from BMI2. (This relies on DATALEN <= 255.) ++.if VL < 64 ++ mov $-1, %eax ++ bzhi DATALEN, %eax, %eax ++ kmovd %eax, %k1 ++.else ++ mov $-1, %rax ++ bzhi DATALEN64, %rax, %rax ++ kmovq %rax, %k1 ++.endif ++ ++ // Encrypt a vector of counter blocks. This does not need to be masked. ++ vpshufb BSWAP_MASK, LE_CTR, V0 ++ vpaddd LE_CTR_INC, LE_CTR, LE_CTR ++ vpxord RNDKEY0, V0, V0 ++ lea 16(KEY), %rax ++1: ++ vbroadcasti32x4 (%rax), RNDKEY ++ vaesenc RNDKEY, V0, V0 ++ add $16, %rax ++ cmp %rax, RNDKEYLAST_PTR ++ jne 1b ++ vaesenclast RNDKEYLAST, V0, V0 ++ ++ // XOR the data with the appropriate number of keystream bytes. ++ vmovdqu8 (SRC), V1{%k1}{z} ++ vpxord V1, V0, V0 ++ vmovdqu8 V0, (DST){%k1} ++ ++ // Update GHASH with the ciphertext block(s), without reducing. ++ // ++ // In the case of DATALEN < VL, the ciphertext is zero-padded to VL. ++ // (If decrypting, it's done by the above masked load. If encrypting, ++ // it's done by the below masked register-to-register move.) 
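
A byte-level model of the masked access used here: bzhi builds a mask of min(DATALEN, VL) one-bits, and the {%k1}{z} load brings in only the bytes selected by that mask, zeroing the rest of the vector. The helper name and data below are made up for the sketch:

    def masked_load(src, datalen, vl):
        n = min(datalen, vl)
        mask = (1 << n) - 1                      # what "bzhi DATALEN, all-ones" builds for n < VL
        src = bytes(src[:vl]).ljust(vl, b'\0')   # only so this toy model cannot over-read
        return bytes(src[i] if (mask >> i) & 1 else 0 for i in range(vl))

    assert masked_load(b'\xaa' * 5, 5, 32) == b'\xaa' * 5 + b'\0' * 27
    assert masked_load(b'\xbb' * 64, 64, 32) == b'\xbb' * 32
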
Note that ++ // if DATALEN <= VL - 16, there will be additional padding beyond the ++ // padding of the last block specified by GHASH itself; i.e., there may ++ // be whole block(s) that get processed by the GHASH multiplication and ++ // reduction instructions but should not actually be included in the ++ // GHASH. However, any such blocks are all-zeroes, and the values that ++ // they're multiplied with are also all-zeroes. Therefore they just add ++ // 0 * 0 = 0 to the final GHASH result, which makes no difference. ++ vmovdqu8 (POWERS_PTR), H_POW1 ++.if \enc ++ vmovdqu8 V0, V1{%k1}{z} ++.endif ++ vpshufb BSWAP_MASK, V1, V0 ++ vpxord GHASH_ACC, V0, V0 ++ _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 ++ vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ ++ add $VL, POWERS_PTR ++ add $VL, SRC ++ add $VL, DST ++ sub $VL, DATALEN ++ jg .Lcrypt_loop_1x\@ ++ ++ // Finally, do the GHASH reduction. ++ _ghash_reduce LO, MI, HI, GFPOLY, V0 ++ _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 ++ ++.Ldone\@: ++ // Store the updated GHASH accumulator back to memory. ++ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) ++ ++ vzeroupper // This is needed after using ymm or zmm registers. ++ RET ++.endm ++ ++// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen); ++// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, ++// const u32 le_ctr[4], ++// const u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen, ++// const u8 tag[16], int taglen); ++// ++// This macro generates one of the above two functions (with \enc selecting ++// which one). Both functions finish computing the GCM authentication tag by ++// updating GHASH with the lengths block and encrypting the GHASH accumulator. ++// |total_aadlen| and |total_datalen| must be the total length of the additional ++// authenticated data and the en/decrypted data in bytes, respectively. ++// ++// The encryption function then stores the full-length (16-byte) computed ++// authentication tag to |ghash_acc|. The decryption function instead loads the ++// expected authentication tag (the one that was transmitted) from the 16-byte ++// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the ++// computed tag in constant time, and returns true if and only if they match. ++.macro _aes_gcm_final enc ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set LE_CTR_PTR, %rsi ++ .set GHASH_ACC_PTR, %rdx ++ .set TOTAL_AADLEN, %rcx ++ .set TOTAL_DATALEN, %r8 ++ .set TAG, %r9 ++ .set TAGLEN, %r10d // Originally at 8(%rsp) ++ ++ // Additional local variables. ++ // %rax, %xmm0-%xmm3, and %k1 are used as temporary registers. ++ .set AESKEYLEN, %r11d ++ .set AESKEYLEN64, %r11 ++ .set GFPOLY, %xmm4 ++ .set BSWAP_MASK, %xmm5 ++ .set LE_CTR, %xmm6 ++ .set GHASH_ACC, %xmm7 ++ .set H_POW1, %xmm8 ++ ++ // Load some constants. ++ vmovdqa .Lgfpoly(%rip), GFPOLY ++ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK ++ ++ // Load the AES key length in bytes. ++ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN ++ ++ // Set up a counter block with 1 in the low 32-bit word. This is the ++ // counter that produces the ciphertext needed to encrypt the auth tag. ++ // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. ++ vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR ++ ++ // Build the lengths block and XOR it with the GHASH accumulator. 
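
A Python illustration of the lengths-block trick the next comment describes: the GCM lengths block is the 64-bit AAD length followed by the 64-bit data length, both in bits and big-endian, and assembling the byte-reflected form directly (data length in the low quadword, AAD length in the high one, both little-endian) gives the same bytes without any extra shuffle:

    import struct

    def lengths_block(aadlen, datalen):              # the textbook big-endian form
        return struct.pack('>QQ', aadlen * 8, datalen * 8)

    def lengths_block_reflected(aadlen, datalen):    # what the vmovq/vpinsrq/vpsllq sequence builds
        return struct.pack('<QQ', datalen * 8, aadlen * 8)

    assert lengths_block(20, 100) == lengths_block_reflected(20, 100)[::-1]
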
++ // Although the lengths block is defined as the AAD length followed by ++ // the en/decrypted data length, both in big-endian byte order, a byte ++ // reflection of the full block is needed because of the way we compute ++ // GHASH (see _ghash_mul_step). By using little-endian values in the ++ // opposite order, we avoid having to reflect any bytes here. ++ vmovq TOTAL_DATALEN, %xmm0 ++ vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 ++ vpsllq $3, %xmm0, %xmm0 // Bytes to bits ++ vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC ++ ++ // Load the first hash key power (H^1), which is stored last. ++ vmovdqu8 OFFSETOFEND_H_POWERS-16(KEY), H_POW1 ++ ++.if !\enc ++ // Prepare a mask of TAGLEN one bits. ++ movl 8(%rsp), TAGLEN ++ mov $-1, %eax ++ bzhi TAGLEN, %eax, %eax ++ kmovd %eax, %k1 ++.endif ++ ++ // Make %rax point to the last AES round key for the chosen AES variant. ++ lea 6*16(KEY,AESKEYLEN64,4), %rax ++ ++ // Start the AES encryption of the counter block by swapping the counter ++ // block to big-endian and XOR-ing it with the zero-th AES round key. ++ vpshufb BSWAP_MASK, LE_CTR, %xmm0 ++ vpxor (KEY), %xmm0, %xmm0 ++ ++ // Complete the AES encryption and multiply GHASH_ACC by H^1. ++ // Interleave the AES and GHASH instructions to improve performance. ++ cmp $24, AESKEYLEN ++ jl 128f // AES-128? ++ je 192f // AES-192? ++ // AES-256 ++ vaesenc -13*16(%rax), %xmm0, %xmm0 ++ vaesenc -12*16(%rax), %xmm0, %xmm0 ++192: ++ vaesenc -11*16(%rax), %xmm0, %xmm0 ++ vaesenc -10*16(%rax), %xmm0, %xmm0 ++128: ++.irp i, 0,1,2,3,4,5,6,7,8 ++ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ %xmm1, %xmm2, %xmm3 ++ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0 ++.endr ++ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ %xmm1, %xmm2, %xmm3 ++ ++ // Undo the byte reflection of the GHASH accumulator. ++ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC ++ ++ // Do the last AES round and XOR the resulting keystream block with the ++ // GHASH accumulator to produce the full computed authentication tag. ++ // ++ // Reduce latency by taking advantage of the property vaesenclast(key, ++ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last ++ // round key, instead of XOR'ing the final AES output with GHASH_ACC. ++ // ++ // enc_final then returns the computed auth tag, while dec_final ++ // compares it with the transmitted one and returns a bool. To compare ++ // the tags, dec_final XORs them together and uses vptest to check ++ // whether the result is all-zeroes. This should be constant-time. ++ // dec_final applies the vaesenclast optimization to this additional ++ // value XOR'd too, using vpternlogd to XOR the last round key, GHASH ++ // accumulator, and transmitted auth tag together in one instruction. ++.if \enc ++ vpxor (%rax), GHASH_ACC, %xmm1 ++ vaesenclast %xmm1, %xmm0, GHASH_ACC ++ vmovdqu GHASH_ACC, (GHASH_ACC_PTR) ++.else ++ vmovdqu (TAG), %xmm1 ++ vpternlogd $0x96, (%rax), GHASH_ACC, %xmm1 ++ vaesenclast %xmm1, %xmm0, %xmm0 ++ xor %eax, %eax ++ vmovdqu8 %xmm0, %xmm0{%k1}{z} // Truncate to TAGLEN bytes ++ vptest %xmm0, %xmm0 ++ sete %al ++.endif ++ // No need for vzeroupper here, since only used xmm registers were used. 
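
A structural model of the dec_final tail above: XOR the computed and transmitted tags, truncate to TAGLEN bytes, and accept only if nothing nonzero remains (the vptest + sete pair). The Python below only mirrors that data flow; it is a sketch, not itself constant-time:

    def tags_match(computed, transmitted, taglen):
        diff = bytes(a ^ b for a, b in zip(computed, transmitted))[:taglen]
        acc = 0
        for byte in diff:
            acc |= byte          # accumulate every difference, examine the result once at the end
        return acc == 0

    assert tags_match(b'\x11' * 16, b'\x11' * 16, 12)
    assert not tags_match(b'\x11' * 16, b'\x11' * 15 + b'\x12', 16)
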
++ RET ++.endm ++ ++_set_veclen 32 ++SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256) ++ _aes_gcm_precompute ++SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256) ++SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256) ++ _aes_gcm_update 1 ++SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256) ++SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256) ++ _aes_gcm_update 0 ++SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256) ++ ++_set_veclen 64 ++SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512) ++ _aes_gcm_precompute ++SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512) ++SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512) ++ _aes_gcm_update 1 ++SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512) ++SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512) ++ _aes_gcm_update 0 ++SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512) ++ ++// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, ++// u8 ghash_acc[16], ++// const u8 *aad, int aadlen); ++// ++// This function processes the AAD (Additional Authenticated Data) in GCM. ++// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the ++// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been ++// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| ++// must be a multiple of 16, except on the last call where it can be any length. ++// The caller must do any buffering needed to ensure this. ++// ++// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. ++// Therefore, for AAD processing we currently only provide this implementation ++// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This ++// keeps the code size down, and it enables some micro-optimizations, e.g. using ++// VEX-coded instructions instead of EVEX-coded to save some instruction bytes. ++// To optimize for large amounts of AAD, we could implement a 4x-wide loop and ++// provide a version using 512-bit vectors, but that doesn't seem to be useful. ++SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set GHASH_ACC_PTR, %rsi ++ .set AAD, %rdx ++ .set AADLEN, %ecx ++ .set AADLEN64, %rcx // Zero-extend AADLEN before using! ++ ++ // Additional local variables. ++ // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. ++ .set BSWAP_MASK, %ymm4 ++ .set GFPOLY, %ymm5 ++ .set GHASH_ACC, %ymm6 ++ .set GHASH_ACC_XMM, %xmm6 ++ .set H_POW1, %ymm7 ++ ++ // Load some constants. ++ vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK ++ vbroadcasti128 .Lgfpoly(%rip), GFPOLY ++ ++ // Load the GHASH accumulator. ++ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM ++ ++ // Update GHASH with 32 bytes of AAD at a time. ++ // ++ // Pre-subtracting 32 from AADLEN saves an instruction from the loop and ++ // also ensures that at least one write always occurs to AADLEN, ++ // zero-extending it and allowing AADLEN64 to be used later. ++ sub $32, AADLEN ++ jl .Laad_loop_1x_done ++ vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] ++.Laad_loop_1x: ++ vmovdqu (AAD), %ymm0 ++ vpshufb BSWAP_MASK, %ymm0, %ymm0 ++ vpxor %ymm0, GHASH_ACC, GHASH_ACC ++ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ %ymm0, %ymm1, %ymm2 ++ vextracti128 $1, GHASH_ACC, %xmm0 ++ vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM ++ add $32, AAD ++ sub $32, AADLEN ++ jge .Laad_loop_1x ++.Laad_loop_1x_done: ++ add $32, AADLEN ++ jz .Laad_done ++ ++ // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. 
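
The neg/and pair used just below (and earlier for the data tail) computes minus the length rounded up to a whole 16-byte block, which then indexes backwards from the end of the key-power array so the remaining blocks line up with H^ceil(len/16) ... H^1. A quick check of that identity:

    def neg_round_up16(n):
        return -n & ~15                  # neg; and $~15

    for n in range(1, 256):
        assert neg_round_up16(n) == -16 * ((n + 15) // 16)
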
++ mov $-1, %eax ++ bzhi AADLEN, %eax, %eax ++ kmovd %eax, %k1 ++ vmovdqu8 (AAD), %ymm0{%k1}{z} ++ neg AADLEN64 ++ and $~15, AADLEN64 // -round_up(AADLEN, 16) ++ vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 ++ vpshufb BSWAP_MASK, %ymm0, %ymm0 ++ vpxor %ymm0, GHASH_ACC, GHASH_ACC ++ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ %ymm0, %ymm1, %ymm2 ++ vextracti128 $1, GHASH_ACC, %xmm0 ++ vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM ++ ++.Laad_done: ++ // Store the updated GHASH accumulator back to memory. ++ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) ++ ++ vzeroupper // This is needed after using ymm or zmm registers. ++ RET ++SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) ++ ++SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) ++ _aes_gcm_final 1 ++SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) ++SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) ++ _aes_gcm_final 0 ++SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) +diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S +index 39066b57a70e..eb153eff9331 100644 +--- a/arch/x86/crypto/aesni-intel_asm.S ++++ b/arch/x86/crypto/aesni-intel_asm.S +@@ -10,16 +10,7 @@ + * Vinodh Gopal + * Kahraman Akdemir + * +- * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD +- * interface for 64-bit kernels. +- * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com) +- * Aidan O'Mahony (aidan.o.mahony@intel.com) +- * Adrian Hoban +- * James Guilford (james.guilford@intel.com) +- * Gabriele Paoloni +- * Tadeusz Struk (tadeusz.struk@intel.com) +- * Wajdi Feghali (wajdi.k.feghali@intel.com) +- * Copyright (c) 2010, Intel Corporation. ++ * Copyright (c) 2010, Intel Corporation. + * + * Ported x86_64 version to x86: + * Author: Mathias Krause +@@ -27,95 +18,6 @@ --anon_min_ratio --============== + #include + #include +-#include - --This knob provides *hard* protection of anonymous pages. The anonymous pages --on the current node won't be reclaimed under any conditions when their amount --is below vm.anon_min_ratio. +-/* +- * The following macros are used to move an (un)aligned 16 byte value to/from +- * an XMM register. This can done for either FP or integer values, for FP use +- * movaps (move aligned packed single) or integer use movdqa (move double quad +- * aligned). It doesn't make a performance difference which instruction is used +- * since Nehalem (original Core i7) was released. However, the movaps is a byte +- * shorter, so that is the one we'll use for now. (same for unaligned). +- */ +-#define MOVADQ movaps +-#define MOVUDQ movups - --This knob may be used to prevent excessive swap thrashing when anonymous --memory is low (for example, when memory is going to be overfilled by --compressed data of zram module). +-#ifdef __x86_64__ - --Setting this value too high (close to 100) can result in inability to --swap and can lead to early OOM under memory pressure. +-# constants in mergeable sections, linker can reorder and merge +-.section .rodata.cst16.POLY, "aM", @progbits, 16 +-.align 16 +-POLY: .octa 0xC2000000000000000000000000000001 +-.section .rodata.cst16.TWOONE, "aM", @progbits, 16 +-.align 16 +-TWOONE: .octa 0x00000001000000000000000000000001 - --The unit of measurement is the percentage of the total memory of the node. 
+-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 +-.align 16 +-SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +-.section .rodata.cst16.MASK1, "aM", @progbits, 16 +-.align 16 +-MASK1: .octa 0x0000000000000000ffffffffffffffff +-.section .rodata.cst16.MASK2, "aM", @progbits, 16 +-.align 16 +-MASK2: .octa 0xffffffffffffffff0000000000000000 +-.section .rodata.cst16.ONE, "aM", @progbits, 16 +-.align 16 +-ONE: .octa 0x00000000000000000000000000000001 +-.section .rodata.cst16.F_MIN_MASK, "aM", @progbits, 16 +-.align 16 +-F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0 +-.section .rodata.cst16.dec, "aM", @progbits, 16 +-.align 16 +-dec: .octa 0x1 +-.section .rodata.cst16.enc, "aM", @progbits, 16 +-.align 16 +-enc: .octa 0x2 - --The default value is 15. +-# order of these constants should not change. +-# more specifically, ALL_F should follow SHIFT_MASK, +-# and zero should follow ALL_F +-.section .rodata, "a", @progbits +-.align 16 +-SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +-ALL_F: .octa 0xffffffffffffffffffffffffffffffff +- .octa 0x00000000000000000000000000000000 - +-.text - --clean_low_ratio --================ +-#define AadHash 16*0 +-#define AadLen 16*1 +-#define InLen (16*1)+8 +-#define PBlockEncKey 16*2 +-#define OrigIV 16*3 +-#define CurCount 16*4 +-#define PBlockLen 16*5 +-#define HashKey 16*6 // store HashKey <<1 mod poly here +-#define HashKey_2 16*7 // store HashKey^2 <<1 mod poly here +-#define HashKey_3 16*8 // store HashKey^3 <<1 mod poly here +-#define HashKey_4 16*9 // store HashKey^4 <<1 mod poly here +-#define HashKey_k 16*10 // store XOR of High 64 bits and Low 64 +- // bits of HashKey <<1 mod poly here +- //(for Karatsuba purposes) +-#define HashKey_2_k 16*11 // store XOR of High 64 bits and Low 64 +- // bits of HashKey^2 <<1 mod poly here +- // (for Karatsuba purposes) +-#define HashKey_3_k 16*12 // store XOR of High 64 bits and Low 64 +- // bits of HashKey^3 <<1 mod poly here +- // (for Karatsuba purposes) +-#define HashKey_4_k 16*13 // store XOR of High 64 bits and Low 64 +- // bits of HashKey^4 <<1 mod poly here +- // (for Karatsuba purposes) - --This knob provides *best-effort* protection of clean file pages. The file pages --on the current node won't be reclaimed under memory pressure when the amount of --clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM. -- --Protection of clean file pages using this knob may be used when swapping is --still possible to -- - prevent disk I/O thrashing under memory pressure; -- - improve performance in disk cache-bound tasks under memory pressure. -- --Setting it to a high value may result in a early eviction of anonymous pages --into the swap space by attempting to hold the protected amount of clean file --pages in memory. -- --The unit of measurement is the percentage of the total memory of the node. -- --The default value is 0. -- -- --clean_min_ratio --================ -- --This knob provides *hard* protection of clean file pages. The file pages on the --current node won't be reclaimed under memory pressure when the amount of clean --file pages is below vm.clean_min_ratio. -- --Hard protection of clean file pages using this knob may be used to -- - prevent disk I/O thrashing under memory pressure even with no free swap space; -- - improve performance in disk cache-bound tasks under memory pressure; -- - avoid high latency and prevent livelock in near-OOM conditions. 
-- --Setting it to a high value may result in a early out-of-memory condition due to --the inability to reclaim the protected amount of clean file pages when other --types of pages cannot be reclaimed. -- --The unit of measurement is the percentage of the total memory of the node. -- --The default value is 15. -- -- - compact_memory - ============== - -@@ -974,14 +910,6 @@ be 133 (x + 2x = 200, 2x = 133.33). - At 0, the kernel will not initiate swap until the amount of free and - file-backed pages is less than the high watermark in a zone. - --This knob has no effect if the amount of clean file pages on the current --node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case, --only anonymous pages can be reclaimed. -- --If the number of anonymous pages on the current node is below --vm.anon_min_ratio, then only file pages can be reclaimed with --any vm.swappiness value. +-#define arg1 rdi +-#define arg2 rsi +-#define arg3 rdx +-#define arg4 rcx +-#define arg5 r8 +-#define arg6 r9 +-#define keysize 2*15*16(%arg1) +-#endif - - unprivileged_userfaultfd - ======================== -diff --git a/include/linux/mm.h b/include/linux/mm.h -index 397ad6f1ac39..c9fb00c56844 100644 ---- a/include/linux/mm.h -+++ b/include/linux/mm.h -@@ -195,14 +195,6 @@ static inline void __mm_zero_struct_page(struct page *page) - - extern int sysctl_max_map_count; - --extern bool sysctl_workingset_protection; --extern u8 sysctl_anon_min_ratio; --extern u8 sysctl_clean_low_ratio; --extern u8 sysctl_clean_min_ratio; --int vm_workingset_protection_update_handler( -- struct ctl_table *table, int write, -- void __user *buffer, size_t *lenp, loff_t *ppos); -- - extern unsigned long sysctl_user_reserve_kbytes; - extern unsigned long sysctl_admin_reserve_kbytes; - -diff --git a/kernel/sysctl.c b/kernel/sysctl.c -index aa55ebba2ec3..c92d8a4b23fb 100644 ---- a/kernel/sysctl.c -+++ b/kernel/sysctl.c -@@ -2216,40 +2216,6 @@ static struct ctl_table vm_table[] = { - .extra1 = SYSCTL_ZERO, - }, + #define STATE1 %xmm0 + #define STATE2 %xmm4 +@@ -162,1409 +64,6 @@ ALL_F: .octa 0xffffffffffffffffffffffffffffffff + #define TKEYP T1 #endif -- { -- .procname = "workingset_protection", -- .data = &sysctl_workingset_protection, -- .maxlen = sizeof(bool), -- .mode = 0644, -- .proc_handler = &proc_dobool, -- }, -- { -- .procname = "anon_min_ratio", -- .data = &sysctl_anon_min_ratio, -- .maxlen = sizeof(u8), -- .mode = 0644, -- .proc_handler = &vm_workingset_protection_update_handler, -- .extra1 = SYSCTL_ZERO, -- .extra2 = SYSCTL_ONE_HUNDRED, -- }, -- { -- .procname = "clean_low_ratio", -- .data = &sysctl_clean_low_ratio, -- .maxlen = sizeof(u8), -- .mode = 0644, -- .proc_handler = &vm_workingset_protection_update_handler, -- .extra1 = SYSCTL_ZERO, -- .extra2 = SYSCTL_ONE_HUNDRED, -- }, -- { -- .procname = "clean_min_ratio", -- .data = &sysctl_clean_min_ratio, -- .maxlen = sizeof(u8), -- .mode = 0644, -- .proc_handler = &vm_workingset_protection_update_handler, -- .extra1 = SYSCTL_ZERO, -- .extra2 = SYSCTL_ONE_HUNDRED, -- }, - { - .procname = "user_reserve_kbytes", - .data = &sysctl_user_reserve_kbytes, -diff --git a/mm/Kconfig b/mm/Kconfig -index 002f48b655de..0e440573033c 100644 ---- a/mm/Kconfig -+++ b/mm/Kconfig -@@ -486,69 +486,6 @@ config ARCH_WANT_OPTIMIZE_DAX_VMEMMAP - config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP - bool --config ANON_MIN_RATIO -- int "Default value for vm.anon_min_ratio" -- depends on SYSCTL -- range 0 100 -- default 15 -- help -- This option sets the default value for vm.anon_min_ratio sysctl knob. 
+-.macro FUNC_SAVE +- push %r12 +- push %r13 +- push %r14 +-# +-# states of %xmm registers %xmm6:%xmm15 not saved +-# all %xmm registers are clobbered +-# +-.endm - -- The vm.anon_min_ratio sysctl knob provides *hard* protection of -- anonymous pages. The anonymous pages on the current node won't be -- reclaimed under any conditions when their amount is below -- vm.anon_min_ratio. This knob may be used to prevent excessive swap -- thrashing when anonymous memory is low (for example, when memory is -- going to be overfilled by compressed data of zram module). - -- Setting this value too high (close to MemTotal) can result in -- inability to swap and can lead to early OOM under memory pressure. +-.macro FUNC_RESTORE +- pop %r14 +- pop %r13 +- pop %r12 +-.endm - --config CLEAN_LOW_RATIO -- int "Default value for vm.clean_low_ratio" -- depends on SYSCTL -- range 0 100 -- default 0 -- help -- This option sets the default value for vm.clean_low_ratio sysctl knob. +-# Precompute hashkeys. +-# Input: Hash subkey. +-# Output: HashKeys stored in gcm_context_data. Only needs to be called +-# once per key. +-# clobbers r12, and tmp xmm registers. +-.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7 +- mov \SUBKEY, %r12 +- movdqu (%r12), \TMP3 +- movdqa SHUF_MASK(%rip), \TMP2 +- pshufb \TMP2, \TMP3 - -- The vm.clean_low_ratio sysctl knob provides *best-effort* -- protection of clean file pages. The file pages on the current node -- won't be reclaimed under memory pressure when the amount of clean file -- pages is below vm.clean_low_ratio *unless* we threaten to OOM. -- Protection of clean file pages using this knob may be used when -- swapping is still possible to -- - prevent disk I/O thrashing under memory pressure; -- - improve performance in disk cache-bound tasks under memory -- pressure. +- # precompute HashKey<<1 mod poly from the HashKey (required for GHASH) - -- Setting it to a high value may result in a early eviction of anonymous -- pages into the swap space by attempting to hold the protected amount -- of clean file pages in memory. +- movdqa \TMP3, \TMP2 +- psllq $1, \TMP3 +- psrlq $63, \TMP2 +- movdqa \TMP2, \TMP1 +- pslldq $8, \TMP2 +- psrldq $8, \TMP1 +- por \TMP2, \TMP3 - --config CLEAN_MIN_RATIO -- int "Default value for vm.clean_min_ratio" -- depends on SYSCTL -- range 0 100 -- default 15 -- help -- This option sets the default value for vm.clean_min_ratio sysctl knob. +- # reduce HashKey<<1 - -- The vm.clean_min_ratio sysctl knob provides *hard* protection of -- clean file pages. The file pages on the current node won't be -- reclaimed under memory pressure when the amount of clean file pages is -- below vm.clean_min_ratio. Hard protection of clean file pages using -- this knob may be used to -- - prevent disk I/O thrashing under memory pressure even with no free -- swap space; -- - improve performance in disk cache-bound tasks under memory -- pressure; -- - avoid high latency and prevent livelock in near-OOM conditions. +- pshufd $0x24, \TMP1, \TMP2 +- pcmpeqd TWOONE(%rip), \TMP2 +- pand POLY(%rip), \TMP2 +- pxor \TMP2, \TMP3 +- movdqu \TMP3, HashKey(%arg2) - -- Setting it to a high value may result in a early out-of-memory condition -- due to the inability to reclaim the protected amount of clean file pages -- when other types of pages cannot be reclaimed. 
+- movdqa \TMP3, \TMP5 +- pshufd $78, \TMP3, \TMP1 +- pxor \TMP3, \TMP1 +- movdqu \TMP1, HashKey_k(%arg2) - - config HAVE_MEMBLOCK_PHYS_MAP - bool - -diff --git a/mm/mm_init.c b/mm/mm_init.c -index 419ba5ac7c52..2c19f5515e36 100644 ---- a/mm/mm_init.c -+++ b/mm/mm_init.c -@@ -2749,7 +2749,6 @@ static void __init mem_init_print_info(void) - , K(totalhigh_pages()) - #endif - ); -- printk(KERN_INFO "le9 Unofficial (le9uo) working set protection 1.4 by Masahito Suzuki (forked from hakavlad's original le9 patch)"); - } - +- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +-# TMP5 = HashKey^2<<1 (mod poly) +- movdqu \TMP5, HashKey_2(%arg2) +-# HashKey_2 = HashKey^2<<1 (mod poly) +- pshufd $78, \TMP5, \TMP1 +- pxor \TMP5, \TMP1 +- movdqu \TMP1, HashKey_2_k(%arg2) +- +- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +-# TMP5 = HashKey^3<<1 (mod poly) +- movdqu \TMP5, HashKey_3(%arg2) +- pshufd $78, \TMP5, \TMP1 +- pxor \TMP5, \TMP1 +- movdqu \TMP1, HashKey_3_k(%arg2) +- +- GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7 +-# TMP5 = HashKey^3<<1 (mod poly) +- movdqu \TMP5, HashKey_4(%arg2) +- pshufd $78, \TMP5, \TMP1 +- pxor \TMP5, \TMP1 +- movdqu \TMP1, HashKey_4_k(%arg2) +-.endm +- +-# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding. +-# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13 +-.macro GCM_INIT Iv SUBKEY AAD AADLEN +- mov \AADLEN, %r11 +- mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length +- xor %r11d, %r11d +- mov %r11, InLen(%arg2) # ctx_data.in_length = 0 +- mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0 +- mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0 +- mov \Iv, %rax +- movdqu (%rax), %xmm0 +- movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv +- +- movdqa SHUF_MASK(%rip), %xmm2 +- pshufb %xmm2, %xmm0 +- movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv +- +- PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7 +- movdqu HashKey(%arg2), %xmm13 +- +- CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \ +- %xmm4, %xmm5, %xmm6 +-.endm +- +-# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context +-# struct has been initialized by GCM_INIT. 
+-# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK +-# Clobbers rax, r10-r13, and xmm0-xmm15 +-.macro GCM_ENC_DEC operation +- movdqu AadHash(%arg2), %xmm8 +- movdqu HashKey(%arg2), %xmm13 +- add %arg5, InLen(%arg2) +- +- xor %r11d, %r11d # initialise the data pointer offset as zero +- PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation +- +- sub %r11, %arg5 # sub partial block data used +- mov %arg5, %r13 # save the number of bytes +- +- and $-16, %r13 # %r13 = %r13 - (%r13 mod 16) +- mov %r13, %r12 +- # Encrypt/Decrypt first few blocks +- +- and $(3<<4), %r12 +- jz .L_initial_num_blocks_is_0_\@ +- cmp $(2<<4), %r12 +- jb .L_initial_num_blocks_is_1_\@ +- je .L_initial_num_blocks_is_2_\@ +-.L_initial_num_blocks_is_3_\@: +- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation +- sub $48, %r13 +- jmp .L_initial_blocks_\@ +-.L_initial_num_blocks_is_2_\@: +- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation +- sub $32, %r13 +- jmp .L_initial_blocks_\@ +-.L_initial_num_blocks_is_1_\@: +- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation +- sub $16, %r13 +- jmp .L_initial_blocks_\@ +-.L_initial_num_blocks_is_0_\@: +- INITIAL_BLOCKS_ENC_DEC %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \ +-%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation +-.L_initial_blocks_\@: +- +- # Main loop - Encrypt/Decrypt remaining blocks +- +- test %r13, %r13 +- je .L_zero_cipher_left_\@ +- sub $64, %r13 +- je .L_four_cipher_left_\@ +-.L_crypt_by_4_\@: +- GHASH_4_ENCRYPT_4_PARALLEL_\operation %xmm9, %xmm10, %xmm11, %xmm12, \ +- %xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \ +- %xmm7, %xmm8, enc +- add $64, %r11 +- sub $64, %r13 +- jne .L_crypt_by_4_\@ +-.L_four_cipher_left_\@: +- GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \ +-%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8 +-.L_zero_cipher_left_\@: +- movdqu %xmm8, AadHash(%arg2) +- movdqu %xmm0, CurCount(%arg2) +- +- mov %arg5, %r13 +- and $15, %r13 # %r13 = arg5 (mod 16) +- je .L_multiple_of_16_bytes_\@ +- +- mov %r13, PBlockLen(%arg2) +- +- # Handle the last <16 Byte block separately +- paddd ONE(%rip), %xmm0 # INCR CNT to get Yn +- movdqu %xmm0, CurCount(%arg2) +- movdqa SHUF_MASK(%rip), %xmm10 +- pshufb %xmm10, %xmm0 +- +- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn) +- movdqu %xmm0, PBlockEncKey(%arg2) +- +- cmp $16, %arg5 +- jge .L_large_enough_update_\@ +- +- lea (%arg4,%r11,1), %r10 +- mov %r13, %r12 +- READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1 +- jmp .L_data_read_\@ +- +-.L_large_enough_update_\@: +- sub $16, %r11 +- add %r13, %r11 +- +- # receive the last <16 Byte block +- movdqu (%arg4, %r11, 1), %xmm1 +- +- sub %r13, %r11 +- add $16, %r11 +- +- lea SHIFT_MASK+16(%rip), %r12 +- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes +- # (r13 is the number of bytes in plaintext mod 16) +- sub %r13, %r12 +- # get the appropriate shuffle mask +- movdqu (%r12), %xmm2 +- # shift right 16-r13 bytes +- pshufb %xmm2, %xmm1 +- +-.L_data_read_\@: +- lea ALL_F+16(%rip), %r12 +- sub %r13, %r12 +- +-.ifc \operation, dec +- movdqa %xmm1, %xmm2 +-.endif +- pxor %xmm1, %xmm0 # XOR Encrypt(K, Yn) +- movdqu (%r12), %xmm1 +- # get the appropriate mask to mask out top 16-r13 bytes of xmm0 +- pand %xmm1, %xmm0 # mask out 
top 16-r13 bytes of xmm0 +-.ifc \operation, dec +- pand %xmm1, %xmm2 +- movdqa SHUF_MASK(%rip), %xmm10 +- pshufb %xmm10 ,%xmm2 +- +- pxor %xmm2, %xmm8 +-.else +- movdqa SHUF_MASK(%rip), %xmm10 +- pshufb %xmm10,%xmm0 +- +- pxor %xmm0, %xmm8 +-.endif +- +- movdqu %xmm8, AadHash(%arg2) +-.ifc \operation, enc +- # GHASH computation for the last <16 byte block +- movdqa SHUF_MASK(%rip), %xmm10 +- # shuffle xmm0 back to output as ciphertext +- pshufb %xmm10, %xmm0 +-.endif +- +- # Output %r13 bytes +- movq %xmm0, %rax +- cmp $8, %r13 +- jle .L_less_than_8_bytes_left_\@ +- mov %rax, (%arg3 , %r11, 1) +- add $8, %r11 +- psrldq $8, %xmm0 +- movq %xmm0, %rax +- sub $8, %r13 +-.L_less_than_8_bytes_left_\@: +- mov %al, (%arg3, %r11, 1) +- add $1, %r11 +- shr $8, %rax +- sub $1, %r13 +- jne .L_less_than_8_bytes_left_\@ +-.L_multiple_of_16_bytes_\@: +-.endm +- +-# GCM_COMPLETE Finishes update of tag of last partial block +-# Output: Authorization Tag (AUTH_TAG) +-# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 +-.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN +- movdqu AadHash(%arg2), %xmm8 +- movdqu HashKey(%arg2), %xmm13 +- +- mov PBlockLen(%arg2), %r12 +- +- test %r12, %r12 +- je .L_partial_done\@ +- +- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 +- +-.L_partial_done\@: +- mov AadLen(%arg2), %r12 # %r13 = aadLen (number of bytes) +- shl $3, %r12 # convert into number of bits +- movd %r12d, %xmm15 # len(A) in %xmm15 +- mov InLen(%arg2), %r12 +- shl $3, %r12 # len(C) in bits (*128) +- movq %r12, %xmm1 +- +- pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000 +- pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C) +- pxor %xmm15, %xmm8 +- GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6 +- # final GHASH computation +- movdqa SHUF_MASK(%rip), %xmm10 +- pshufb %xmm10, %xmm8 +- +- movdqu OrigIV(%arg2), %xmm0 # %xmm0 = Y0 +- ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0) +- pxor %xmm8, %xmm0 +-.L_return_T_\@: +- mov \AUTHTAG, %r10 # %r10 = authTag +- mov \AUTHTAGLEN, %r11 # %r11 = auth_tag_len +- cmp $16, %r11 +- je .L_T_16_\@ +- cmp $8, %r11 +- jl .L_T_4_\@ +-.L_T_8_\@: +- movq %xmm0, %rax +- mov %rax, (%r10) +- add $8, %r10 +- sub $8, %r11 +- psrldq $8, %xmm0 +- test %r11, %r11 +- je .L_return_T_done_\@ +-.L_T_4_\@: +- movd %xmm0, %eax +- mov %eax, (%r10) +- add $4, %r10 +- sub $4, %r11 +- psrldq $4, %xmm0 +- test %r11, %r11 +- je .L_return_T_done_\@ +-.L_T_123_\@: +- movd %xmm0, %eax +- cmp $2, %r11 +- jl .L_T_1_\@ +- mov %ax, (%r10) +- cmp $2, %r11 +- je .L_return_T_done_\@ +- add $2, %r10 +- sar $16, %eax +-.L_T_1_\@: +- mov %al, (%r10) +- jmp .L_return_T_done_\@ +-.L_T_16_\@: +- movdqu %xmm0, (%r10) +-.L_return_T_done_\@: +-.endm +- +-#ifdef __x86_64__ +-/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +-* +-* +-* Input: A and B (128-bits each, bit-reflected) +-* Output: C = A*B*x mod poly, (i.e. >>1 ) +-* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +-* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
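
The GHASH_MUL macro being deleted here (together with the HashKey_*_k tables that fed it) used Karatsuba to perform a 128x128 carry-less multiply with three pclmulqdq instead of four. A Python sketch of that identity over GF(2)[x], not derived from the kernel code:

    import secrets

    def clmul64(a, b):                    # one pclmulqdq: 64x64 carry-less multiply
        r = 0
        for i in range(64):
            if (b >> i) & 1:
                r ^= a << i
        return r

    def clmul128_schoolbook(a, b):        # four multiplies
        a0, a1 = a & (2**64 - 1), a >> 64
        b0, b1 = b & (2**64 - 1), b >> 64
        mid = clmul64(a0, b1) ^ clmul64(a1, b0)
        return (clmul64(a1, b1) << 128) ^ (mid << 64) ^ clmul64(a0, b0)

    def clmul128_karatsuba(a, b):         # three multiplies, as in the deleted macro
        a0, a1 = a & (2**64 - 1), a >> 64
        b0, b1 = b & (2**64 - 1), b >> 64
        lo, hi = clmul64(a0, b0), clmul64(a1, b1)
        mid = clmul64(a0 ^ a1, b0 ^ b1) ^ lo ^ hi     # (a0+a1)(b0+b1) + lo + hi
        return (hi << 128) ^ (mid << 64) ^ lo

    x, y = secrets.randbits(128), secrets.randbits(128)
    assert clmul128_karatsuba(x, y) == clmul128_schoolbook(x, y)
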
+-* +-*/ +-.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5 +- movdqa \GH, \TMP1 +- pshufd $78, \GH, \TMP2 +- pshufd $78, \HK, \TMP3 +- pxor \GH, \TMP2 # TMP2 = a1+a0 +- pxor \HK, \TMP3 # TMP3 = b1+b0 +- pclmulqdq $0x11, \HK, \TMP1 # TMP1 = a1*b1 +- pclmulqdq $0x00, \HK, \GH # GH = a0*b0 +- pclmulqdq $0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0) +- pxor \GH, \TMP2 +- pxor \TMP1, \TMP2 # TMP2 = (a0*b0)+(a1*b0) +- movdqa \TMP2, \TMP3 +- pslldq $8, \TMP3 # left shift TMP3 2 DWs +- psrldq $8, \TMP2 # right shift TMP2 2 DWs +- pxor \TMP3, \GH +- pxor \TMP2, \TMP1 # TMP2:GH holds the result of GH*HK +- +- # first phase of the reduction +- +- movdqa \GH, \TMP2 +- movdqa \GH, \TMP3 +- movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4 +- # in in order to perform +- # independent shifts +- pslld $31, \TMP2 # packed right shift <<31 +- pslld $30, \TMP3 # packed right shift <<30 +- pslld $25, \TMP4 # packed right shift <<25 +- pxor \TMP3, \TMP2 # xor the shifted versions +- pxor \TMP4, \TMP2 +- movdqa \TMP2, \TMP5 +- psrldq $4, \TMP5 # right shift TMP5 1 DW +- pslldq $12, \TMP2 # left shift TMP2 3 DWs +- pxor \TMP2, \GH +- +- # second phase of the reduction +- +- movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4 +- # in in order to perform +- # independent shifts +- movdqa \GH,\TMP3 +- movdqa \GH,\TMP4 +- psrld $1,\TMP2 # packed left shift >>1 +- psrld $2,\TMP3 # packed left shift >>2 +- psrld $7,\TMP4 # packed left shift >>7 +- pxor \TMP3,\TMP2 # xor the shifted versions +- pxor \TMP4,\TMP2 +- pxor \TMP5, \TMP2 +- pxor \TMP2, \GH +- pxor \TMP1, \GH # result is in TMP1 +-.endm +- +-# Reads DLEN bytes starting at DPTR and stores in XMMDst +-# where 0 < DLEN < 16 +-# Clobbers %rax, DLEN and XMM1 +-.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst +- cmp $8, \DLEN +- jl .L_read_lt8_\@ +- mov (\DPTR), %rax +- movq %rax, \XMMDst +- sub $8, \DLEN +- jz .L_done_read_partial_block_\@ +- xor %eax, %eax +-.L_read_next_byte_\@: +- shl $8, %rax +- mov 7(\DPTR, \DLEN, 1), %al +- dec \DLEN +- jnz .L_read_next_byte_\@ +- movq %rax, \XMM1 +- pslldq $8, \XMM1 +- por \XMM1, \XMMDst +- jmp .L_done_read_partial_block_\@ +-.L_read_lt8_\@: +- xor %eax, %eax +-.L_read_next_byte_lt8_\@: +- shl $8, %rax +- mov -1(\DPTR, \DLEN, 1), %al +- dec \DLEN +- jnz .L_read_next_byte_lt8_\@ +- movq %rax, \XMMDst +-.L_done_read_partial_block_\@: +-.endm +- +-# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +-# clobbers r10-11, xmm14 +-.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \ +- TMP6 TMP7 +- MOVADQ SHUF_MASK(%rip), %xmm14 +- mov \AAD, %r10 # %r10 = AAD +- mov \AADLEN, %r11 # %r11 = aadLen +- pxor \TMP7, \TMP7 +- pxor \TMP6, \TMP6 +- +- cmp $16, %r11 +- jl .L_get_AAD_rest\@ +-.L_get_AAD_blocks\@: +- movdqu (%r10), \TMP7 +- pshufb %xmm14, \TMP7 # byte-reflect the AAD data +- pxor \TMP7, \TMP6 +- GHASH_MUL \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 +- add $16, %r10 +- sub $16, %r11 +- cmp $16, %r11 +- jge .L_get_AAD_blocks\@ +- +- movdqu \TMP6, \TMP7 +- +- /* read the last <16B of AAD */ +-.L_get_AAD_rest\@: +- test %r11, %r11 +- je .L_get_AAD_done\@ +- +- READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7 +- pshufb %xmm14, \TMP7 # byte-reflect the AAD data +- pxor \TMP6, \TMP7 +- GHASH_MUL \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5 +- movdqu \TMP7, \TMP6 +- +-.L_get_AAD_done\@: +- movdqu \TMP6, AadHash(%arg2) +-.endm +- +-# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks +-# between update calls. 
+-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK +-# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context +-# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 +-.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ +- AAD_HASH operation +- mov PBlockLen(%arg2), %r13 +- test %r13, %r13 +- je .L_partial_block_done_\@ # Leave Macro if no partial blocks +- # Read in input data without over reading +- cmp $16, \PLAIN_CYPH_LEN +- jl .L_fewer_than_16_bytes_\@ +- movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm +- jmp .L_data_read_\@ +- +-.L_fewer_than_16_bytes_\@: +- lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 +- mov \PLAIN_CYPH_LEN, %r12 +- READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 +- +- mov PBlockLen(%arg2), %r13 +- +-.L_data_read_\@: # Finished reading in data +- +- movdqu PBlockEncKey(%arg2), %xmm9 +- movdqu HashKey(%arg2), %xmm13 +- +- lea SHIFT_MASK(%rip), %r12 +- +- # adjust the shuffle mask pointer to be able to shift r13 bytes +- # r16-r13 is the number of bytes in plaintext mod 16) +- add %r13, %r12 +- movdqu (%r12), %xmm2 # get the appropriate shuffle mask +- pshufb %xmm2, %xmm9 # shift right r13 bytes +- +-.ifc \operation, dec +- movdqa %xmm1, %xmm3 +- pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) +- +- mov \PLAIN_CYPH_LEN, %r10 +- add %r13, %r10 +- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling +- sub $16, %r10 +- # Determine if partial block is not being filled and +- # shift mask accordingly +- jge .L_no_extra_mask_1_\@ +- sub %r10, %r12 +-.L_no_extra_mask_1_\@: +- +- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 +- # get the appropriate mask to mask out bottom r13 bytes of xmm9 +- pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 +- +- pand %xmm1, %xmm3 +- movdqa SHUF_MASK(%rip), %xmm10 +- pshufb %xmm10, %xmm3 +- pshufb %xmm2, %xmm3 +- pxor %xmm3, \AAD_HASH +- +- test %r10, %r10 +- jl .L_partial_incomplete_1_\@ +- +- # GHASH computation for the last <16 Byte block +- GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 +- xor %eax, %eax +- +- mov %rax, PBlockLen(%arg2) +- jmp .L_dec_done_\@ +-.L_partial_incomplete_1_\@: +- add \PLAIN_CYPH_LEN, PBlockLen(%arg2) +-.L_dec_done_\@: +- movdqu \AAD_HASH, AadHash(%arg2) +-.else +- pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) +- +- mov \PLAIN_CYPH_LEN, %r10 +- add %r13, %r10 +- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling +- sub $16, %r10 +- # Determine if partial block is not being filled and +- # shift mask accordingly +- jge .L_no_extra_mask_2_\@ +- sub %r10, %r12 +-.L_no_extra_mask_2_\@: +- +- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 +- # get the appropriate mask to mask out bottom r13 bytes of xmm9 +- pand %xmm1, %xmm9 +- +- movdqa SHUF_MASK(%rip), %xmm1 +- pshufb %xmm1, %xmm9 +- pshufb %xmm2, %xmm9 +- pxor %xmm9, \AAD_HASH +- +- test %r10, %r10 +- jl .L_partial_incomplete_2_\@ +- +- # GHASH computation for the last <16 Byte block +- GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 +- xor %eax, %eax +- +- mov %rax, PBlockLen(%arg2) +- jmp .L_encode_done_\@ +-.L_partial_incomplete_2_\@: +- add \PLAIN_CYPH_LEN, PBlockLen(%arg2) +-.L_encode_done_\@: +- movdqu \AAD_HASH, AadHash(%arg2) +- +- movdqa SHUF_MASK(%rip), %xmm10 +- # shuffle xmm9 back to output as ciphertext +- pshufb %xmm10, %xmm9 +- pshufb %xmm2, %xmm9 +-.endif +- # output encrypted Bytes +- test %r10, %r10 +- jl .L_partial_fill_\@ +- mov %r13, %r12 +- mov $16, %r13 +- # Set r13 to be the number of bytes to write out 
+- sub %r12, %r13 +- jmp .L_count_set_\@ +-.L_partial_fill_\@: +- mov \PLAIN_CYPH_LEN, %r13 +-.L_count_set_\@: +- movdqa %xmm9, %xmm0 +- movq %xmm0, %rax +- cmp $8, %r13 +- jle .L_less_than_8_bytes_left_\@ +- +- mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) +- add $8, \DATA_OFFSET +- psrldq $8, %xmm0 +- movq %xmm0, %rax +- sub $8, %r13 +-.L_less_than_8_bytes_left_\@: +- movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) +- add $1, \DATA_OFFSET +- shr $8, %rax +- sub $1, %r13 +- jne .L_less_than_8_bytes_left_\@ +-.L_partial_block_done_\@: +-.endm # PARTIAL_BLOCK +- +-/* +-* if a = number of total plaintext bytes +-* b = floor(a/16) +-* num_initial_blocks = b mod 4 +-* encrypt the initial num_initial_blocks blocks and apply ghash on +-* the ciphertext +-* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers +-* are clobbered +-* arg1, %arg2, %arg3 are used as a pointer only, not modified +-*/ +- +- +-.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ +- XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation +- MOVADQ SHUF_MASK(%rip), %xmm14 +- +- movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 +- +- # start AES for num_initial_blocks blocks +- +- movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 +- +-.if (\i == 5) || (\i == 6) || (\i == 7) +- +- MOVADQ ONE(%RIP),\TMP1 +- MOVADQ 0(%arg1),\TMP2 +-.irpc index, \i_seq +- paddd \TMP1, \XMM0 # INCR Y0 +-.ifc \operation, dec +- movdqa \XMM0, %xmm\index +-.else +- MOVADQ \XMM0, %xmm\index +-.endif +- pshufb %xmm14, %xmm\index # perform a 16 byte swap +- pxor \TMP2, %xmm\index +-.endr +- lea 0x10(%arg1),%r10 +- mov keysize,%eax +- shr $2,%eax # 128->4, 192->6, 256->8 +- add $5,%eax # 128->9, 192->11, 256->13 +- +-.Laes_loop_initial_\@: +- MOVADQ (%r10),\TMP1 +-.irpc index, \i_seq +- aesenc \TMP1, %xmm\index +-.endr +- add $16,%r10 +- sub $1,%eax +- jnz .Laes_loop_initial_\@ +- +- MOVADQ (%r10), \TMP1 +-.irpc index, \i_seq +- aesenclast \TMP1, %xmm\index # Last Round +-.endr +-.irpc index, \i_seq +- movdqu (%arg4 , %r11, 1), \TMP1 +- pxor \TMP1, %xmm\index +- movdqu %xmm\index, (%arg3 , %r11, 1) +- # write back plaintext/ciphertext for num_initial_blocks +- add $16, %r11 +- +-.ifc \operation, dec +- movdqa \TMP1, %xmm\index +-.endif +- pshufb %xmm14, %xmm\index +- +- # prepare plaintext/ciphertext for GHASH computation +-.endr +-.endif +- +- # apply GHASH on num_initial_blocks blocks +- +-.if \i == 5 +- pxor %xmm5, %xmm6 +- GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +- pxor %xmm6, %xmm7 +- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +- pxor %xmm7, %xmm8 +- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +-.elseif \i == 6 +- pxor %xmm6, %xmm7 +- GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +- pxor %xmm7, %xmm8 +- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +-.elseif \i == 7 +- pxor %xmm7, %xmm8 +- GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 +-.endif +- cmp $64, %r13 +- jl .L_initial_blocks_done\@ +- # no need for precomputed values +-/* +-* +-* Precomputations for HashKey parallel with encryption of first 4 blocks. 
+-* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i +-*/ +- MOVADQ ONE(%RIP),\TMP1 +- paddd \TMP1, \XMM0 # INCR Y0 +- MOVADQ \XMM0, \XMM1 +- pshufb %xmm14, \XMM1 # perform a 16 byte swap +- +- paddd \TMP1, \XMM0 # INCR Y0 +- MOVADQ \XMM0, \XMM2 +- pshufb %xmm14, \XMM2 # perform a 16 byte swap +- +- paddd \TMP1, \XMM0 # INCR Y0 +- MOVADQ \XMM0, \XMM3 +- pshufb %xmm14, \XMM3 # perform a 16 byte swap +- +- paddd \TMP1, \XMM0 # INCR Y0 +- MOVADQ \XMM0, \XMM4 +- pshufb %xmm14, \XMM4 # perform a 16 byte swap +- +- MOVADQ 0(%arg1),\TMP1 +- pxor \TMP1, \XMM1 +- pxor \TMP1, \XMM2 +- pxor \TMP1, \XMM3 +- pxor \TMP1, \XMM4 +-.irpc index, 1234 # do 4 rounds +- movaps 0x10*\index(%arg1), \TMP1 +- aesenc \TMP1, \XMM1 +- aesenc \TMP1, \XMM2 +- aesenc \TMP1, \XMM3 +- aesenc \TMP1, \XMM4 +-.endr +-.irpc index, 56789 # do next 5 rounds +- movaps 0x10*\index(%arg1), \TMP1 +- aesenc \TMP1, \XMM1 +- aesenc \TMP1, \XMM2 +- aesenc \TMP1, \XMM3 +- aesenc \TMP1, \XMM4 +-.endr +- lea 0xa0(%arg1),%r10 +- mov keysize,%eax +- shr $2,%eax # 128->4, 192->6, 256->8 +- sub $4,%eax # 128->0, 192->2, 256->4 +- jz .Laes_loop_pre_done\@ +- +-.Laes_loop_pre_\@: +- MOVADQ (%r10),\TMP2 +-.irpc index, 1234 +- aesenc \TMP2, %xmm\index +-.endr +- add $16,%r10 +- sub $1,%eax +- jnz .Laes_loop_pre_\@ +- +-.Laes_loop_pre_done\@: +- MOVADQ (%r10), \TMP2 +- aesenclast \TMP2, \XMM1 +- aesenclast \TMP2, \XMM2 +- aesenclast \TMP2, \XMM3 +- aesenclast \TMP2, \XMM4 +- movdqu 16*0(%arg4 , %r11 , 1), \TMP1 +- pxor \TMP1, \XMM1 +-.ifc \operation, dec +- movdqu \XMM1, 16*0(%arg3 , %r11 , 1) +- movdqa \TMP1, \XMM1 +-.endif +- movdqu 16*1(%arg4 , %r11 , 1), \TMP1 +- pxor \TMP1, \XMM2 +-.ifc \operation, dec +- movdqu \XMM2, 16*1(%arg3 , %r11 , 1) +- movdqa \TMP1, \XMM2 +-.endif +- movdqu 16*2(%arg4 , %r11 , 1), \TMP1 +- pxor \TMP1, \XMM3 +-.ifc \operation, dec +- movdqu \XMM3, 16*2(%arg3 , %r11 , 1) +- movdqa \TMP1, \XMM3 +-.endif +- movdqu 16*3(%arg4 , %r11 , 1), \TMP1 +- pxor \TMP1, \XMM4 +-.ifc \operation, dec +- movdqu \XMM4, 16*3(%arg3 , %r11 , 1) +- movdqa \TMP1, \XMM4 +-.else +- movdqu \XMM1, 16*0(%arg3 , %r11 , 1) +- movdqu \XMM2, 16*1(%arg3 , %r11 , 1) +- movdqu \XMM3, 16*2(%arg3 , %r11 , 1) +- movdqu \XMM4, 16*3(%arg3 , %r11 , 1) +-.endif +- +- add $64, %r11 +- pshufb %xmm14, \XMM1 # perform a 16 byte swap +- pxor \XMMDst, \XMM1 +-# combine GHASHed value with the corresponding ciphertext +- pshufb %xmm14, \XMM2 # perform a 16 byte swap +- pshufb %xmm14, \XMM3 # perform a 16 byte swap +- pshufb %xmm14, \XMM4 # perform a 16 byte swap +- +-.L_initial_blocks_done\@: +- +-.endm +- +-/* +-* encrypt 4 blocks at a time +-* ghash the 4 previously encrypted ciphertext blocks +-* arg1, %arg3, %arg4 are used as pointers only, not modified +-* %r11 is the data offset value +-*/ +-.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ +-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation +- +- movdqa \XMM1, \XMM5 +- movdqa \XMM2, \XMM6 +- movdqa \XMM3, \XMM7 +- movdqa \XMM4, \XMM8 +- +- movdqa SHUF_MASK(%rip), %xmm15 +- # multiply TMP5 * HashKey using karatsuba +- +- movdqa \XMM5, \TMP4 +- pshufd $78, \XMM5, \TMP6 +- pxor \XMM5, \TMP6 +- paddd ONE(%rip), \XMM0 # INCR CNT +- movdqu HashKey_4(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 +- movdqa \XMM0, \XMM1 +- paddd ONE(%rip), \XMM0 # INCR CNT +- movdqa \XMM0, \XMM2 +- paddd ONE(%rip), \XMM0 # INCR CNT +- movdqa \XMM0, \XMM3 +- paddd ONE(%rip), \XMM0 # INCR CNT +- movdqa \XMM0, \XMM4 +- pshufb %xmm15, \XMM1 # perform a 16 byte swap +- pclmulqdq 
$0x00, \TMP5, \XMM5 # XMM5 = a0*b0 +- pshufb %xmm15, \XMM2 # perform a 16 byte swap +- pshufb %xmm15, \XMM3 # perform a 16 byte swap +- pshufb %xmm15, \XMM4 # perform a 16 byte swap +- +- pxor (%arg1), \XMM1 +- pxor (%arg1), \XMM2 +- pxor (%arg1), \XMM3 +- pxor (%arg1), \XMM4 +- movdqu HashKey_4_k(%arg2), \TMP5 +- pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) +- movaps 0x10(%arg1), \TMP1 +- aesenc \TMP1, \XMM1 # Round 1 +- aesenc \TMP1, \XMM2 +- aesenc \TMP1, \XMM3 +- aesenc \TMP1, \XMM4 +- movaps 0x20(%arg1), \TMP1 +- aesenc \TMP1, \XMM1 # Round 2 +- aesenc \TMP1, \XMM2 +- aesenc \TMP1, \XMM3 +- aesenc \TMP1, \XMM4 +- movdqa \XMM6, \TMP1 +- pshufd $78, \XMM6, \TMP2 +- pxor \XMM6, \TMP2 +- movdqu HashKey_3(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 +- movaps 0x30(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 3 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 +- movaps 0x40(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 4 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- movdqu HashKey_3_k(%arg2), \TMP5 +- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- movaps 0x50(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 5 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pxor \TMP1, \TMP4 +-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part +- pxor \XMM6, \XMM5 +- pxor \TMP2, \TMP6 +- movdqa \XMM7, \TMP1 +- pshufd $78, \XMM7, \TMP2 +- pxor \XMM7, \TMP2 +- movdqu HashKey_2(%arg2), \TMP5 +- +- # Multiply TMP5 * HashKey using karatsuba +- +- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 +- movaps 0x60(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 6 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 +- movaps 0x70(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 7 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- movdqu HashKey_2_k(%arg2), \TMP5 +- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- movaps 0x80(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 8 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pxor \TMP1, \TMP4 +-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part +- pxor \XMM7, \XMM5 +- pxor \TMP2, \TMP6 +- +- # Multiply XMM8 * HashKey +- # XMM8 and TMP5 hold the values for the two operands +- +- movdqa \XMM8, \TMP1 +- pshufd $78, \XMM8, \TMP2 +- pxor \XMM8, \TMP2 +- movdqu HashKey(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 +- movaps 0x90(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 9 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 +- lea 0xa0(%arg1),%r10 +- mov keysize,%eax +- shr $2,%eax # 128->4, 192->6, 256->8 +- sub $4,%eax # 128->0, 192->2, 256->4 +- jz .Laes_loop_par_enc_done\@ +- +-.Laes_loop_par_enc\@: +- MOVADQ (%r10),\TMP3 +-.irpc index, 1234 +- aesenc \TMP3, %xmm\index +-.endr +- add $16,%r10 +- sub $1,%eax +- jnz .Laes_loop_par_enc\@ +- +-.Laes_loop_par_enc_done\@: +- MOVADQ (%r10), \TMP3 +- aesenclast \TMP3, \XMM1 # Round 10 +- aesenclast \TMP3, \XMM2 +- aesenclast \TMP3, \XMM3 +- aesenclast \TMP3, \XMM4 +- movdqu HashKey_k(%arg2), \TMP5 +- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- movdqu (%arg4,%r11,1), \TMP3 +- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK +- movdqu 16(%arg4,%r11,1), \TMP3 +- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK +- movdqu 32(%arg4,%r11,1), \TMP3 +- pxor \TMP3, 
\XMM3 # Ciphertext/Plaintext XOR EK +- movdqu 48(%arg4,%r11,1), \TMP3 +- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK +- movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer +- movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer +- movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer +- movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer +- pshufb %xmm15, \XMM1 # perform a 16 byte swap +- pshufb %xmm15, \XMM2 # perform a 16 byte swap +- pshufb %xmm15, \XMM3 # perform a 16 byte swap +- pshufb %xmm15, \XMM4 # perform a 16 byte swap +- +- pxor \TMP4, \TMP1 +- pxor \XMM8, \XMM5 +- pxor \TMP6, \TMP2 +- pxor \TMP1, \TMP2 +- pxor \XMM5, \TMP2 +- movdqa \TMP2, \TMP3 +- pslldq $8, \TMP3 # left shift TMP3 2 DWs +- psrldq $8, \TMP2 # right shift TMP2 2 DWs +- pxor \TMP3, \XMM5 +- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 +- +- # first phase of reduction +- +- movdqa \XMM5, \TMP2 +- movdqa \XMM5, \TMP3 +- movdqa \XMM5, \TMP4 +-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently +- pslld $31, \TMP2 # packed right shift << 31 +- pslld $30, \TMP3 # packed right shift << 30 +- pslld $25, \TMP4 # packed right shift << 25 +- pxor \TMP3, \TMP2 # xor the shifted versions +- pxor \TMP4, \TMP2 +- movdqa \TMP2, \TMP5 +- psrldq $4, \TMP5 # right shift T5 1 DW +- pslldq $12, \TMP2 # left shift T2 3 DWs +- pxor \TMP2, \XMM5 +- +- # second phase of reduction +- +- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 +- movdqa \XMM5,\TMP3 +- movdqa \XMM5,\TMP4 +- psrld $1, \TMP2 # packed left shift >>1 +- psrld $2, \TMP3 # packed left shift >>2 +- psrld $7, \TMP4 # packed left shift >>7 +- pxor \TMP3,\TMP2 # xor the shifted versions +- pxor \TMP4,\TMP2 +- pxor \TMP5, \TMP2 +- pxor \TMP2, \XMM5 +- pxor \TMP1, \XMM5 # result is in TMP1 +- +- pxor \XMM5, \XMM1 +-.endm +- +-/* +-* decrypt 4 blocks at a time +-* ghash the 4 previously decrypted ciphertext blocks +-* arg1, %arg3, %arg4 are used as pointers only, not modified +-* %r11 is the data offset value +-*/ +-.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ +-TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation +- +- movdqa \XMM1, \XMM5 +- movdqa \XMM2, \XMM6 +- movdqa \XMM3, \XMM7 +- movdqa \XMM4, \XMM8 +- +- movdqa SHUF_MASK(%rip), %xmm15 +- # multiply TMP5 * HashKey using karatsuba +- +- movdqa \XMM5, \TMP4 +- pshufd $78, \XMM5, \TMP6 +- pxor \XMM5, \TMP6 +- paddd ONE(%rip), \XMM0 # INCR CNT +- movdqu HashKey_4(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 +- movdqa \XMM0, \XMM1 +- paddd ONE(%rip), \XMM0 # INCR CNT +- movdqa \XMM0, \XMM2 +- paddd ONE(%rip), \XMM0 # INCR CNT +- movdqa \XMM0, \XMM3 +- paddd ONE(%rip), \XMM0 # INCR CNT +- movdqa \XMM0, \XMM4 +- pshufb %xmm15, \XMM1 # perform a 16 byte swap +- pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 +- pshufb %xmm15, \XMM2 # perform a 16 byte swap +- pshufb %xmm15, \XMM3 # perform a 16 byte swap +- pshufb %xmm15, \XMM4 # perform a 16 byte swap +- +- pxor (%arg1), \XMM1 +- pxor (%arg1), \XMM2 +- pxor (%arg1), \XMM3 +- pxor (%arg1), \XMM4 +- movdqu HashKey_4_k(%arg2), \TMP5 +- pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) +- movaps 0x10(%arg1), \TMP1 +- aesenc \TMP1, \XMM1 # Round 1 +- aesenc \TMP1, \XMM2 +- aesenc \TMP1, \XMM3 +- aesenc \TMP1, \XMM4 +- movaps 0x20(%arg1), \TMP1 +- aesenc \TMP1, \XMM1 # Round 2 +- aesenc \TMP1, \XMM2 +- aesenc \TMP1, \XMM3 +- aesenc \TMP1, \XMM4 +- movdqa \XMM6, \TMP1 +- pshufd $78, \XMM6, \TMP2 +- pxor \XMM6, \TMP2 +- movdqu 
HashKey_3(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 +- movaps 0x30(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 3 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 +- movaps 0x40(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 4 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- movdqu HashKey_3_k(%arg2), \TMP5 +- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- movaps 0x50(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 5 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pxor \TMP1, \TMP4 +-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part +- pxor \XMM6, \XMM5 +- pxor \TMP2, \TMP6 +- movdqa \XMM7, \TMP1 +- pshufd $78, \XMM7, \TMP2 +- pxor \XMM7, \TMP2 +- movdqu HashKey_2(%arg2), \TMP5 +- +- # Multiply TMP5 * HashKey using karatsuba +- +- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 +- movaps 0x60(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 6 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 +- movaps 0x70(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 7 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- movdqu HashKey_2_k(%arg2), \TMP5 +- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- movaps 0x80(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 8 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pxor \TMP1, \TMP4 +-# accumulate the results in TMP4:XMM5, TMP6 holds the middle part +- pxor \XMM7, \XMM5 +- pxor \TMP2, \TMP6 +- +- # Multiply XMM8 * HashKey +- # XMM8 and TMP5 hold the values for the two operands +- +- movdqa \XMM8, \TMP1 +- pshufd $78, \XMM8, \TMP2 +- pxor \XMM8, \TMP2 +- movdqu HashKey(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 +- movaps 0x90(%arg1), \TMP3 +- aesenc \TMP3, \XMM1 # Round 9 +- aesenc \TMP3, \XMM2 +- aesenc \TMP3, \XMM3 +- aesenc \TMP3, \XMM4 +- pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 +- lea 0xa0(%arg1),%r10 +- mov keysize,%eax +- shr $2,%eax # 128->4, 192->6, 256->8 +- sub $4,%eax # 128->0, 192->2, 256->4 +- jz .Laes_loop_par_dec_done\@ +- +-.Laes_loop_par_dec\@: +- MOVADQ (%r10),\TMP3 +-.irpc index, 1234 +- aesenc \TMP3, %xmm\index +-.endr +- add $16,%r10 +- sub $1,%eax +- jnz .Laes_loop_par_dec\@ +- +-.Laes_loop_par_dec_done\@: +- MOVADQ (%r10), \TMP3 +- aesenclast \TMP3, \XMM1 # last round +- aesenclast \TMP3, \XMM2 +- aesenclast \TMP3, \XMM3 +- aesenclast \TMP3, \XMM4 +- movdqu HashKey_k(%arg2), \TMP5 +- pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- movdqu (%arg4,%r11,1), \TMP3 +- pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK +- movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer +- movdqa \TMP3, \XMM1 +- movdqu 16(%arg4,%r11,1), \TMP3 +- pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK +- movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer +- movdqa \TMP3, \XMM2 +- movdqu 32(%arg4,%r11,1), \TMP3 +- pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK +- movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer +- movdqa \TMP3, \XMM3 +- movdqu 48(%arg4,%r11,1), \TMP3 +- pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK +- movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer +- movdqa \TMP3, \XMM4 +- pshufb %xmm15, \XMM1 # perform a 16 byte swap +- pshufb %xmm15, \XMM2 # perform a 16 byte swap +- pshufb %xmm15, \XMM3 # perform a 16 byte swap +- pshufb %xmm15, \XMM4 # perform a 16 byte swap +- +- pxor \TMP4, \TMP1 +- pxor \XMM8, \XMM5 +- pxor \TMP6, 
\TMP2 +- pxor \TMP1, \TMP2 +- pxor \XMM5, \TMP2 +- movdqa \TMP2, \TMP3 +- pslldq $8, \TMP3 # left shift TMP3 2 DWs +- psrldq $8, \TMP2 # right shift TMP2 2 DWs +- pxor \TMP3, \XMM5 +- pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 +- +- # first phase of reduction +- +- movdqa \XMM5, \TMP2 +- movdqa \XMM5, \TMP3 +- movdqa \XMM5, \TMP4 +-# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently +- pslld $31, \TMP2 # packed right shift << 31 +- pslld $30, \TMP3 # packed right shift << 30 +- pslld $25, \TMP4 # packed right shift << 25 +- pxor \TMP3, \TMP2 # xor the shifted versions +- pxor \TMP4, \TMP2 +- movdqa \TMP2, \TMP5 +- psrldq $4, \TMP5 # right shift T5 1 DW +- pslldq $12, \TMP2 # left shift T2 3 DWs +- pxor \TMP2, \XMM5 +- +- # second phase of reduction +- +- movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 +- movdqa \XMM5,\TMP3 +- movdqa \XMM5,\TMP4 +- psrld $1, \TMP2 # packed left shift >>1 +- psrld $2, \TMP3 # packed left shift >>2 +- psrld $7, \TMP4 # packed left shift >>7 +- pxor \TMP3,\TMP2 # xor the shifted versions +- pxor \TMP4,\TMP2 +- pxor \TMP5, \TMP2 +- pxor \TMP2, \XMM5 +- pxor \TMP1, \XMM5 # result is in TMP1 +- +- pxor \XMM5, \XMM1 +-.endm +- +-/* GHASH the last 4 ciphertext blocks. */ +-.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \ +-TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst +- +- # Multiply TMP6 * HashKey (using Karatsuba) +- +- movdqa \XMM1, \TMP6 +- pshufd $78, \XMM1, \TMP2 +- pxor \XMM1, \TMP2 +- movdqu HashKey_4(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP6 # TMP6 = a1*b1 +- pclmulqdq $0x00, \TMP5, \XMM1 # XMM1 = a0*b0 +- movdqu HashKey_4_k(%arg2), \TMP4 +- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- movdqa \XMM1, \XMMDst +- movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1 +- +- # Multiply TMP1 * HashKey (using Karatsuba) +- +- movdqa \XMM2, \TMP1 +- pshufd $78, \XMM2, \TMP2 +- pxor \XMM2, \TMP2 +- movdqu HashKey_3(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 +- pclmulqdq $0x00, \TMP5, \XMM2 # XMM2 = a0*b0 +- movdqu HashKey_3_k(%arg2), \TMP4 +- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- pxor \TMP1, \TMP6 +- pxor \XMM2, \XMMDst +- pxor \TMP2, \XMM1 +-# results accumulated in TMP6, XMMDst, XMM1 +- +- # Multiply TMP1 * HashKey (using Karatsuba) +- +- movdqa \XMM3, \TMP1 +- pshufd $78, \XMM3, \TMP2 +- pxor \XMM3, \TMP2 +- movdqu HashKey_2(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 +- pclmulqdq $0x00, \TMP5, \XMM3 # XMM3 = a0*b0 +- movdqu HashKey_2_k(%arg2), \TMP4 +- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- pxor \TMP1, \TMP6 +- pxor \XMM3, \XMMDst +- pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1 +- +- # Multiply TMP1 * HashKey (using Karatsuba) +- movdqa \XMM4, \TMP1 +- pshufd $78, \XMM4, \TMP2 +- pxor \XMM4, \TMP2 +- movdqu HashKey(%arg2), \TMP5 +- pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 +- pclmulqdq $0x00, \TMP5, \XMM4 # XMM4 = a0*b0 +- movdqu HashKey_k(%arg2), \TMP4 +- pclmulqdq $0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0) +- pxor \TMP1, \TMP6 +- pxor \XMM4, \XMMDst +- pxor \XMM1, \TMP2 +- pxor \TMP6, \TMP2 +- pxor \XMMDst, \TMP2 +- # middle section of the temp results combined as in karatsuba algorithm +- movdqa \TMP2, \TMP4 +- pslldq $8, \TMP4 # left shift TMP4 2 DWs +- psrldq $8, \TMP2 # right shift TMP2 2 DWs +- pxor \TMP4, \XMMDst +- pxor \TMP2, \TMP6 +-# TMP6:XMMDst holds the result of the accumulated carry-less multiplications +- # first phase of the reduction +- movdqa \XMMDst, \TMP2 +- movdqa \XMMDst, \TMP3 +- 
movdqa \XMMDst, \TMP4 +-# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently +- pslld $31, \TMP2 # packed right shifting << 31 +- pslld $30, \TMP3 # packed right shifting << 30 +- pslld $25, \TMP4 # packed right shifting << 25 +- pxor \TMP3, \TMP2 # xor the shifted versions +- pxor \TMP4, \TMP2 +- movdqa \TMP2, \TMP7 +- psrldq $4, \TMP7 # right shift TMP7 1 DW +- pslldq $12, \TMP2 # left shift TMP2 3 DWs +- pxor \TMP2, \XMMDst +- +- # second phase of the reduction +- movdqa \XMMDst, \TMP2 +- # make 3 copies of XMMDst for doing 3 shift operations +- movdqa \XMMDst, \TMP3 +- movdqa \XMMDst, \TMP4 +- psrld $1, \TMP2 # packed left shift >> 1 +- psrld $2, \TMP3 # packed left shift >> 2 +- psrld $7, \TMP4 # packed left shift >> 7 +- pxor \TMP3, \TMP2 # xor the shifted versions +- pxor \TMP4, \TMP2 +- pxor \TMP7, \TMP2 +- pxor \TMP2, \XMMDst +- pxor \TMP6, \XMMDst # reduced result is in XMMDst +-.endm +- +- +-/* Encryption of a single block +-* uses eax & r10 +-*/ +- +-.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1 +- +- pxor (%arg1), \XMM0 +- mov keysize,%eax +- shr $2,%eax # 128->4, 192->6, 256->8 +- add $5,%eax # 128->9, 192->11, 256->13 +- lea 16(%arg1), %r10 # get first expanded key address +- +-_esb_loop_\@: +- MOVADQ (%r10),\TMP1 +- aesenc \TMP1,\XMM0 +- add $16,%r10 +- sub $1,%eax +- jnz _esb_loop_\@ +- +- MOVADQ (%r10),\TMP1 +- aesenclast \TMP1,\XMM0 +-.endm +- +-/***************************************************************************** +-* void aesni_gcm_init(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +-* struct gcm_context_data *data, +-* // context data +-* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association) +-* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload) +-* // concatenated with 0x00000001. 16-byte aligned pointer. +-* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary. +-* const u8 *aad, // Additional Authentication Data (AAD) +-* u64 aad_len) // Length of AAD in bytes. +-*/ +-SYM_FUNC_START(aesni_gcm_init) +- FUNC_SAVE +- GCM_INIT %arg3, %arg4,%arg5, %arg6 +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_init) +- +-/***************************************************************************** +-* void aesni_gcm_enc_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +-* struct gcm_context_data *data, +-* // context data +-* u8 *out, // Ciphertext output. Encrypt in-place is allowed. +-* const u8 *in, // Plaintext input +-* u64 plaintext_len, // Length of data in bytes for encryption. +-*/ +-SYM_FUNC_START(aesni_gcm_enc_update) +- FUNC_SAVE +- GCM_ENC_DEC enc +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_enc_update) +- +-/***************************************************************************** +-* void aesni_gcm_dec_update(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +-* struct gcm_context_data *data, +-* // context data +-* u8 *out, // Ciphertext output. Encrypt in-place is allowed. +-* const u8 *in, // Plaintext input +-* u64 plaintext_len, // Length of data in bytes for encryption. +-*/ +-SYM_FUNC_START(aesni_gcm_dec_update) +- FUNC_SAVE +- GCM_ENC_DEC dec +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_dec_update) +- +-/***************************************************************************** +-* void aesni_gcm_finalize(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary. +-* struct gcm_context_data *data, +-* // context data +-* u8 *auth_tag, // Authenticated Tag output. 
+-* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely), +-* // 12 or 8. +-*/ +-SYM_FUNC_START(aesni_gcm_finalize) +- FUNC_SAVE +- GCM_COMPLETE %arg3 %arg4 +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_finalize) +- +-#endif +- + SYM_FUNC_START_LOCAL(_key_expansion_256a) + pshufd $0b11111111, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 +diff --git a/arch/x86/crypto/aesni-intel_avx-x86_64.S b/arch/x86/crypto/aesni-intel_avx-x86_64.S +deleted file mode 100644 +index 8c9749ed0651..000000000000 +--- a/arch/x86/crypto/aesni-intel_avx-x86_64.S ++++ /dev/null +@@ -1,2804 +0,0 @@ +-######################################################################## +-# Copyright (c) 2013, Intel Corporation +-# +-# This software is available to you under a choice of one of two +-# licenses. You may choose to be licensed under the terms of the GNU +-# General Public License (GPL) Version 2, available from the file +-# COPYING in the main directory of this source tree, or the +-# OpenIB.org BSD license below: +-# +-# Redistribution and use in source and binary forms, with or without +-# modification, are permitted provided that the following conditions are +-# met: +-# +-# * Redistributions of source code must retain the above copyright +-# notice, this list of conditions and the following disclaimer. +-# +-# * Redistributions in binary form must reproduce the above copyright +-# notice, this list of conditions and the following disclaimer in the +-# documentation and/or other materials provided with the +-# distribution. +-# +-# * Neither the name of the Intel Corporation nor the names of its +-# contributors may be used to endorse or promote products derived from +-# this software without specific prior written permission. +-# +-# +-# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES# LOSS OF USE, DATA, OR +-# PROFITS# OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +-######################################################################## +-## +-## Authors: +-## Erdinc Ozturk +-## Vinodh Gopal +-## James Guilford +-## Tim Chen +-## +-## References: +-## This code was derived and highly optimized from the code described in paper: +-## Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation +-## on Intel Architecture Processors. August, 2010 +-## The details of the implementation is explained in: +-## Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode +-## on Intel Architecture Processors. October, 2012. 
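The prototypes deleted above describe a streaming AES-GCM interface: aesni_gcm_init absorbs the IV, hash subkey and AAD, the *_update calls process plaintext or ciphertext in chunks, and aesni_gcm_finalize emits an 8-, 12- or 16-byte tag. As a hedged illustration of that init/update/finalize shape only (this is not the kernel API; it leans on the third-party Python "cryptography" package and made-up key/nonce values):

```python
# Illustrative only: the same init/update/finalize flow the deleted
# aesni_gcm_* prototypes describe, driven through the third-party
# "cryptography" package (pip install cryptography), not the kernel code.
import os
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes

key = os.urandom(16)                    # AES-128 key (example value)
iv = os.urandom(12)                     # 96-bit nonce (salt || explicit IV in ESP terms)
aad = b"additional authenticated data"

enc = Cipher(algorithms.AES(key), modes.GCM(iv)).encryptor()
enc.authenticate_additional_data(aad)                       # ~ aesni_gcm_init (AAD hashing)
ciphertext = enc.update(b"hello ") + enc.update(b"world")   # ~ repeated aesni_gcm_enc_update
enc.finalize()                                              # ~ aesni_gcm_finalize
tag = enc.tag                                               # 16-byte authentication tag

dec = Cipher(algorithms.AES(key), modes.GCM(iv, tag)).decryptor()
dec.authenticate_additional_data(aad)
plaintext = dec.update(ciphertext)
dec.finalize()                                              # raises InvalidTag on mismatch
assert plaintext == b"hello world"
```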
+-## +-## Assumptions: +-## +-## +-## +-## iv: +-## 0 1 2 3 +-## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## | Salt (From the SA) | +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## | Initialization Vector | +-## | (This is the sequence number from IPSec header) | +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## | 0x1 | +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## +-## +-## +-## AAD: +-## AAD padded to 128 bits with 0 +-## for example, assume AAD is a u32 vector +-## +-## if AAD is 8 bytes: +-## AAD[3] = {A0, A1}# +-## padded AAD in xmm register = {A1 A0 0 0} +-## +-## 0 1 2 3 +-## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## | SPI (A1) | +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## | 32-bit Sequence Number (A0) | +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## | 0x0 | +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## +-## AAD Format with 32-bit Sequence Number +-## +-## if AAD is 12 bytes: +-## AAD[3] = {A0, A1, A2}# +-## padded AAD in xmm register = {A2 A1 A0 0} +-## +-## 0 1 2 3 +-## 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## | SPI (A2) | +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## | 64-bit Extended Sequence Number {A1,A0} | +-## | | +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## | 0x0 | +-## +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +-## +-## AAD Format with 64-bit Extended Sequence Number +-## +-## +-## aadLen: +-## from the definition of the spec, aadLen can only be 8 or 12 bytes. +-## The code additionally supports aadLen of length 16 bytes. +-## +-## TLen: +-## from the definition of the spec, TLen can only be 8, 12 or 16 bytes. +-## +-## poly = x^128 + x^127 + x^126 + x^121 + 1 +-## throughout the code, one tab and two tab indentations are used. one tab is +-## for GHASH part, two tabs is for AES part. +-## +- +-#include +- +-# constants in mergeable sections, linker can reorder and merge +-.section .rodata.cst16.POLY, "aM", @progbits, 16 +-.align 16 +-POLY: .octa 0xC2000000000000000000000000000001 +- +-.section .rodata.cst16.POLY2, "aM", @progbits, 16 +-.align 16 +-POLY2: .octa 0xC20000000000000000000001C2000000 +- +-.section .rodata.cst16.TWOONE, "aM", @progbits, 16 +-.align 16 +-TWOONE: .octa 0x00000001000000000000000000000001 +- +-.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16 +-.align 16 +-SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F +- +-.section .rodata.cst16.ONE, "aM", @progbits, 16 +-.align 16 +-ONE: .octa 0x00000000000000000000000000000001 +- +-.section .rodata.cst16.ONEf, "aM", @progbits, 16 +-.align 16 +-ONEf: .octa 0x01000000000000000000000000000000 +- +-# order of these constants should not change. 
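The header comments above spell out the pre-counter block layout for the IPsec ESP case (4-byte salt from the SA, 8-byte IV, then 0x00000001) and the zero-padding of the AAD up to a 128-bit multiple. A minimal Python sketch of just that byte layout; the helper names make_j0 and pad_aad are illustrative, not kernel symbols:

```python
# Minimal sketch of the J0 pre-counter block and AAD padding described in the
# deleted file header; only the byte layout comes from those comments.
import struct

def make_j0(salt: bytes, explicit_iv: bytes) -> bytes:
    """4-byte salt || 8-byte IV || 0x00000001  ->  16-byte pre-counter block."""
    assert len(salt) == 4 and len(explicit_iv) == 8
    return salt + explicit_iv + struct.pack(">I", 1)

def pad_aad(aad: bytes) -> bytes:
    """Pad AAD with zero bytes up to a multiple of 16 bytes (128 bits)."""
    rem = len(aad) % 16
    return aad if rem == 0 else aad + b"\x00" * (16 - rem)

j0 = make_j0(b"\x00\x01\x02\x03", b"\x10\x11\x12\x13\x14\x15\x16\x17")
assert j0[-4:] == b"\x00\x00\x00\x01"
assert len(pad_aad(b"\xaa" * 12)) == 16      # 12-byte ESN-style AAD -> one block
```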
+-# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F +-.section .rodata, "a", @progbits +-.align 16 +-SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100 +-ALL_F: .octa 0xffffffffffffffffffffffffffffffff +- .octa 0x00000000000000000000000000000000 +- +-.text +- +- +-#define AadHash 16*0 +-#define AadLen 16*1 +-#define InLen (16*1)+8 +-#define PBlockEncKey 16*2 +-#define OrigIV 16*3 +-#define CurCount 16*4 +-#define PBlockLen 16*5 +- +-HashKey = 16*6 # store HashKey <<1 mod poly here +-HashKey_2 = 16*7 # store HashKey^2 <<1 mod poly here +-HashKey_3 = 16*8 # store HashKey^3 <<1 mod poly here +-HashKey_4 = 16*9 # store HashKey^4 <<1 mod poly here +-HashKey_5 = 16*10 # store HashKey^5 <<1 mod poly here +-HashKey_6 = 16*11 # store HashKey^6 <<1 mod poly here +-HashKey_7 = 16*12 # store HashKey^7 <<1 mod poly here +-HashKey_8 = 16*13 # store HashKey^8 <<1 mod poly here +-HashKey_k = 16*14 # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes) +-HashKey_2_k = 16*15 # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes) +-HashKey_3_k = 16*16 # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes) +-HashKey_4_k = 16*17 # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes) +-HashKey_5_k = 16*18 # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes) +-HashKey_6_k = 16*19 # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes) +-HashKey_7_k = 16*20 # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes) +-HashKey_8_k = 16*21 # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes) +- +-#define arg1 %rdi +-#define arg2 %rsi +-#define arg3 %rdx +-#define arg4 %rcx +-#define arg5 %r8 +-#define arg6 %r9 +-#define keysize 2*15*16(arg1) +- +-i = 0 +-j = 0 +- +-out_order = 0 +-in_order = 1 +-DEC = 0 +-ENC = 1 +- +-.macro define_reg r n +-reg_\r = %xmm\n +-.endm +- +-.macro setreg +-.altmacro +-define_reg i %i +-define_reg j %j +-.noaltmacro +-.endm +- +-TMP1 = 16*0 # Temporary storage for AAD +-TMP2 = 16*1 # Temporary storage for AES State 2 (State 1 is stored in an XMM register) +-TMP3 = 16*2 # Temporary storage for AES State 3 +-TMP4 = 16*3 # Temporary storage for AES State 4 +-TMP5 = 16*4 # Temporary storage for AES State 5 +-TMP6 = 16*5 # Temporary storage for AES State 6 +-TMP7 = 16*6 # Temporary storage for AES State 7 +-TMP8 = 16*7 # Temporary storage for AES State 8 +- +-VARIABLE_OFFSET = 16*8 +- +-################################ +-# Utility Macros +-################################ +- +-.macro FUNC_SAVE +- push %r12 +- push %r13 +- push %r15 +- +- push %rbp +- mov %rsp, %rbp +- +- sub $VARIABLE_OFFSET, %rsp +- and $~63, %rsp # align rsp to 64 bytes +-.endm +- +-.macro FUNC_RESTORE +- mov %rbp, %rsp +- pop %rbp +- +- pop %r15 +- pop %r13 +- pop %r12 +-.endm +- +-# Encryption of a single block +-.macro ENCRYPT_SINGLE_BLOCK REP XMM0 +- vpxor (arg1), \XMM0, \XMM0 +- i = 1 +- setreg +-.rep \REP +- vaesenc 16*i(arg1), \XMM0, \XMM0 +- i = (i+1) +- setreg +-.endr +- vaesenclast 16*i(arg1), \XMM0, \XMM0 +-.endm +- +-# combined for GCM encrypt and decrypt functions +-# clobbering all xmm registers +-# clobbering r10, r11, r12, r13, r15, rax +-.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP +- vmovdqu AadHash(arg2), %xmm8 +- vmovdqu HashKey(arg2), %xmm13 # xmm13 = HashKey +- add arg5, InLen(arg2) +- +- # initialize the data pointer offset as zero +- xor %r11d, %r11d +- +- PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, 
\ENC_DEC +- sub %r11, arg5 +- +- mov arg5, %r13 # save the number of bytes of plaintext/ciphertext +- and $-16, %r13 # r13 = r13 - (r13 mod 16) +- +- mov %r13, %r12 +- shr $4, %r12 +- and $7, %r12 +- jz .L_initial_num_blocks_is_0\@ +- +- cmp $7, %r12 +- je .L_initial_num_blocks_is_7\@ +- cmp $6, %r12 +- je .L_initial_num_blocks_is_6\@ +- cmp $5, %r12 +- je .L_initial_num_blocks_is_5\@ +- cmp $4, %r12 +- je .L_initial_num_blocks_is_4\@ +- cmp $3, %r12 +- je .L_initial_num_blocks_is_3\@ +- cmp $2, %r12 +- je .L_initial_num_blocks_is_2\@ +- +- jmp .L_initial_num_blocks_is_1\@ +- +-.L_initial_num_blocks_is_7\@: +- \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC +- sub $16*7, %r13 +- jmp .L_initial_blocks_encrypted\@ +- +-.L_initial_num_blocks_is_6\@: +- \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC +- sub $16*6, %r13 +- jmp .L_initial_blocks_encrypted\@ +- +-.L_initial_num_blocks_is_5\@: +- \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC +- sub $16*5, %r13 +- jmp .L_initial_blocks_encrypted\@ +- +-.L_initial_num_blocks_is_4\@: +- \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC +- sub $16*4, %r13 +- jmp .L_initial_blocks_encrypted\@ +- +-.L_initial_num_blocks_is_3\@: +- \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC +- sub $16*3, %r13 +- jmp .L_initial_blocks_encrypted\@ +- +-.L_initial_num_blocks_is_2\@: +- \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC +- sub $16*2, %r13 +- jmp .L_initial_blocks_encrypted\@ +- +-.L_initial_num_blocks_is_1\@: +- \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC +- sub $16*1, %r13 +- jmp .L_initial_blocks_encrypted\@ +- +-.L_initial_num_blocks_is_0\@: +- \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC +- +- +-.L_initial_blocks_encrypted\@: +- test %r13, %r13 +- je .L_zero_cipher_left\@ +- +- sub $128, %r13 +- je .L_eight_cipher_left\@ +- +- +- +- +- vmovd %xmm9, %r15d +- and $255, %r15d +- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 +- +- +-.L_encrypt_by_8_new\@: +- cmp $(255-8), %r15d +- jg .L_encrypt_by_8\@ +- +- +- +- add $8, %r15b +- \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC +- add $128, %r11 +- sub $128, %r13 +- jne .L_encrypt_by_8_new\@ +- +- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 +- jmp .L_eight_cipher_left\@ +- +-.L_encrypt_by_8\@: +- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 +- add $8, %r15b +- \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC +- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 +- add $128, %r11 +- sub $128, %r13 +- jne .L_encrypt_by_8_new\@ +- +- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 +- +- +- 
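The GCM_ENC_DEC dispatch above first rounds the length down to whole 16-byte blocks (%r13), takes the block count mod 8 (%r12) to decide how many "initial" blocks to encrypt and GHASH one at a time, and then runs the eight-blocks-per-iteration path on the remainder; any sub-16-byte tail is dealt with after the loop. A rough Python model of only that bucketing arithmetic, ignoring the software-pipelined first/last group of eight and doing no crypto at all:

```python
# Rough model of the block bucketing performed by the deleted GCM_ENC_DEC
# macro: a few "initial" blocks bring the remaining count to a multiple of 8,
# the main loop then handles 8 blocks per pass, and a tail < 16 bytes is
# encrypted separately with one extra counter block.
def split_workload(total_len: int):
    full = (total_len // 16) * 16        # r13: bytes covered by whole 16-byte blocks
    initial_blocks = (full // 16) % 8    # r12: blocks handled before the 8-wide loop
    by_8_groups = (full // 16 - initial_blocks) // 8
    tail = total_len % 16                # partial block handled after the loop
    return initial_blocks, by_8_groups, tail

assert split_workload(100) == (6, 0, 4)       # 6 single blocks + 4 leftover bytes
assert split_workload(16 * 11) == (3, 1, 0)   # 3 initial blocks, one 8-wide group
```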
+- +-.L_eight_cipher_left\@: +- \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8 +- +- +-.L_zero_cipher_left\@: +- vmovdqu %xmm14, AadHash(arg2) +- vmovdqu %xmm9, CurCount(arg2) +- +- # check for 0 length +- mov arg5, %r13 +- and $15, %r13 # r13 = (arg5 mod 16) +- +- je .L_multiple_of_16_bytes\@ +- +- # handle the last <16 Byte block separately +- +- mov %r13, PBlockLen(arg2) +- +- vpaddd ONE(%rip), %xmm9, %xmm9 # INCR CNT to get Yn +- vmovdqu %xmm9, CurCount(arg2) +- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 +- +- ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Yn) +- vmovdqu %xmm9, PBlockEncKey(arg2) +- +- cmp $16, arg5 +- jge .L_large_enough_update\@ +- +- lea (arg4,%r11,1), %r10 +- mov %r13, %r12 +- +- READ_PARTIAL_BLOCK %r10 %r12 %xmm1 +- +- lea SHIFT_MASK+16(%rip), %r12 +- sub %r13, %r12 # adjust the shuffle mask pointer to be +- # able to shift 16-r13 bytes (r13 is the +- # number of bytes in plaintext mod 16) +- +- jmp .L_final_ghash_mul\@ +- +-.L_large_enough_update\@: +- sub $16, %r11 +- add %r13, %r11 +- +- # receive the last <16 Byte block +- vmovdqu (arg4, %r11, 1), %xmm1 +- +- sub %r13, %r11 +- add $16, %r11 +- +- lea SHIFT_MASK+16(%rip), %r12 +- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes +- # (r13 is the number of bytes in plaintext mod 16) +- sub %r13, %r12 +- # get the appropriate shuffle mask +- vmovdqu (%r12), %xmm2 +- # shift right 16-r13 bytes +- vpshufb %xmm2, %xmm1, %xmm1 +- +-.L_final_ghash_mul\@: +- .if \ENC_DEC == DEC +- vmovdqa %xmm1, %xmm2 +- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) +- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to +- # mask out top 16-r13 bytes of xmm9 +- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 +- vpand %xmm1, %xmm2, %xmm2 +- vpshufb SHUF_MASK(%rip), %xmm2, %xmm2 +- vpxor %xmm2, %xmm14, %xmm14 +- +- vmovdqu %xmm14, AadHash(arg2) +- .else +- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) +- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 # get the appropriate mask to +- # mask out top 16-r13 bytes of xmm9 +- vpand %xmm1, %xmm9, %xmm9 # mask out top 16-r13 bytes of xmm9 +- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 +- vpxor %xmm9, %xmm14, %xmm14 +- +- vmovdqu %xmm14, AadHash(arg2) +- vpshufb SHUF_MASK(%rip), %xmm9, %xmm9 # shuffle xmm9 back to output as ciphertext +- .endif +- +- +- ############################# +- # output r13 Bytes +- vmovq %xmm9, %rax +- cmp $8, %r13 +- jle .L_less_than_8_bytes_left\@ +- +- mov %rax, (arg3 , %r11) +- add $8, %r11 +- vpsrldq $8, %xmm9, %xmm9 +- vmovq %xmm9, %rax +- sub $8, %r13 +- +-.L_less_than_8_bytes_left\@: +- movb %al, (arg3 , %r11) +- add $1, %r11 +- shr $8, %rax +- sub $1, %r13 +- jne .L_less_than_8_bytes_left\@ +- ############################# +- +-.L_multiple_of_16_bytes\@: +-.endm +- +- +-# GCM_COMPLETE Finishes update of tag of last partial block +-# Output: Authorization Tag (AUTH_TAG) +-# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15 +-.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN +- vmovdqu AadHash(arg2), %xmm14 +- vmovdqu HashKey(arg2), %xmm13 +- +- mov PBlockLen(arg2), %r12 +- test %r12, %r12 +- je .L_partial_done\@ +- +- #GHASH computation for the last <16 Byte block +- \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 +- +-.L_partial_done\@: +- mov AadLen(arg2), %r12 # r12 = aadLen (number of bytes) +- shl $3, %r12 # convert into number of bits +- vmovd %r12d, %xmm15 # len(A) in xmm15 +- +- mov InLen(arg2), %r12 +- shl $3, %r12 # len(C) in 
bits (*128) +- vmovq %r12, %xmm1 +- vpslldq $8, %xmm15, %xmm15 # xmm15 = len(A)|| 0x0000000000000000 +- vpxor %xmm1, %xmm15, %xmm15 # xmm15 = len(A)||len(C) +- +- vpxor %xmm15, %xmm14, %xmm14 +- \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 # final GHASH computation +- vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap +- +- vmovdqu OrigIV(arg2), %xmm9 +- +- ENCRYPT_SINGLE_BLOCK \REP, %xmm9 # E(K, Y0) +- +- vpxor %xmm14, %xmm9, %xmm9 +- +- +- +-.L_return_T\@: +- mov \AUTH_TAG, %r10 # r10 = authTag +- mov \AUTH_TAG_LEN, %r11 # r11 = auth_tag_len +- +- cmp $16, %r11 +- je .L_T_16\@ +- +- cmp $8, %r11 +- jl .L_T_4\@ +- +-.L_T_8\@: +- vmovq %xmm9, %rax +- mov %rax, (%r10) +- add $8, %r10 +- sub $8, %r11 +- vpsrldq $8, %xmm9, %xmm9 +- test %r11, %r11 +- je .L_return_T_done\@ +-.L_T_4\@: +- vmovd %xmm9, %eax +- mov %eax, (%r10) +- add $4, %r10 +- sub $4, %r11 +- vpsrldq $4, %xmm9, %xmm9 +- test %r11, %r11 +- je .L_return_T_done\@ +-.L_T_123\@: +- vmovd %xmm9, %eax +- cmp $2, %r11 +- jl .L_T_1\@ +- mov %ax, (%r10) +- cmp $2, %r11 +- je .L_return_T_done\@ +- add $2, %r10 +- sar $16, %eax +-.L_T_1\@: +- mov %al, (%r10) +- jmp .L_return_T_done\@ +- +-.L_T_16\@: +- vmovdqu %xmm9, (%r10) +- +-.L_return_T_done\@: +-.endm +- +-.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8 +- +- mov \AAD, %r10 # r10 = AAD +- mov \AADLEN, %r12 # r12 = aadLen +- +- +- mov %r12, %r11 +- +- vpxor \T8, \T8, \T8 +- vpxor \T7, \T7, \T7 +- cmp $16, %r11 +- jl .L_get_AAD_rest8\@ +-.L_get_AAD_blocks\@: +- vmovdqu (%r10), \T7 +- vpshufb SHUF_MASK(%rip), \T7, \T7 +- vpxor \T7, \T8, \T8 +- \GHASH_MUL \T8, \T2, \T1, \T3, \T4, \T5, \T6 +- add $16, %r10 +- sub $16, %r12 +- sub $16, %r11 +- cmp $16, %r11 +- jge .L_get_AAD_blocks\@ +- vmovdqu \T8, \T7 +- test %r11, %r11 +- je .L_get_AAD_done\@ +- +- vpxor \T7, \T7, \T7 +- +- /* read the last <16B of AAD. since we have at least 4B of +- data right after the AAD (the ICV, and maybe some CT), we can +- read 4B/8B blocks safely, and then get rid of the extra stuff */ +-.L_get_AAD_rest8\@: +- cmp $4, %r11 +- jle .L_get_AAD_rest4\@ +- movq (%r10), \T1 +- add $8, %r10 +- sub $8, %r11 +- vpslldq $8, \T1, \T1 +- vpsrldq $8, \T7, \T7 +- vpxor \T1, \T7, \T7 +- jmp .L_get_AAD_rest8\@ +-.L_get_AAD_rest4\@: +- test %r11, %r11 +- jle .L_get_AAD_rest0\@ +- mov (%r10), %eax +- movq %rax, \T1 +- add $4, %r10 +- sub $4, %r11 +- vpslldq $12, \T1, \T1 +- vpsrldq $4, \T7, \T7 +- vpxor \T1, \T7, \T7 +-.L_get_AAD_rest0\@: +- /* finalize: shift out the extra bytes we read, and align +- left. 
since pslldq can only shift by an immediate, we use +- vpshufb and a pair of shuffle masks */ +- leaq ALL_F(%rip), %r11 +- subq %r12, %r11 +- vmovdqu 16(%r11), \T1 +- andq $~3, %r11 +- vpshufb (%r11), \T7, \T7 +- vpand \T1, \T7, \T7 +-.L_get_AAD_rest_final\@: +- vpshufb SHUF_MASK(%rip), \T7, \T7 +- vpxor \T8, \T7, \T7 +- \GHASH_MUL \T7, \T2, \T1, \T3, \T4, \T5, \T6 +- +-.L_get_AAD_done\@: +- vmovdqu \T7, AadHash(arg2) +-.endm +- +-.macro INIT GHASH_MUL PRECOMPUTE +- mov arg6, %r11 +- mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length +- xor %r11d, %r11d +- mov %r11, InLen(arg2) # ctx_data.in_length = 0 +- +- mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0 +- mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0 +- mov arg3, %rax +- movdqu (%rax), %xmm0 +- movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv +- +- vpshufb SHUF_MASK(%rip), %xmm0, %xmm0 +- movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv +- +- vmovdqu (arg4), %xmm6 # xmm6 = HashKey +- +- vpshufb SHUF_MASK(%rip), %xmm6, %xmm6 +- ############### PRECOMPUTATION of HashKey<<1 mod poly from the HashKey +- vmovdqa %xmm6, %xmm2 +- vpsllq $1, %xmm6, %xmm6 +- vpsrlq $63, %xmm2, %xmm2 +- vmovdqa %xmm2, %xmm1 +- vpslldq $8, %xmm2, %xmm2 +- vpsrldq $8, %xmm1, %xmm1 +- vpor %xmm2, %xmm6, %xmm6 +- #reduction +- vpshufd $0b00100100, %xmm1, %xmm2 +- vpcmpeqd TWOONE(%rip), %xmm2, %xmm2 +- vpand POLY(%rip), %xmm2, %xmm2 +- vpxor %xmm2, %xmm6, %xmm6 # xmm6 holds the HashKey<<1 mod poly +- ####################################################################### +- vmovdqu %xmm6, HashKey(arg2) # store HashKey<<1 mod poly +- +- CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0 +- +- \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5 +-.endm +- +- +-# Reads DLEN bytes starting at DPTR and stores in XMMDst +-# where 0 < DLEN < 16 +-# Clobbers %rax, DLEN +-.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst +- vpxor \XMMDst, \XMMDst, \XMMDst +- +- cmp $8, \DLEN +- jl .L_read_lt8_\@ +- mov (\DPTR), %rax +- vpinsrq $0, %rax, \XMMDst, \XMMDst +- sub $8, \DLEN +- jz .L_done_read_partial_block_\@ +- xor %eax, %eax +-.L_read_next_byte_\@: +- shl $8, %rax +- mov 7(\DPTR, \DLEN, 1), %al +- dec \DLEN +- jnz .L_read_next_byte_\@ +- vpinsrq $1, %rax, \XMMDst, \XMMDst +- jmp .L_done_read_partial_block_\@ +-.L_read_lt8_\@: +- xor %eax, %eax +-.L_read_next_byte_lt8_\@: +- shl $8, %rax +- mov -1(\DPTR, \DLEN, 1), %al +- dec \DLEN +- jnz .L_read_next_byte_lt8_\@ +- vpinsrq $0, %rax, \XMMDst, \XMMDst +-.L_done_read_partial_block_\@: +-.endm +- +-# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks +-# between update calls. 
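PARTIAL_BLOCK exists because update calls need not arrive 16-byte aligned: the context keeps PBlockLen and the saved E(K, Yn) keystream block so that the next call first tops up the unfinished block before any whole blocks are processed. A toy sketch of just that byte bookkeeping (the real macro additionally XORs with the saved keystream block and folds the completed block into the GHASH state):

```python
# Toy model of the partial-block bookkeeping between update calls: the context
# remembers how many bytes of the current counter block were already consumed
# (PBlockLen), and the next call finishes that block before anything else.
def consume(pblock_len: int, new_len: int):
    """Return (bytes used to finish the partial block, PBlockLen after the call)."""
    if pblock_len == 0:
        return 0, new_len % 16
    need = 16 - pblock_len
    if new_len < need:                    # still not a full block
        return new_len, pblock_len + new_len
    return need, (new_len - need) % 16    # block completed, maybe a new tail

assert consume(0, 20) == (0, 4)       # no carry-in, 4-byte tail left over
assert consume(4, 5) == (5, 9)        # partial block grows but stays partial
assert consume(4, 30) == (12, 2)      # 12 bytes finish the block, 2-byte new tail
```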
+-# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK +-# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context +-# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 +-.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ +- AAD_HASH ENC_DEC +- mov PBlockLen(arg2), %r13 +- test %r13, %r13 +- je .L_partial_block_done_\@ # Leave Macro if no partial blocks +- # Read in input data without over reading +- cmp $16, \PLAIN_CYPH_LEN +- jl .L_fewer_than_16_bytes_\@ +- vmovdqu (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm +- jmp .L_data_read_\@ +- +-.L_fewer_than_16_bytes_\@: +- lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 +- mov \PLAIN_CYPH_LEN, %r12 +- READ_PARTIAL_BLOCK %r10 %r12 %xmm1 +- +- mov PBlockLen(arg2), %r13 +- +-.L_data_read_\@: # Finished reading in data +- +- vmovdqu PBlockEncKey(arg2), %xmm9 +- vmovdqu HashKey(arg2), %xmm13 +- +- lea SHIFT_MASK(%rip), %r12 +- +- # adjust the shuffle mask pointer to be able to shift r13 bytes +- # r16-r13 is the number of bytes in plaintext mod 16) +- add %r13, %r12 +- vmovdqu (%r12), %xmm2 # get the appropriate shuffle mask +- vpshufb %xmm2, %xmm9, %xmm9 # shift right r13 bytes +- +-.if \ENC_DEC == DEC +- vmovdqa %xmm1, %xmm3 +- pxor %xmm1, %xmm9 # Ciphertext XOR E(K, Yn) +- +- mov \PLAIN_CYPH_LEN, %r10 +- add %r13, %r10 +- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling +- sub $16, %r10 +- # Determine if partial block is not being filled and +- # shift mask accordingly +- jge .L_no_extra_mask_1_\@ +- sub %r10, %r12 +-.L_no_extra_mask_1_\@: +- +- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 +- # get the appropriate mask to mask out bottom r13 bytes of xmm9 +- vpand %xmm1, %xmm9, %xmm9 # mask out bottom r13 bytes of xmm9 +- +- vpand %xmm1, %xmm3, %xmm3 +- vmovdqa SHUF_MASK(%rip), %xmm10 +- vpshufb %xmm10, %xmm3, %xmm3 +- vpshufb %xmm2, %xmm3, %xmm3 +- vpxor %xmm3, \AAD_HASH, \AAD_HASH +- +- test %r10, %r10 +- jl .L_partial_incomplete_1_\@ +- +- # GHASH computation for the last <16 Byte block +- \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 +- xor %eax,%eax +- +- mov %rax, PBlockLen(arg2) +- jmp .L_dec_done_\@ +-.L_partial_incomplete_1_\@: +- add \PLAIN_CYPH_LEN, PBlockLen(arg2) +-.L_dec_done_\@: +- vmovdqu \AAD_HASH, AadHash(arg2) +-.else +- vpxor %xmm1, %xmm9, %xmm9 # Plaintext XOR E(K, Yn) +- +- mov \PLAIN_CYPH_LEN, %r10 +- add %r13, %r10 +- # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling +- sub $16, %r10 +- # Determine if partial block is not being filled and +- # shift mask accordingly +- jge .L_no_extra_mask_2_\@ +- sub %r10, %r12 +-.L_no_extra_mask_2_\@: +- +- vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1 +- # get the appropriate mask to mask out bottom r13 bytes of xmm9 +- vpand %xmm1, %xmm9, %xmm9 +- +- vmovdqa SHUF_MASK(%rip), %xmm1 +- vpshufb %xmm1, %xmm9, %xmm9 +- vpshufb %xmm2, %xmm9, %xmm9 +- vpxor %xmm9, \AAD_HASH, \AAD_HASH +- +- test %r10, %r10 +- jl .L_partial_incomplete_2_\@ +- +- # GHASH computation for the last <16 Byte block +- \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 +- xor %eax,%eax +- +- mov %rax, PBlockLen(arg2) +- jmp .L_encode_done_\@ +-.L_partial_incomplete_2_\@: +- add \PLAIN_CYPH_LEN, PBlockLen(arg2) +-.L_encode_done_\@: +- vmovdqu \AAD_HASH, AadHash(arg2) +- +- vmovdqa SHUF_MASK(%rip), %xmm10 +- # shuffle xmm9 back to output as ciphertext +- vpshufb %xmm10, %xmm9, %xmm9 +- vpshufb %xmm2, %xmm9, %xmm9 +-.endif +- # output encrypted Bytes +- test %r10, 
%r10 +- jl .L_partial_fill_\@ +- mov %r13, %r12 +- mov $16, %r13 +- # Set r13 to be the number of bytes to write out +- sub %r12, %r13 +- jmp .L_count_set_\@ +-.L_partial_fill_\@: +- mov \PLAIN_CYPH_LEN, %r13 +-.L_count_set_\@: +- vmovdqa %xmm9, %xmm0 +- vmovq %xmm0, %rax +- cmp $8, %r13 +- jle .L_less_than_8_bytes_left_\@ +- +- mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) +- add $8, \DATA_OFFSET +- psrldq $8, %xmm0 +- vmovq %xmm0, %rax +- sub $8, %r13 +-.L_less_than_8_bytes_left_\@: +- movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) +- add $1, \DATA_OFFSET +- shr $8, %rax +- sub $1, %r13 +- jne .L_less_than_8_bytes_left_\@ +-.L_partial_block_done_\@: +-.endm # PARTIAL_BLOCK +- +-############################################################################### +-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +-# Input: A and B (128-bits each, bit-reflected) +-# Output: C = A*B*x mod poly, (i.e. >>1 ) +-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. +-############################################################################### +-.macro GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5 +- +- vpshufd $0b01001110, \GH, \T2 +- vpshufd $0b01001110, \HK, \T3 +- vpxor \GH , \T2, \T2 # T2 = (a1+a0) +- vpxor \HK , \T3, \T3 # T3 = (b1+b0) +- +- vpclmulqdq $0x11, \HK, \GH, \T1 # T1 = a1*b1 +- vpclmulqdq $0x00, \HK, \GH, \GH # GH = a0*b0 +- vpclmulqdq $0x00, \T3, \T2, \T2 # T2 = (a1+a0)*(b1+b0) +- vpxor \GH, \T2,\T2 +- vpxor \T1, \T2,\T2 # T2 = a0*b1+a1*b0 +- +- vpslldq $8, \T2,\T3 # shift-L T3 2 DWs +- vpsrldq $8, \T2,\T2 # shift-R T2 2 DWs +- vpxor \T3, \GH, \GH +- vpxor \T2, \T1, \T1 # = GH x HK +- +- #first phase of the reduction +- vpslld $31, \GH, \T2 # packed right shifting << 31 +- vpslld $30, \GH, \T3 # packed right shifting shift << 30 +- vpslld $25, \GH, \T4 # packed right shifting shift << 25 +- +- vpxor \T3, \T2, \T2 # xor the shifted versions +- vpxor \T4, \T2, \T2 +- +- vpsrldq $4, \T2, \T5 # shift-R T5 1 DW +- +- vpslldq $12, \T2, \T2 # shift-L T2 3 DWs +- vpxor \T2, \GH, \GH # first phase of the reduction complete +- +- #second phase of the reduction +- +- vpsrld $1,\GH, \T2 # packed left shifting >> 1 +- vpsrld $2,\GH, \T3 # packed left shifting >> 2 +- vpsrld $7,\GH, \T4 # packed left shifting >> 7 +- vpxor \T3, \T2, \T2 # xor the shifted versions +- vpxor \T4, \T2, \T2 +- +- vpxor \T5, \T2, \T2 +- vpxor \T2, \GH, \GH +- vpxor \T1, \GH, \GH # the result is in GH +- +- +-.endm +- +-.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6 +- +- # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i +- vmovdqa \HK, \T5 +- +- vpshufd $0b01001110, \T5, \T1 +- vpxor \T5, \T1, \T1 +- vmovdqu \T1, HashKey_k(arg2) +- +- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly +- vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly +- vpshufd $0b01001110, \T5, \T1 +- vpxor \T5, \T1, \T1 +- vmovdqu \T1, HashKey_2_k(arg2) +- +- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly +- vmovdqu \T5, HashKey_3(arg2) +- vpshufd $0b01001110, \T5, \T1 +- vpxor \T5, \T1, \T1 +- vmovdqu \T1, HashKey_3_k(arg2) +- +- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly +- vmovdqu \T5, HashKey_4(arg2) +- vpshufd $0b01001110, \T5, \T1 +- vpxor \T5, \T1, \T1 +- vmovdqu \T1, HashKey_4_k(arg2) +- +- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly +- vmovdqu \T5, HashKey_5(arg2) +- vpshufd 
$0b01001110, \T5, \T1 +- vpxor \T5, \T1, \T1 +- vmovdqu \T1, HashKey_5_k(arg2) +- +- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly +- vmovdqu \T5, HashKey_6(arg2) +- vpshufd $0b01001110, \T5, \T1 +- vpxor \T5, \T1, \T1 +- vmovdqu \T1, HashKey_6_k(arg2) +- +- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly +- vmovdqu \T5, HashKey_7(arg2) +- vpshufd $0b01001110, \T5, \T1 +- vpxor \T5, \T1, \T1 +- vmovdqu \T1, HashKey_7_k(arg2) +- +- GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly +- vmovdqu \T5, HashKey_8(arg2) +- vpshufd $0b01001110, \T5, \T1 +- vpxor \T5, \T1, \T1 +- vmovdqu \T1, HashKey_8_k(arg2) +- +-.endm +- +-## if a = number of total plaintext bytes +-## b = floor(a/16) +-## num_initial_blocks = b mod 4# +-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +-## r10, r11, r12, rax are clobbered +-## arg1, arg2, arg3, arg4 are used as pointers only, not modified +- +-.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC +- i = (8-\num_initial_blocks) +- setreg +- vmovdqu AadHash(arg2), reg_i +- +- # start AES for num_initial_blocks blocks +- vmovdqu CurCount(arg2), \CTR +- +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, reg_i +- vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap +- i = (i+1) +- setreg +-.endr +- +- vmovdqa (arg1), \T_key +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vpxor \T_key, reg_i, reg_i +- i = (i+1) +- setreg +-.endr +- +- j = 1 +- setreg +-.rep \REP +- vmovdqa 16*j(arg1), \T_key +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vaesenc \T_key, reg_i, reg_i +- i = (i+1) +- setreg +-.endr +- +- j = (j+1) +- setreg +-.endr +- +- vmovdqa 16*j(arg1), \T_key +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vaesenclast \T_key, reg_i, reg_i +- i = (i+1) +- setreg +-.endr +- +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vmovdqu (arg4, %r11), \T1 +- vpxor \T1, reg_i, reg_i +- vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for num_initial_blocks blocks +- add $16, %r11 +-.if \ENC_DEC == DEC +- vmovdqa \T1, reg_i +-.endif +- vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations +- i = (i+1) +- setreg +-.endr +- +- +- i = (8-\num_initial_blocks) +- j = (9-\num_initial_blocks) +- setreg +- +-.rep \num_initial_blocks +- vpxor reg_i, reg_j, reg_j +- GHASH_MUL_AVX reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks +- i = (i+1) +- j = (j+1) +- setreg +-.endr +- # XMM8 has the combined result here +- +- vmovdqa \XMM8, TMP1(%rsp) +- vmovdqa \XMM8, \T3 +- +- cmp $128, %r13 +- jl .L_initial_blocks_done\@ # no need for precomputed constants +- +-############################################################################### +-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM1 +- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM2 +- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM3 +- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM4 +- 
vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM5 +- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM6 +- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM7 +- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM8 +- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +- +- vmovdqa (arg1), \T_key +- vpxor \T_key, \XMM1, \XMM1 +- vpxor \T_key, \XMM2, \XMM2 +- vpxor \T_key, \XMM3, \XMM3 +- vpxor \T_key, \XMM4, \XMM4 +- vpxor \T_key, \XMM5, \XMM5 +- vpxor \T_key, \XMM6, \XMM6 +- vpxor \T_key, \XMM7, \XMM7 +- vpxor \T_key, \XMM8, \XMM8 +- +- i = 1 +- setreg +-.rep \REP # do REP rounds +- vmovdqa 16*i(arg1), \T_key +- vaesenc \T_key, \XMM1, \XMM1 +- vaesenc \T_key, \XMM2, \XMM2 +- vaesenc \T_key, \XMM3, \XMM3 +- vaesenc \T_key, \XMM4, \XMM4 +- vaesenc \T_key, \XMM5, \XMM5 +- vaesenc \T_key, \XMM6, \XMM6 +- vaesenc \T_key, \XMM7, \XMM7 +- vaesenc \T_key, \XMM8, \XMM8 +- i = (i+1) +- setreg +-.endr +- +- vmovdqa 16*i(arg1), \T_key +- vaesenclast \T_key, \XMM1, \XMM1 +- vaesenclast \T_key, \XMM2, \XMM2 +- vaesenclast \T_key, \XMM3, \XMM3 +- vaesenclast \T_key, \XMM4, \XMM4 +- vaesenclast \T_key, \XMM5, \XMM5 +- vaesenclast \T_key, \XMM6, \XMM6 +- vaesenclast \T_key, \XMM7, \XMM7 +- vaesenclast \T_key, \XMM8, \XMM8 +- +- vmovdqu (arg4, %r11), \T1 +- vpxor \T1, \XMM1, \XMM1 +- vmovdqu \XMM1, (arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM1 +- .endif +- +- vmovdqu 16*1(arg4, %r11), \T1 +- vpxor \T1, \XMM2, \XMM2 +- vmovdqu \XMM2, 16*1(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM2 +- .endif +- +- vmovdqu 16*2(arg4, %r11), \T1 +- vpxor \T1, \XMM3, \XMM3 +- vmovdqu \XMM3, 16*2(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM3 +- .endif +- +- vmovdqu 16*3(arg4, %r11), \T1 +- vpxor \T1, \XMM4, \XMM4 +- vmovdqu \XMM4, 16*3(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM4 +- .endif +- +- vmovdqu 16*4(arg4, %r11), \T1 +- vpxor \T1, \XMM5, \XMM5 +- vmovdqu \XMM5, 16*4(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM5 +- .endif +- +- vmovdqu 16*5(arg4, %r11), \T1 +- vpxor \T1, \XMM6, \XMM6 +- vmovdqu \XMM6, 16*5(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM6 +- .endif +- +- vmovdqu 16*6(arg4, %r11), \T1 +- vpxor \T1, \XMM7, \XMM7 +- vmovdqu \XMM7, 16*6(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM7 +- .endif +- +- vmovdqu 16*7(arg4, %r11), \T1 +- vpxor \T1, \XMM8, \XMM8 +- vmovdqu \XMM8, 16*7(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM8 +- .endif +- +- add $128, %r11 +- +- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap +- vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with the corresponding ciphertext +- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +- +-############################################################################### +- +-.L_initial_blocks_done\@: +- +-.endm +- +-# encrypt 8 
blocks at a time +-# ghash the 8 previously encrypted ciphertext blocks +-# arg1, arg2, arg3, arg4 are used as pointers only, not modified +-# r11 is the data offset value +-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC +- +- vmovdqa \XMM1, \T2 +- vmovdqa \XMM2, TMP2(%rsp) +- vmovdqa \XMM3, TMP3(%rsp) +- vmovdqa \XMM4, TMP4(%rsp) +- vmovdqa \XMM5, TMP5(%rsp) +- vmovdqa \XMM6, TMP6(%rsp) +- vmovdqa \XMM7, TMP7(%rsp) +- vmovdqa \XMM8, TMP8(%rsp) +- +-.if \loop_idx == in_order +- vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT +- vpaddd ONE(%rip), \XMM1, \XMM2 +- vpaddd ONE(%rip), \XMM2, \XMM3 +- vpaddd ONE(%rip), \XMM3, \XMM4 +- vpaddd ONE(%rip), \XMM4, \XMM5 +- vpaddd ONE(%rip), \XMM5, \XMM6 +- vpaddd ONE(%rip), \XMM6, \XMM7 +- vpaddd ONE(%rip), \XMM7, \XMM8 +- vmovdqa \XMM8, \CTR +- +- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +-.else +- vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT +- vpaddd ONEf(%rip), \XMM1, \XMM2 +- vpaddd ONEf(%rip), \XMM2, \XMM3 +- vpaddd ONEf(%rip), \XMM3, \XMM4 +- vpaddd ONEf(%rip), \XMM4, \XMM5 +- vpaddd ONEf(%rip), \XMM5, \XMM6 +- vpaddd ONEf(%rip), \XMM6, \XMM7 +- vpaddd ONEf(%rip), \XMM7, \XMM8 +- vmovdqa \XMM8, \CTR +-.endif +- +- +- ####################################################################### +- +- vmovdqu (arg1), \T1 +- vpxor \T1, \XMM1, \XMM1 +- vpxor \T1, \XMM2, \XMM2 +- vpxor \T1, \XMM3, \XMM3 +- vpxor \T1, \XMM4, \XMM4 +- vpxor \T1, \XMM5, \XMM5 +- vpxor \T1, \XMM6, \XMM6 +- vpxor \T1, \XMM7, \XMM7 +- vpxor \T1, \XMM8, \XMM8 +- +- ####################################################################### +- +- +- +- +- +- vmovdqu 16*1(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqu 16*2(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- +- ####################################################################### +- +- vmovdqu HashKey_8(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 +- vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 +- +- vpshufd $0b01001110, \T2, \T6 +- vpxor \T2, \T6, \T6 +- +- vmovdqu HashKey_8_k(arg2), \T5 +- vpclmulqdq $0x00, \T5, \T6, \T6 +- +- vmovdqu 16*3(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqa TMP2(%rsp), \T1 +- vmovdqu HashKey_7(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpshufd $0b01001110, \T1, \T3 +- vpxor \T1, \T3, \T3 +- vmovdqu HashKey_7_k(arg2), \T5 +- vpclmulqdq $0x10, \T5, 
\T3, \T3 +- vpxor \T3, \T6, \T6 +- +- vmovdqu 16*4(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- ####################################################################### +- +- vmovdqa TMP3(%rsp), \T1 +- vmovdqu HashKey_6(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpshufd $0b01001110, \T1, \T3 +- vpxor \T1, \T3, \T3 +- vmovdqu HashKey_6_k(arg2), \T5 +- vpclmulqdq $0x10, \T5, \T3, \T3 +- vpxor \T3, \T6, \T6 +- +- vmovdqu 16*5(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqa TMP4(%rsp), \T1 +- vmovdqu HashKey_5(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpshufd $0b01001110, \T1, \T3 +- vpxor \T1, \T3, \T3 +- vmovdqu HashKey_5_k(arg2), \T5 +- vpclmulqdq $0x10, \T5, \T3, \T3 +- vpxor \T3, \T6, \T6 +- +- vmovdqu 16*6(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- +- vmovdqa TMP5(%rsp), \T1 +- vmovdqu HashKey_4(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpshufd $0b01001110, \T1, \T3 +- vpxor \T1, \T3, \T3 +- vmovdqu HashKey_4_k(arg2), \T5 +- vpclmulqdq $0x10, \T5, \T3, \T3 +- vpxor \T3, \T6, \T6 +- +- vmovdqu 16*7(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqa TMP6(%rsp), \T1 +- vmovdqu HashKey_3(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpshufd $0b01001110, \T1, \T3 +- vpxor \T1, \T3, \T3 +- vmovdqu HashKey_3_k(arg2), \T5 +- vpclmulqdq $0x10, \T5, \T3, \T3 +- vpxor \T3, \T6, \T6 +- +- +- vmovdqu 16*8(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqa TMP7(%rsp), \T1 +- vmovdqu HashKey_2(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpshufd $0b01001110, \T1, \T3 +- vpxor \T1, \T3, \T3 +- vmovdqu HashKey_2_k(arg2), \T5 +- vpclmulqdq $0x10, \T5, \T3, \T3 +- vpxor \T3, \T6, \T6 +- +- ####################################################################### +- +- vmovdqu 16*9(arg1), \T5 +- vaesenc \T5, \XMM1, \XMM1 +- vaesenc \T5, \XMM2, \XMM2 +- vaesenc \T5, \XMM3, \XMM3 +- vaesenc \T5, \XMM4, \XMM4 +- vaesenc \T5, \XMM5, \XMM5 +- vaesenc \T5, \XMM6, \XMM6 +- vaesenc \T5, \XMM7, \XMM7 +- vaesenc \T5, \XMM8, \XMM8 +- +- vmovdqa TMP8(%rsp), \T1 +- vmovdqu HashKey(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- vpclmulqdq $0x00, \T5, 
\T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpshufd $0b01001110, \T1, \T3 +- vpxor \T1, \T3, \T3 +- vmovdqu HashKey_k(arg2), \T5 +- vpclmulqdq $0x10, \T5, \T3, \T3 +- vpxor \T3, \T6, \T6 +- +- vpxor \T4, \T6, \T6 +- vpxor \T7, \T6, \T6 +- +- vmovdqu 16*10(arg1), \T5 +- +- i = 11 +- setreg +-.rep (\REP-9) +- +- vaesenc \T5, \XMM1, \XMM1 +- vaesenc \T5, \XMM2, \XMM2 +- vaesenc \T5, \XMM3, \XMM3 +- vaesenc \T5, \XMM4, \XMM4 +- vaesenc \T5, \XMM5, \XMM5 +- vaesenc \T5, \XMM6, \XMM6 +- vaesenc \T5, \XMM7, \XMM7 +- vaesenc \T5, \XMM8, \XMM8 +- +- vmovdqu 16*i(arg1), \T5 +- i = i + 1 +- setreg +-.endr +- +- i = 0 +- j = 1 +- setreg +-.rep 8 +- vpxor 16*i(arg4, %r11), \T5, \T2 +- .if \ENC_DEC == ENC +- vaesenclast \T2, reg_j, reg_j +- .else +- vaesenclast \T2, reg_j, \T3 +- vmovdqu 16*i(arg4, %r11), reg_j +- vmovdqu \T3, 16*i(arg3, %r11) +- .endif +- i = (i+1) +- j = (j+1) +- setreg +-.endr +- ####################################################################### +- +- +- vpslldq $8, \T6, \T3 # shift-L T3 2 DWs +- vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs +- vpxor \T3, \T7, \T7 +- vpxor \T4, \T6, \T6 # accumulate the results in T6:T7 +- +- +- +- ####################################################################### +- #first phase of the reduction +- ####################################################################### +- vpslld $31, \T7, \T2 # packed right shifting << 31 +- vpslld $30, \T7, \T3 # packed right shifting shift << 30 +- vpslld $25, \T7, \T4 # packed right shifting shift << 25 +- +- vpxor \T3, \T2, \T2 # xor the shifted versions +- vpxor \T4, \T2, \T2 +- +- vpsrldq $4, \T2, \T1 # shift-R T1 1 DW +- +- vpslldq $12, \T2, \T2 # shift-L T2 3 DWs +- vpxor \T2, \T7, \T7 # first phase of the reduction complete +- ####################################################################### +- .if \ENC_DEC == ENC +- vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer +- .endif +- +- ####################################################################### +- #second phase of the reduction +- vpsrld $1, \T7, \T2 # packed left shifting >> 1 +- vpsrld $2, \T7, \T3 # packed left shifting >> 2 +- vpsrld $7, \T7, \T4 # packed left shifting >> 7 +- vpxor \T3, \T2, \T2 # xor the shifted versions +- vpxor \T4, \T2, \T2 +- +- vpxor \T1, \T2, \T2 +- vpxor \T2, \T7, \T7 +- vpxor \T7, \T6, \T6 # the result is in T6 +- ####################################################################### +- +- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +- +- +- vpxor \T6, \XMM1, \XMM1 +- +- +- +-.endm +- +- +-# GHASH the last 4 ciphertext blocks. 
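[Reviewer note, not part of the patch] The HashKey_N_k values loaded above, and again in GHASH_LAST_8_AVX just below, are the precomputed XOR of the two 64-bit halves of HashKey^N. That is what lets this gen2 path do each 128x128 carry-less multiply with three PCLMULQDQ operations (Karatsuba) instead of the four a schoolbook multiply needs. A minimal, self-contained C model of the identity being relied on; clmul64() is an illustrative software stand-in for PCLMULQDQ, not a kernel function:

#include <stdint.h>
#include <stdio.h>

struct u128 { uint64_t hi, lo; };

/* 64x64 -> 128-bit carry-less multiply (bitwise model of PCLMULQDQ) */
static struct u128 clmul64(uint64_t a, uint64_t b)
{
        struct u128 r = { 0, 0 };

        for (int i = 0; i < 64; i++) {
                if ((b >> i) & 1) {
                        r.lo ^= a << i;
                        if (i)
                                r.hi ^= a >> (64 - i);
                }
        }
        return r;
}

int main(void)
{
        uint64_t a1 = 0x0123456789abcdefULL, a0 = 0xfedcba9876543210ULL;
        uint64_t b1 = 0x0f1e2d3c4b5a6978ULL, b0 = 0x1122334455667788ULL;

        /* schoolbook middle term: a1*b0 ^ a0*b1 (two extra multiplies) */
        struct u128 m1 = clmul64(a1, b0), m2 = clmul64(a0, b1);

        /* Karatsuba: (a1^a0)*(b1^b0) ^ a1*b1 ^ a0*b0 gives the same middle
         * term with one multiply, since a1*b1 and a0*b0 are needed anyway */
        struct u128 hh = clmul64(a1, b1), ll = clmul64(a0, b0);
        struct u128 k  = clmul64(a1 ^ a0, b1 ^ b0);

        printf("%d\n", (m1.hi ^ m2.hi) == (k.hi ^ hh.hi ^ ll.hi) &&
                       (m1.lo ^ m2.lo) == (k.lo ^ hh.lo ^ ll.lo));
        return 0;
}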
+-.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 +- +- ## Karatsuba Method +- +- +- vpshufd $0b01001110, \XMM1, \T2 +- vpxor \XMM1, \T2, \T2 +- vmovdqu HashKey_8(arg2), \T5 +- vpclmulqdq $0x11, \T5, \XMM1, \T6 +- vpclmulqdq $0x00, \T5, \XMM1, \T7 +- +- vmovdqu HashKey_8_k(arg2), \T3 +- vpclmulqdq $0x00, \T3, \T2, \XMM1 +- +- ###################### +- +- vpshufd $0b01001110, \XMM2, \T2 +- vpxor \XMM2, \T2, \T2 +- vmovdqu HashKey_7(arg2), \T5 +- vpclmulqdq $0x11, \T5, \XMM2, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM2, \T4 +- vpxor \T4, \T7, \T7 +- +- vmovdqu HashKey_7_k(arg2), \T3 +- vpclmulqdq $0x00, \T3, \T2, \T2 +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vpshufd $0b01001110, \XMM3, \T2 +- vpxor \XMM3, \T2, \T2 +- vmovdqu HashKey_6(arg2), \T5 +- vpclmulqdq $0x11, \T5, \XMM3, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM3, \T4 +- vpxor \T4, \T7, \T7 +- +- vmovdqu HashKey_6_k(arg2), \T3 +- vpclmulqdq $0x00, \T3, \T2, \T2 +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vpshufd $0b01001110, \XMM4, \T2 +- vpxor \XMM4, \T2, \T2 +- vmovdqu HashKey_5(arg2), \T5 +- vpclmulqdq $0x11, \T5, \XMM4, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM4, \T4 +- vpxor \T4, \T7, \T7 +- +- vmovdqu HashKey_5_k(arg2), \T3 +- vpclmulqdq $0x00, \T3, \T2, \T2 +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vpshufd $0b01001110, \XMM5, \T2 +- vpxor \XMM5, \T2, \T2 +- vmovdqu HashKey_4(arg2), \T5 +- vpclmulqdq $0x11, \T5, \XMM5, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM5, \T4 +- vpxor \T4, \T7, \T7 +- +- vmovdqu HashKey_4_k(arg2), \T3 +- vpclmulqdq $0x00, \T3, \T2, \T2 +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vpshufd $0b01001110, \XMM6, \T2 +- vpxor \XMM6, \T2, \T2 +- vmovdqu HashKey_3(arg2), \T5 +- vpclmulqdq $0x11, \T5, \XMM6, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM6, \T4 +- vpxor \T4, \T7, \T7 +- +- vmovdqu HashKey_3_k(arg2), \T3 +- vpclmulqdq $0x00, \T3, \T2, \T2 +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vpshufd $0b01001110, \XMM7, \T2 +- vpxor \XMM7, \T2, \T2 +- vmovdqu HashKey_2(arg2), \T5 +- vpclmulqdq $0x11, \T5, \XMM7, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM7, \T4 +- vpxor \T4, \T7, \T7 +- +- vmovdqu HashKey_2_k(arg2), \T3 +- vpclmulqdq $0x00, \T3, \T2, \T2 +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vpshufd $0b01001110, \XMM8, \T2 +- vpxor \XMM8, \T2, \T2 +- vmovdqu HashKey(arg2), \T5 +- vpclmulqdq $0x11, \T5, \XMM8, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM8, \T4 +- vpxor \T4, \T7, \T7 +- +- vmovdqu HashKey_k(arg2), \T3 +- vpclmulqdq $0x00, \T3, \T2, \T2 +- +- vpxor \T2, \XMM1, \XMM1 +- vpxor \T6, \XMM1, \XMM1 +- vpxor \T7, \XMM1, \T2 +- +- +- +- +- vpslldq $8, \T2, \T4 +- vpsrldq $8, \T2, \T2 +- +- vpxor \T4, \T7, \T7 +- vpxor \T2, \T6, \T6 # holds the result of +- # the accumulated carry-less multiplications +- +- ####################################################################### +- #first phase of the reduction +- vpslld $31, \T7, \T2 # packed right shifting << 31 +- vpslld $30, \T7, \T3 # packed right shifting shift << 30 +- vpslld $25, \T7, \T4 # packed right shifting shift << 25 +- +- vpxor \T3, \T2, \T2 # xor the shifted versions +- vpxor \T4, \T2, \T2 +- +- vpsrldq $4, \T2, \T1 # shift-R T1 1 DW +- +- vpslldq $12, \T2, \T2 # shift-L T2 3 DWs +- vpxor \T2, \T7, \T7 # first phase of the reduction complete +- 
####################################################################### +- +- +- #second phase of the reduction +- vpsrld $1, \T7, \T2 # packed left shifting >> 1 +- vpsrld $2, \T7, \T3 # packed left shifting >> 2 +- vpsrld $7, \T7, \T4 # packed left shifting >> 7 +- vpxor \T3, \T2, \T2 # xor the shifted versions +- vpxor \T4, \T2, \T2 +- +- vpxor \T1, \T2, \T2 +- vpxor \T2, \T7, \T7 +- vpxor \T7, \T6, \T6 # the result is in T6 +- +-.endm +- +-############################################################# +-#void aesni_gcm_precomp_avx_gen2 +-# (gcm_data *my_ctx_data, +-# gcm_context_data *data, +-# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +-# u8 *iv, /* Pre-counter block j0: 4 byte salt +-# (from Security Association) concatenated with 8 byte +-# Initialisation Vector (from IPSec ESP Payload) +-# concatenated with 0x00000001. 16-byte aligned pointer. */ +-# const u8 *aad, /* Additional Authentication Data (AAD)*/ +-# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +-############################################################# +-SYM_FUNC_START(aesni_gcm_init_avx_gen2) +- FUNC_SAVE +- INIT GHASH_MUL_AVX, PRECOMPUTE_AVX +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_init_avx_gen2) +- +-############################################################################### +-#void aesni_gcm_enc_update_avx_gen2( +-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +-# gcm_context_data *data, +-# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ +-# const u8 *in, /* Plaintext input */ +-# u64 plaintext_len) /* Length of data in Bytes for encryption. */ +-############################################################################### +-SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2) +- FUNC_SAVE +- mov keysize, %eax +- cmp $32, %eax +- je key_256_enc_update +- cmp $16, %eax +- je key_128_enc_update +- # must be 192 +- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11 +- FUNC_RESTORE +- RET +-key_128_enc_update: +- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9 +- FUNC_RESTORE +- RET +-key_256_enc_update: +- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13 +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2) +- +-############################################################################### +-#void aesni_gcm_dec_update_avx_gen2( +-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +-# gcm_context_data *data, +-# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ +-# const u8 *in, /* Ciphertext input */ +-# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ +-############################################################################### +-SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2) +- FUNC_SAVE +- mov keysize,%eax +- cmp $32, %eax +- je key_256_dec_update +- cmp $16, %eax +- je key_128_dec_update +- # must be 192 +- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11 +- FUNC_RESTORE +- RET +-key_128_dec_update: +- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9 +- FUNC_RESTORE +- RET +-key_256_dec_update: +- GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13 +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2) +- +-############################################################################### +-#void aesni_gcm_finalize_avx_gen2( +-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +-# gcm_context_data *data, +-# u8 *auth_tag, /* Authenticated Tag output. */ +-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +-# Valid values are 16 (most likely), 12 or 8. */ +-############################################################################### +-SYM_FUNC_START(aesni_gcm_finalize_avx_gen2) +- FUNC_SAVE +- mov keysize,%eax +- cmp $32, %eax +- je key_256_finalize +- cmp $16, %eax +- je key_128_finalize +- # must be 192 +- GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4 +- FUNC_RESTORE +- RET +-key_128_finalize: +- GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4 +- FUNC_RESTORE +- RET +-key_256_finalize: +- GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4 +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_finalize_avx_gen2) +- +-############################################################################### +-# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0) +-# Input: A and B (128-bits each, bit-reflected) +-# Output: C = A*B*x mod poly, (i.e. >>1 ) +-# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input +-# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly. 
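[Reviewer note, not part of the patch] For readers checking the PCLMULQDQ-based GHASH_MUL_AVX2 below against something readable: this is the plain bit-serial GF(2^128) multiply from the GCM spec (polynomial x^128 + x^7 + x^2 + x + 1), without the bit-reflection and the HashKey<<1 pre-shift that the comment above describes. PRECOMPUTE_AVX2 further down simply applies the multiply repeatedly to build HashKey^2 through HashKey^8. A sketch for reference only, not kernel code:

#include <stdint.h>
#include <stdio.h>

struct ghash_block { uint64_t hi, lo; };   /* hi holds the first 8 bytes */

static void ghash_gf128_mul(struct ghash_block *x, const struct ghash_block *y)
{
        struct ghash_block z = { 0, 0 };
        struct ghash_block v = *y;

        for (int i = 0; i < 128; i++) {
                /* bit 0 is the most significant bit of the block */
                uint64_t xi = (i < 64) ? (x->hi >> (63 - i)) & 1
                                       : (x->lo >> (127 - i)) & 1;

                if (xi) {
                        z.hi ^= v.hi;
                        z.lo ^= v.lo;
                }
                /* v = v * alpha, reducing by the GHASH polynomial (R = 0xE1 || 0^120) */
                uint64_t lsb = v.lo & 1;

                v.lo = (v.lo >> 1) | (v.hi << 63);
                v.hi >>= 1;
                if (lsb)
                        v.hi ^= 0xe100000000000000ULL;
        }
        *x = z;
}

int main(void)
{
        /* sanity check: multiplying by the field identity (block 80 00 ... 00,
         * i.e. the polynomial 1 in GHASH bit order) returns the input */
        struct ghash_block h   = { 0x0123456789abcdefULL, 0x0fedcba987654321ULL };
        struct ghash_block one = { 0x8000000000000000ULL, 0 };
        struct ghash_block r   = h;

        ghash_gf128_mul(&r, &one);
        printf("%d\n", r.hi == h.hi && r.lo == h.lo);
        return 0;
}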
+-############################################################################### +-.macro GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5 +- +- vpclmulqdq $0x11,\HK,\GH,\T1 # T1 = a1*b1 +- vpclmulqdq $0x00,\HK,\GH,\T2 # T2 = a0*b0 +- vpclmulqdq $0x01,\HK,\GH,\T3 # T3 = a1*b0 +- vpclmulqdq $0x10,\HK,\GH,\GH # GH = a0*b1 +- vpxor \T3, \GH, \GH +- +- +- vpsrldq $8 , \GH, \T3 # shift-R GH 2 DWs +- vpslldq $8 , \GH, \GH # shift-L GH 2 DWs +- +- vpxor \T3, \T1, \T1 +- vpxor \T2, \GH, \GH +- +- ####################################################################### +- #first phase of the reduction +- vmovdqa POLY2(%rip), \T3 +- +- vpclmulqdq $0x01, \GH, \T3, \T2 +- vpslldq $8, \T2, \T2 # shift-L T2 2 DWs +- +- vpxor \T2, \GH, \GH # first phase of the reduction complete +- ####################################################################### +- #second phase of the reduction +- vpclmulqdq $0x00, \GH, \T3, \T2 +- vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) +- +- vpclmulqdq $0x10, \GH, \T3, \GH +- vpslldq $4, \GH, \GH # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts) +- +- vpxor \T2, \GH, \GH # second phase of the reduction complete +- ####################################################################### +- vpxor \T1, \GH, \GH # the result is in GH +- +- +-.endm +- +-.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6 +- +- # Haskey_i_k holds XORed values of the low and high parts of the Haskey_i +- vmovdqa \HK, \T5 +- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^2<<1 mod poly +- vmovdqu \T5, HashKey_2(arg2) # [HashKey_2] = HashKey^2<<1 mod poly +- +- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^3<<1 mod poly +- vmovdqu \T5, HashKey_3(arg2) +- +- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^4<<1 mod poly +- vmovdqu \T5, HashKey_4(arg2) +- +- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^5<<1 mod poly +- vmovdqu \T5, HashKey_5(arg2) +- +- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^6<<1 mod poly +- vmovdqu \T5, HashKey_6(arg2) +- +- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^7<<1 mod poly +- vmovdqu \T5, HashKey_7(arg2) +- +- GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2 # T5 = HashKey^8<<1 mod poly +- vmovdqu \T5, HashKey_8(arg2) +- +-.endm +- +-## if a = number of total plaintext bytes +-## b = floor(a/16) +-## num_initial_blocks = b mod 4# +-## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext +-## r10, r11, r12, rax are clobbered +-## arg1, arg2, arg3, arg4 are used as pointers only, not modified +- +-.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER +- i = (8-\num_initial_blocks) +- setreg +- vmovdqu AadHash(arg2), reg_i +- +- # start AES for num_initial_blocks blocks +- vmovdqu CurCount(arg2), \CTR +- +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, reg_i +- vpshufb SHUF_MASK(%rip), reg_i, reg_i # perform a 16Byte swap +- i = (i+1) +- setreg +-.endr +- +- vmovdqa (arg1), \T_key +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vpxor \T_key, reg_i, reg_i +- i = (i+1) +- setreg +-.endr +- +- j = 1 +- setreg +-.rep \REP +- vmovdqa 16*j(arg1), \T_key +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vaesenc \T_key, reg_i, reg_i +- i = (i+1) +- setreg +-.endr +- +- j = (j+1) +- setreg +-.endr +- +- +- vmovdqa 
16*j(arg1), \T_key +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vaesenclast \T_key, reg_i, reg_i +- i = (i+1) +- setreg +-.endr +- +- i = (9-\num_initial_blocks) +- setreg +-.rep \num_initial_blocks +- vmovdqu (arg4, %r11), \T1 +- vpxor \T1, reg_i, reg_i +- vmovdqu reg_i, (arg3 , %r11) # write back ciphertext for +- # num_initial_blocks blocks +- add $16, %r11 +-.if \ENC_DEC == DEC +- vmovdqa \T1, reg_i +-.endif +- vpshufb SHUF_MASK(%rip), reg_i, reg_i # prepare ciphertext for GHASH computations +- i = (i+1) +- setreg +-.endr +- +- +- i = (8-\num_initial_blocks) +- j = (9-\num_initial_blocks) +- setreg +- +-.rep \num_initial_blocks +- vpxor reg_i, reg_j, reg_j +- GHASH_MUL_AVX2 reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks +- i = (i+1) +- j = (j+1) +- setreg +-.endr +- # XMM8 has the combined result here +- +- vmovdqa \XMM8, TMP1(%rsp) +- vmovdqa \XMM8, \T3 +- +- cmp $128, %r13 +- jl .L_initial_blocks_done\@ # no need for precomputed constants +- +-############################################################################### +-# Haskey_i_k holds XORed values of the low and high parts of the Haskey_i +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM1 +- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM2 +- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM3 +- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM4 +- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM5 +- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM6 +- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM7 +- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap +- +- vpaddd ONE(%rip), \CTR, \CTR # INCR Y0 +- vmovdqa \CTR, \XMM8 +- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +- +- vmovdqa (arg1), \T_key +- vpxor \T_key, \XMM1, \XMM1 +- vpxor \T_key, \XMM2, \XMM2 +- vpxor \T_key, \XMM3, \XMM3 +- vpxor \T_key, \XMM4, \XMM4 +- vpxor \T_key, \XMM5, \XMM5 +- vpxor \T_key, \XMM6, \XMM6 +- vpxor \T_key, \XMM7, \XMM7 +- vpxor \T_key, \XMM8, \XMM8 +- +- i = 1 +- setreg +-.rep \REP # do REP rounds +- vmovdqa 16*i(arg1), \T_key +- vaesenc \T_key, \XMM1, \XMM1 +- vaesenc \T_key, \XMM2, \XMM2 +- vaesenc \T_key, \XMM3, \XMM3 +- vaesenc \T_key, \XMM4, \XMM4 +- vaesenc \T_key, \XMM5, \XMM5 +- vaesenc \T_key, \XMM6, \XMM6 +- vaesenc \T_key, \XMM7, \XMM7 +- vaesenc \T_key, \XMM8, \XMM8 +- i = (i+1) +- setreg +-.endr +- +- +- vmovdqa 16*i(arg1), \T_key +- vaesenclast \T_key, \XMM1, \XMM1 +- vaesenclast \T_key, \XMM2, \XMM2 +- vaesenclast \T_key, \XMM3, \XMM3 +- vaesenclast \T_key, \XMM4, \XMM4 +- vaesenclast \T_key, \XMM5, \XMM5 +- vaesenclast \T_key, \XMM6, \XMM6 +- vaesenclast \T_key, \XMM7, \XMM7 +- vaesenclast \T_key, \XMM8, \XMM8 +- +- vmovdqu (arg4, %r11), \T1 +- vpxor \T1, \XMM1, \XMM1 +- vmovdqu \XMM1, (arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM1 +- .endif +- +- vmovdqu 16*1(arg4, %r11), \T1 +- vpxor \T1, \XMM2, \XMM2 +- vmovdqu \XMM2, 16*1(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM2 +- .endif +- +- vmovdqu 16*2(arg4, %r11), \T1 +- vpxor \T1, 
\XMM3, \XMM3 +- vmovdqu \XMM3, 16*2(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM3 +- .endif +- +- vmovdqu 16*3(arg4, %r11), \T1 +- vpxor \T1, \XMM4, \XMM4 +- vmovdqu \XMM4, 16*3(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM4 +- .endif +- +- vmovdqu 16*4(arg4, %r11), \T1 +- vpxor \T1, \XMM5, \XMM5 +- vmovdqu \XMM5, 16*4(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM5 +- .endif +- +- vmovdqu 16*5(arg4, %r11), \T1 +- vpxor \T1, \XMM6, \XMM6 +- vmovdqu \XMM6, 16*5(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM6 +- .endif +- +- vmovdqu 16*6(arg4, %r11), \T1 +- vpxor \T1, \XMM7, \XMM7 +- vmovdqu \XMM7, 16*6(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM7 +- .endif +- +- vmovdqu 16*7(arg4, %r11), \T1 +- vpxor \T1, \XMM8, \XMM8 +- vmovdqu \XMM8, 16*7(arg3 , %r11) +- .if \ENC_DEC == DEC +- vmovdqa \T1, \XMM8 +- .endif +- +- add $128, %r11 +- +- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap +- vpxor TMP1(%rsp), \XMM1, \XMM1 # combine GHASHed value with +- # the corresponding ciphertext +- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +- +-############################################################################### +- +-.L_initial_blocks_done\@: +- +- +-.endm +- +- +- +-# encrypt 8 blocks at a time +-# ghash the 8 previously encrypted ciphertext blocks +-# arg1, arg2, arg3, arg4 are used as pointers only, not modified +-# r11 is the data offset value +-.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC +- +- vmovdqa \XMM1, \T2 +- vmovdqa \XMM2, TMP2(%rsp) +- vmovdqa \XMM3, TMP3(%rsp) +- vmovdqa \XMM4, TMP4(%rsp) +- vmovdqa \XMM5, TMP5(%rsp) +- vmovdqa \XMM6, TMP6(%rsp) +- vmovdqa \XMM7, TMP7(%rsp) +- vmovdqa \XMM8, TMP8(%rsp) +- +-.if \loop_idx == in_order +- vpaddd ONE(%rip), \CTR, \XMM1 # INCR CNT +- vpaddd ONE(%rip), \XMM1, \XMM2 +- vpaddd ONE(%rip), \XMM2, \XMM3 +- vpaddd ONE(%rip), \XMM3, \XMM4 +- vpaddd ONE(%rip), \XMM4, \XMM5 +- vpaddd ONE(%rip), \XMM5, \XMM6 +- vpaddd ONE(%rip), \XMM6, \XMM7 +- vpaddd ONE(%rip), \XMM7, \XMM8 +- vmovdqa \XMM8, \CTR +- +- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +-.else +- vpaddd ONEf(%rip), \CTR, \XMM1 # INCR CNT +- vpaddd ONEf(%rip), \XMM1, \XMM2 +- vpaddd ONEf(%rip), \XMM2, \XMM3 +- vpaddd ONEf(%rip), \XMM3, \XMM4 +- vpaddd ONEf(%rip), \XMM4, \XMM5 +- vpaddd ONEf(%rip), \XMM5, \XMM6 +- vpaddd ONEf(%rip), \XMM6, \XMM7 +- vpaddd ONEf(%rip), \XMM7, \XMM8 +- vmovdqa \XMM8, \CTR +-.endif +- +- +- ####################################################################### +- +- vmovdqu (arg1), \T1 +- vpxor \T1, \XMM1, \XMM1 +- vpxor \T1, \XMM2, \XMM2 +- vpxor 
\T1, \XMM3, \XMM3 +- vpxor \T1, \XMM4, \XMM4 +- vpxor \T1, \XMM5, \XMM5 +- vpxor \T1, \XMM6, \XMM6 +- vpxor \T1, \XMM7, \XMM7 +- vpxor \T1, \XMM8, \XMM8 +- +- ####################################################################### +- +- +- +- +- +- vmovdqu 16*1(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqu 16*2(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- +- ####################################################################### +- +- vmovdqu HashKey_8(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T2, \T4 # T4 = a1*b1 +- vpclmulqdq $0x00, \T5, \T2, \T7 # T7 = a0*b0 +- vpclmulqdq $0x01, \T5, \T2, \T6 # T6 = a1*b0 +- vpclmulqdq $0x10, \T5, \T2, \T5 # T5 = a0*b1 +- vpxor \T5, \T6, \T6 +- +- vmovdqu 16*3(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqa TMP2(%rsp), \T1 +- vmovdqu HashKey_7(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpclmulqdq $0x01, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vpclmulqdq $0x10, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vmovdqu 16*4(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- ####################################################################### +- +- vmovdqa TMP3(%rsp), \T1 +- vmovdqu HashKey_6(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpclmulqdq $0x01, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vpclmulqdq $0x10, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vmovdqu 16*5(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqa TMP4(%rsp), \T1 +- vmovdqu HashKey_5(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpclmulqdq $0x01, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vpclmulqdq $0x10, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vmovdqu 16*6(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- +- vmovdqa TMP5(%rsp), \T1 +- vmovdqu HashKey_4(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpclmulqdq $0x01, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vpclmulqdq $0x10, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vmovdqu 16*7(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, 
\XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqa TMP6(%rsp), \T1 +- vmovdqu HashKey_3(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpclmulqdq $0x01, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vpclmulqdq $0x10, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vmovdqu 16*8(arg1), \T1 +- vaesenc \T1, \XMM1, \XMM1 +- vaesenc \T1, \XMM2, \XMM2 +- vaesenc \T1, \XMM3, \XMM3 +- vaesenc \T1, \XMM4, \XMM4 +- vaesenc \T1, \XMM5, \XMM5 +- vaesenc \T1, \XMM6, \XMM6 +- vaesenc \T1, \XMM7, \XMM7 +- vaesenc \T1, \XMM8, \XMM8 +- +- vmovdqa TMP7(%rsp), \T1 +- vmovdqu HashKey_2(arg2), \T5 +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T4 +- +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpclmulqdq $0x01, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vpclmulqdq $0x10, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- +- ####################################################################### +- +- vmovdqu 16*9(arg1), \T5 +- vaesenc \T5, \XMM1, \XMM1 +- vaesenc \T5, \XMM2, \XMM2 +- vaesenc \T5, \XMM3, \XMM3 +- vaesenc \T5, \XMM4, \XMM4 +- vaesenc \T5, \XMM5, \XMM5 +- vaesenc \T5, \XMM6, \XMM6 +- vaesenc \T5, \XMM7, \XMM7 +- vaesenc \T5, \XMM8, \XMM8 +- +- vmovdqa TMP8(%rsp), \T1 +- vmovdqu HashKey(arg2), \T5 +- +- vpclmulqdq $0x00, \T5, \T1, \T3 +- vpxor \T3, \T7, \T7 +- +- vpclmulqdq $0x01, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vpclmulqdq $0x10, \T5, \T1, \T3 +- vpxor \T3, \T6, \T6 +- +- vpclmulqdq $0x11, \T5, \T1, \T3 +- vpxor \T3, \T4, \T1 +- +- +- vmovdqu 16*10(arg1), \T5 +- +- i = 11 +- setreg +-.rep (\REP-9) +- vaesenc \T5, \XMM1, \XMM1 +- vaesenc \T5, \XMM2, \XMM2 +- vaesenc \T5, \XMM3, \XMM3 +- vaesenc \T5, \XMM4, \XMM4 +- vaesenc \T5, \XMM5, \XMM5 +- vaesenc \T5, \XMM6, \XMM6 +- vaesenc \T5, \XMM7, \XMM7 +- vaesenc \T5, \XMM8, \XMM8 +- +- vmovdqu 16*i(arg1), \T5 +- i = i + 1 +- setreg +-.endr +- +- i = 0 +- j = 1 +- setreg +-.rep 8 +- vpxor 16*i(arg4, %r11), \T5, \T2 +- .if \ENC_DEC == ENC +- vaesenclast \T2, reg_j, reg_j +- .else +- vaesenclast \T2, reg_j, \T3 +- vmovdqu 16*i(arg4, %r11), reg_j +- vmovdqu \T3, 16*i(arg3, %r11) +- .endif +- i = (i+1) +- j = (j+1) +- setreg +-.endr +- ####################################################################### +- +- +- vpslldq $8, \T6, \T3 # shift-L T3 2 DWs +- vpsrldq $8, \T6, \T6 # shift-R T2 2 DWs +- vpxor \T3, \T7, \T7 +- vpxor \T6, \T1, \T1 # accumulate the results in T1:T7 +- +- +- +- ####################################################################### +- #first phase of the reduction +- vmovdqa POLY2(%rip), \T3 +- +- vpclmulqdq $0x01, \T7, \T3, \T2 +- vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs +- +- vpxor \T2, \T7, \T7 # first phase of the reduction complete +- ####################################################################### +- .if \ENC_DEC == ENC +- vmovdqu \XMM1, 16*0(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM2, 16*1(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM3, 16*2(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM4, 16*3(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM5, 16*4(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM6, 16*5(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM7, 16*6(arg3,%r11) # Write to the Ciphertext buffer +- vmovdqu \XMM8, 16*7(arg3,%r11) # Write to the Ciphertext buffer +- .endif +- +- 
####################################################################### +- #second phase of the reduction +- vpclmulqdq $0x00, \T7, \T3, \T2 +- vpsrldq $4, \T2, \T2 # shift-R xmm2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) +- +- vpclmulqdq $0x10, \T7, \T3, \T4 +- vpslldq $4, \T4, \T4 # shift-L xmm0 1 DW (Shift-L 1-DW to obtain result with no shifts) +- +- vpxor \T2, \T4, \T4 # second phase of the reduction complete +- ####################################################################### +- vpxor \T4, \T1, \T1 # the result is in T1 +- +- vpshufb SHUF_MASK(%rip), \XMM1, \XMM1 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM2, \XMM2 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM3, \XMM3 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM4, \XMM4 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM5, \XMM5 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM6, \XMM6 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM7, \XMM7 # perform a 16Byte swap +- vpshufb SHUF_MASK(%rip), \XMM8, \XMM8 # perform a 16Byte swap +- +- +- vpxor \T1, \XMM1, \XMM1 +- +- +- +-.endm +- +- +-# GHASH the last 4 ciphertext blocks. +-.macro GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 +- +- ## Karatsuba Method +- +- vmovdqu HashKey_8(arg2), \T5 +- +- vpshufd $0b01001110, \XMM1, \T2 +- vpshufd $0b01001110, \T5, \T3 +- vpxor \XMM1, \T2, \T2 +- vpxor \T5, \T3, \T3 +- +- vpclmulqdq $0x11, \T5, \XMM1, \T6 +- vpclmulqdq $0x00, \T5, \XMM1, \T7 +- +- vpclmulqdq $0x00, \T3, \T2, \XMM1 +- +- ###################### +- +- vmovdqu HashKey_7(arg2), \T5 +- vpshufd $0b01001110, \XMM2, \T2 +- vpshufd $0b01001110, \T5, \T3 +- vpxor \XMM2, \T2, \T2 +- vpxor \T5, \T3, \T3 +- +- vpclmulqdq $0x11, \T5, \XMM2, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM2, \T4 +- vpxor \T4, \T7, \T7 +- +- vpclmulqdq $0x00, \T3, \T2, \T2 +- +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vmovdqu HashKey_6(arg2), \T5 +- vpshufd $0b01001110, \XMM3, \T2 +- vpshufd $0b01001110, \T5, \T3 +- vpxor \XMM3, \T2, \T2 +- vpxor \T5, \T3, \T3 +- +- vpclmulqdq $0x11, \T5, \XMM3, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM3, \T4 +- vpxor \T4, \T7, \T7 +- +- vpclmulqdq $0x00, \T3, \T2, \T2 +- +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vmovdqu HashKey_5(arg2), \T5 +- vpshufd $0b01001110, \XMM4, \T2 +- vpshufd $0b01001110, \T5, \T3 +- vpxor \XMM4, \T2, \T2 +- vpxor \T5, \T3, \T3 +- +- vpclmulqdq $0x11, \T5, \XMM4, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM4, \T4 +- vpxor \T4, \T7, \T7 +- +- vpclmulqdq $0x00, \T3, \T2, \T2 +- +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vmovdqu HashKey_4(arg2), \T5 +- vpshufd $0b01001110, \XMM5, \T2 +- vpshufd $0b01001110, \T5, \T3 +- vpxor \XMM5, \T2, \T2 +- vpxor \T5, \T3, \T3 +- +- vpclmulqdq $0x11, \T5, \XMM5, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM5, \T4 +- vpxor \T4, \T7, \T7 +- +- vpclmulqdq $0x00, \T3, \T2, \T2 +- +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vmovdqu HashKey_3(arg2), \T5 +- vpshufd $0b01001110, \XMM6, \T2 +- vpshufd $0b01001110, \T5, \T3 +- vpxor \XMM6, \T2, \T2 +- vpxor \T5, \T3, \T3 +- +- vpclmulqdq $0x11, \T5, \XMM6, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM6, \T4 +- vpxor \T4, \T7, \T7 +- +- vpclmulqdq $0x00, \T3, \T2, \T2 +- +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vmovdqu HashKey_2(arg2), \T5 +- vpshufd $0b01001110, \XMM7, \T2 +- vpshufd $0b01001110, \T5, \T3 +- 
vpxor \XMM7, \T2, \T2 +- vpxor \T5, \T3, \T3 +- +- vpclmulqdq $0x11, \T5, \XMM7, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM7, \T4 +- vpxor \T4, \T7, \T7 +- +- vpclmulqdq $0x00, \T3, \T2, \T2 +- +- vpxor \T2, \XMM1, \XMM1 +- +- ###################### +- +- vmovdqu HashKey(arg2), \T5 +- vpshufd $0b01001110, \XMM8, \T2 +- vpshufd $0b01001110, \T5, \T3 +- vpxor \XMM8, \T2, \T2 +- vpxor \T5, \T3, \T3 +- +- vpclmulqdq $0x11, \T5, \XMM8, \T4 +- vpxor \T4, \T6, \T6 +- +- vpclmulqdq $0x00, \T5, \XMM8, \T4 +- vpxor \T4, \T7, \T7 +- +- vpclmulqdq $0x00, \T3, \T2, \T2 +- +- vpxor \T2, \XMM1, \XMM1 +- vpxor \T6, \XMM1, \XMM1 +- vpxor \T7, \XMM1, \T2 +- +- +- +- +- vpslldq $8, \T2, \T4 +- vpsrldq $8, \T2, \T2 +- +- vpxor \T4, \T7, \T7 +- vpxor \T2, \T6, \T6 # holds the result of the +- # accumulated carry-less multiplications +- +- ####################################################################### +- #first phase of the reduction +- vmovdqa POLY2(%rip), \T3 +- +- vpclmulqdq $0x01, \T7, \T3, \T2 +- vpslldq $8, \T2, \T2 # shift-L xmm2 2 DWs +- +- vpxor \T2, \T7, \T7 # first phase of the reduction complete +- ####################################################################### +- +- +- #second phase of the reduction +- vpclmulqdq $0x00, \T7, \T3, \T2 +- vpsrldq $4, \T2, \T2 # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R) +- +- vpclmulqdq $0x10, \T7, \T3, \T4 +- vpslldq $4, \T4, \T4 # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts) +- +- vpxor \T2, \T4, \T4 # second phase of the reduction complete +- ####################################################################### +- vpxor \T4, \T6, \T6 # the result is in T6 +-.endm +- +- +- +-############################################################# +-#void aesni_gcm_init_avx_gen4 +-# (gcm_data *my_ctx_data, +-# gcm_context_data *data, +-# u8 *iv, /* Pre-counter block j0: 4 byte salt +-# (from Security Association) concatenated with 8 byte +-# Initialisation Vector (from IPSec ESP Payload) +-# concatenated with 0x00000001. 16-byte aligned pointer. */ +-# u8 *hash_subkey# /* H, the Hash sub key input. Data starts on a 16-byte boundary. */ +-# const u8 *aad, /* Additional Authentication Data (AAD)*/ +-# u64 aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */ +-############################################################# +-SYM_FUNC_START(aesni_gcm_init_avx_gen4) +- FUNC_SAVE +- INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2 +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_init_avx_gen4) +- +-############################################################################### +-#void aesni_gcm_enc_avx_gen4( +-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +-# gcm_context_data *data, +-# u8 *out, /* Ciphertext output. Encrypt in-place is allowed. */ +-# const u8 *in, /* Plaintext input */ +-# u64 plaintext_len) /* Length of data in Bytes for encryption. 
*/ +-############################################################################### +-SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4) +- FUNC_SAVE +- mov keysize,%eax +- cmp $32, %eax +- je key_256_enc_update4 +- cmp $16, %eax +- je key_128_enc_update4 +- # must be 192 +- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11 +- FUNC_RESTORE +- RET +-key_128_enc_update4: +- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9 +- FUNC_RESTORE +- RET +-key_256_enc_update4: +- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13 +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4) +- +-############################################################################### +-#void aesni_gcm_dec_update_avx_gen4( +-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +-# gcm_context_data *data, +-# u8 *out, /* Plaintext output. Decrypt in-place is allowed. */ +-# const u8 *in, /* Ciphertext input */ +-# u64 plaintext_len) /* Length of data in Bytes for encryption. */ +-############################################################################### +-SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4) +- FUNC_SAVE +- mov keysize,%eax +- cmp $32, %eax +- je key_256_dec_update4 +- cmp $16, %eax +- je key_128_dec_update4 +- # must be 192 +- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11 +- FUNC_RESTORE +- RET +-key_128_dec_update4: +- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9 +- FUNC_RESTORE +- RET +-key_256_dec_update4: +- GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13 +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4) +- +-############################################################################### +-#void aesni_gcm_finalize_avx_gen4( +-# gcm_data *my_ctx_data, /* aligned to 16 Bytes */ +-# gcm_context_data *data, +-# u8 *auth_tag, /* Authenticated Tag output. */ +-# u64 auth_tag_len)# /* Authenticated Tag Length in bytes. +-# Valid values are 16 (most likely), 12 or 8. 
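[Reviewer note, not part of the patch] The 9/11/13 literals passed to GCM_ENC_DEC and GCM_COMPLETE in these gen4 stubs (and in the gen2 ones above) are not the full AES round counts: they are the number of AESENC rounds done before the separate final AESENCLAST, i.e. Nr - 1 for AES-128/192/256, selected from the key length held in keysize. A one-line model with the three cases checked:

#include <assert.h>

/* key length in bytes (16/24/32) -> value passed as the REP macro argument,
 * which is Nr - 1 because the last round is a separate AESENCLAST */
static inline int gcm_rep_rounds(unsigned int key_len)
{
        return key_len / 4 + 5;
}

int main(void)
{
        assert(gcm_rep_rounds(16) == 9);    /* AES-128: 10 rounds total */
        assert(gcm_rep_rounds(24) == 11);   /* AES-192: 12 rounds total */
        assert(gcm_rep_rounds(32) == 13);   /* AES-256: 14 rounds total */
        return 0;
}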
*/ +-############################################################################### +-SYM_FUNC_START(aesni_gcm_finalize_avx_gen4) +- FUNC_SAVE +- mov keysize,%eax +- cmp $32, %eax +- je key_256_finalize4 +- cmp $16, %eax +- je key_128_finalize4 +- # must be 192 +- GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4 +- FUNC_RESTORE +- RET +-key_128_finalize4: +- GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4 +- FUNC_RESTORE +- RET +-key_256_finalize4: +- GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4 +- FUNC_RESTORE +- RET +-SYM_FUNC_END(aesni_gcm_finalize_avx_gen4) +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index ef031655b2d3..cd37de5ec404 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -1,7 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0-or-later /* -diff --git a/mm/vmscan.c b/mm/vmscan.c -index 346810e1b69d..fd1d9b4194e3 100644 ---- a/mm/vmscan.c -+++ b/mm/vmscan.c -@@ -133,15 +133,6 @@ struct scan_control { - /* The file folios on the current node are dangerously low */ - unsigned int file_is_tiny:1; - -- /* The anonymous pages on the current node are below vm.anon_min_ratio */ -- unsigned int anon_below_min:1; -- -- /* The clean file pages on the current node are below vm.clean_low_ratio */ -- unsigned int clean_below_low:1; -- -- /* The clean file pages on the current node are below vm.clean_min_ratio */ -- unsigned int clean_below_min:1; -- - /* Always discard instead of demoting to lower tier memory */ - unsigned int no_demotion:1; - -@@ -191,15 +182,6 @@ struct scan_control { - #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) - #endif - --bool sysctl_workingset_protection __read_mostly = true; --u8 sysctl_anon_min_ratio __read_mostly = CONFIG_ANON_MIN_RATIO; --u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO; --u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO; --static u64 sysctl_anon_min_ratio_kb __read_mostly = 0; --static u64 sysctl_clean_low_ratio_kb __read_mostly = 0; --static u64 sysctl_clean_min_ratio_kb __read_mostly = 0; --static u64 workingset_protection_prev_totalram __read_mostly = 0; -- - /* - * From 0 .. 200. Higher means more swappy. +- * Support for Intel AES-NI instructions. This file contains glue +- * code, the real AES implementation is in intel-aes_asm.S. ++ * Support for AES-NI and VAES instructions. This file contains glue code. ++ * The real AES implementations are in aesni-intel_asm.S and other .S files. + * + * Copyright (C) 2008, Intel Corp. + * Author: Huang Ying +@@ -13,6 +13,8 @@ + * Tadeusz Struk (tadeusz.struk@intel.com) + * Aidan O'Mahony (aidan.o.mahony@intel.com) + * Copyright (c) 2010, Intel Corporation. ++ * ++ * Copyright 2024 Google LLC */ -@@ -1074,9 +1056,6 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, - folio_mapped(folio) && folio_test_referenced(folio)) - goto keep_locked; -- if (folio_is_file_lru(folio) ? sc->clean_below_min : sc->anon_below_min) -- goto keep_locked; -- - /* - * The number of dirty pages determines if a node is marked - * reclaim_congested. 
kswapd will stall and start writing -@@ -2378,23 +2357,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - goto out; - } + #include +@@ -44,41 +46,11 @@ + #define CRYPTO_AES_CTX_SIZE (sizeof(struct crypto_aes_ctx) + AESNI_ALIGN_EXTRA) + #define XTS_AES_CTX_SIZE (sizeof(struct aesni_xts_ctx) + AESNI_ALIGN_EXTRA) -- /* -- * Force-scan the other type if anon/clean pages is -- * under vm.{anon,clean}_{low,min}_ratio, respectively. -- */ -- if (sc->clean_below_min) { -- scan_balance = SCAN_ANON; -- goto out; -- } -- if (sc->anon_below_min) { -- scan_balance = SCAN_FILE; -- goto out; -- } -- if (sc->clean_below_low) { -- scan_balance = SCAN_ANON; -- goto out; -- } +-/* This data is stored at the end of the crypto_tfm struct. +- * It's a type of per "session" data storage location. +- * This needs to be 16 byte aligned. +- */ +-struct aesni_rfc4106_gcm_ctx { +- u8 hash_subkey[16] AESNI_ALIGN_ATTR; +- struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; +- u8 nonce[4]; +-}; - - /* - * Do not apply any pressure balancing cleverness when the - * system is close to OOM, scan both anon and file equally -@@ -2557,14 +2519,6 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - BUG(); - } +-struct generic_gcmaes_ctx { +- u8 hash_subkey[16] AESNI_ALIGN_ATTR; +- struct crypto_aes_ctx aes_key_expanded AESNI_ALIGN_ATTR; +-}; +- + struct aesni_xts_ctx { + struct crypto_aes_ctx tweak_ctx AESNI_ALIGN_ATTR; + struct crypto_aes_ctx crypt_ctx AESNI_ALIGN_ATTR; + }; -- /* -- * Hard protection of the working set. -- * Don't reclaim anon/file pages when the amount is -- * below the watermark of the same type. -- */ -- if (file ? sc->clean_below_min : sc->anon_below_min) -- scan = 0; +-#define GCM_BLOCK_LEN 16 - - nr[lru] = scan; - } - } -@@ -3978,23 +3932,6 @@ static unsigned long lru_gen_min_ttl __read_mostly = 1000; - static unsigned long lru_gen_min_ttl __read_mostly; +-struct gcm_context_data { +- /* init, update and finalize context data */ +- u8 aad_hash[GCM_BLOCK_LEN]; +- u64 aad_length; +- u64 in_length; +- u8 partial_block_enc_key[GCM_BLOCK_LEN]; +- u8 orig_IV[GCM_BLOCK_LEN]; +- u8 current_counter[GCM_BLOCK_LEN]; +- u64 partial_block_len; +- u64 unused; +- u8 hash_keys[GCM_BLOCK_LEN * 16]; +-}; +- + static inline void *aes_align_addr(void *addr) + { + if (crypto_tfm_ctx_alignment() >= AESNI_ALIGN) +@@ -103,9 +75,6 @@ asmlinkage void aesni_cts_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, + asmlinkage void aesni_cts_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + +-#define AVX_GEN2_OPTSIZE 640 +-#define AVX_GEN4_OPTSIZE 4096 +- + asmlinkage void aesni_xts_enc(const struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + +@@ -118,23 +87,6 @@ asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, + const u8 *in, unsigned int len, u8 *iv); + DEFINE_STATIC_CALL(aesni_ctr_enc_tfm, aesni_ctr_enc); + +-/* Scatter / Gather routines, with args similar to above */ +-asmlinkage void aesni_gcm_init(void *ctx, +- struct gcm_context_data *gdata, +- u8 *iv, +- u8 *hash_subkey, const u8 *aad, +- unsigned long aad_len); +-asmlinkage void aesni_gcm_enc_update(void *ctx, +- struct gcm_context_data *gdata, u8 *out, +- const u8 *in, unsigned long plaintext_len); +-asmlinkage void aesni_gcm_dec_update(void *ctx, +- struct gcm_context_data *gdata, u8 *out, +- const u8 *in, +- unsigned long ciphertext_len); +-asmlinkage void aesni_gcm_finalize(void *ctx, +- struct gcm_context_data *gdata, +- u8 
*auth_tag, unsigned long auth_tag_len); +- + asmlinkage void aes_ctr_enc_128_avx_by8(const u8 *in, u8 *iv, + void *keys, u8 *out, unsigned int num_bytes); + asmlinkage void aes_ctr_enc_192_avx_by8(const u8 *in, u8 *iv, +@@ -154,67 +106,6 @@ asmlinkage void aes_xctr_enc_192_avx_by8(const u8 *in, const u8 *iv, + asmlinkage void aes_xctr_enc_256_avx_by8(const u8 *in, const u8 *iv, + const void *keys, u8 *out, unsigned int num_bytes, + unsigned int byte_ctr); +- +-/* +- * asmlinkage void aesni_gcm_init_avx_gen2() +- * gcm_data *my_ctx_data, context data +- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. +- */ +-asmlinkage void aesni_gcm_init_avx_gen2(void *my_ctx_data, +- struct gcm_context_data *gdata, +- u8 *iv, +- u8 *hash_subkey, +- const u8 *aad, +- unsigned long aad_len); +- +-asmlinkage void aesni_gcm_enc_update_avx_gen2(void *ctx, +- struct gcm_context_data *gdata, u8 *out, +- const u8 *in, unsigned long plaintext_len); +-asmlinkage void aesni_gcm_dec_update_avx_gen2(void *ctx, +- struct gcm_context_data *gdata, u8 *out, +- const u8 *in, +- unsigned long ciphertext_len); +-asmlinkage void aesni_gcm_finalize_avx_gen2(void *ctx, +- struct gcm_context_data *gdata, +- u8 *auth_tag, unsigned long auth_tag_len); +- +-/* +- * asmlinkage void aesni_gcm_init_avx_gen4() +- * gcm_data *my_ctx_data, context data +- * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary. +- */ +-asmlinkage void aesni_gcm_init_avx_gen4(void *my_ctx_data, +- struct gcm_context_data *gdata, +- u8 *iv, +- u8 *hash_subkey, +- const u8 *aad, +- unsigned long aad_len); +- +-asmlinkage void aesni_gcm_enc_update_avx_gen4(void *ctx, +- struct gcm_context_data *gdata, u8 *out, +- const u8 *in, unsigned long plaintext_len); +-asmlinkage void aesni_gcm_dec_update_avx_gen4(void *ctx, +- struct gcm_context_data *gdata, u8 *out, +- const u8 *in, +- unsigned long ciphertext_len); +-asmlinkage void aesni_gcm_finalize_avx_gen4(void *ctx, +- struct gcm_context_data *gdata, +- u8 *auth_tag, unsigned long auth_tag_len); +- +-static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx); +-static __ro_after_init DEFINE_STATIC_KEY_FALSE(gcm_use_avx2); +- +-static inline struct +-aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm) +-{ +- return aes_align_addr(crypto_aead_ctx(tfm)); +-} +- +-static inline struct +-generic_gcmaes_ctx *generic_gcmaes_ctx_get(struct crypto_aead *tfm) +-{ +- return aes_align_addr(crypto_aead_ctx(tfm)); +-} #endif --static void do_invoke_oom(struct scan_control *sc, bool try_memcg) { -- struct oom_control oc = { -- .gfp_mask = sc->gfp_mask, -- .order = sc->order, -- }; + static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) +@@ -588,280 +479,6 @@ static int xctr_crypt(struct skcipher_request *req) + } + return err; + } - -- if (try_memcg && mem_cgroup_oom_synchronize(true)) -- return; -- -- if (!mutex_trylock(&oom_lock)) -- return; -- out_of_memory(&oc); -- mutex_unlock(&oom_lock); --} --#define invoke_oom(sc) do_invoke_oom(sc, true) --#define invoke_oom_nomemcg(sc) do_invoke_oom(sc, false) -- - static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - { - struct mem_cgroup *memcg; -@@ -4023,96 +3960,14 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) - * younger than min_ttl. However, another possibility is all memcgs are - * either too small or below min. 
- */ -- invoke_oom_nomemcg(sc); --} -- --int vm_workingset_protection_update_handler(struct ctl_table *table, int write, -- void __user *buffer, size_t *lenp, loff_t *ppos) +-static int aes_gcm_derive_hash_subkey(const struct crypto_aes_ctx *aes_key, +- u8 hash_subkey[AES_BLOCK_SIZE]) -{ -- int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); -- if (ret || !write) -- return ret; +- static const u8 zeroes[AES_BLOCK_SIZE]; - -- workingset_protection_prev_totalram = 0; +- aes_encrypt(aes_key, hash_subkey, zeroes); +- return 0; +-} +- +-static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key, +- unsigned int key_len) +-{ +- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead); +- +- if (key_len < 4) +- return -EINVAL; +- +- /*Account for 4 byte nonce at the end.*/ +- key_len -= 4; +- +- memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce)); +- +- return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: +- aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, +- ctx->hash_subkey); +-} +- +-/* This is the Integrity Check Value (aka the authentication tag) length and can +- * be 8, 12 or 16 bytes long. */ +-static int common_rfc4106_set_authsize(struct crypto_aead *aead, +- unsigned int authsize) +-{ +- switch (authsize) { +- case 8: +- case 12: +- case 16: +- break; +- default: +- return -EINVAL; +- } - - return 0; -} - --static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) +-static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, +- unsigned int authsize) -{ -- unsigned long node_mem_total; -- struct sysinfo i; -- -- if (!(sysctl_workingset_protection)) { -- sc->anon_below_min = 0; -- sc->clean_below_low = 0; -- sc->clean_below_min = 0; -- return; +- switch (authsize) { +- case 4: +- case 8: +- case 12: +- case 13: +- case 14: +- case 15: +- case 16: +- break; +- default: +- return -EINVAL; - } - -- if (likely(sysctl_anon_min_ratio || -- sysctl_clean_low_ratio || -- sysctl_clean_min_ratio)) { --#ifdef CONFIG_NUMA -- si_meminfo_node(&i, pgdat->node_id); --#else //CONFIG_NUMA -- si_meminfo(&i); --#endif //CONFIG_NUMA -- node_mem_total = i.totalram; +- return 0; +-} - -- if (unlikely(workingset_protection_prev_totalram != node_mem_total)) { -- sysctl_anon_min_ratio_kb = -- node_mem_total * sysctl_anon_min_ratio / 100; -- sysctl_clean_low_ratio_kb = -- node_mem_total * sysctl_clean_low_ratio / 100; -- sysctl_clean_min_ratio_kb = -- node_mem_total * sysctl_clean_min_ratio / 100; -- workingset_protection_prev_totalram = node_mem_total; -- } -- } +-static int gcmaes_crypt_by_sg(bool enc, struct aead_request *req, +- unsigned int assoclen, u8 *hash_subkey, +- u8 *iv, void *aes_ctx, u8 *auth_tag, +- unsigned long auth_tag_len) +-{ +- u8 databuf[sizeof(struct gcm_context_data) + (AESNI_ALIGN - 8)] __aligned(8); +- struct gcm_context_data *data = PTR_ALIGN((void *)databuf, AESNI_ALIGN); +- unsigned long left = req->cryptlen; +- struct scatter_walk assoc_sg_walk; +- struct skcipher_walk walk; +- bool do_avx, do_avx2; +- u8 *assocmem = NULL; +- u8 *assoc; +- int err; - -- /* -- * Check the number of anonymous pages to protect them from -- * reclaiming if their amount is below the specified. 
-- */ -- if (sysctl_anon_min_ratio) { -- unsigned long reclaimable_anon; +- if (!enc) +- left -= auth_tag_len; - -- reclaimable_anon = -- node_page_state(pgdat, NR_ACTIVE_ANON) + -- node_page_state(pgdat, NR_INACTIVE_ANON) + -- node_page_state(pgdat, NR_ISOLATED_ANON); -+ if (mutex_trylock(&oom_lock)) { -+ struct oom_control oc = { -+ .gfp_mask = sc->gfp_mask, -+ }; - -- sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb; -- } else -- sc->anon_below_min = 0; -+ out_of_memory(&oc); - -- /* -- * Check the number of clean file pages to protect them from -- * reclaiming if their amount is below the specified. -- */ -- if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) { -- unsigned long reclaimable_file, dirty, clean; +- do_avx = (left >= AVX_GEN2_OPTSIZE); +- do_avx2 = (left >= AVX_GEN4_OPTSIZE); - -- reclaimable_file = -- node_page_state(pgdat, NR_ACTIVE_FILE) + -- node_page_state(pgdat, NR_INACTIVE_FILE) + -- node_page_state(pgdat, NR_ISOLATED_FILE); -- dirty = node_page_state(pgdat, NR_FILE_DIRTY); -- /* -- * node_page_state() sum can go out of sync since -- * all the values are not read at once. -- */ -- if (likely(reclaimable_file > dirty)) -- clean = reclaimable_file - dirty; -- else -- clean = 0; -- -- sc->clean_below_low = clean < sysctl_clean_low_ratio_kb; -- sc->clean_below_min = clean < sysctl_clean_min_ratio_kb; +- /* Linearize assoc, if not already linear */ +- if (req->src->length >= assoclen && req->src->length) { +- scatterwalk_start(&assoc_sg_walk, req->src); +- assoc = scatterwalk_map(&assoc_sg_walk); - } else { -- sc->clean_below_low = 0; -- sc->clean_below_min = 0; -+ mutex_unlock(&oom_lock); - } - } - -@@ -4615,12 +4470,6 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw - */ - if (!swappiness) - type = LRU_GEN_FILE; -- else if (sc->clean_below_min) -- type = LRU_GEN_ANON; -- else if (sc->anon_below_min) -- type = LRU_GEN_FILE; -- else if (sc->clean_below_low) -- type = LRU_GEN_ANON; - else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) - type = LRU_GEN_ANON; - else if (swappiness == 1) -@@ -4630,7 +4479,7 @@ static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int sw - else - type = get_type_to_scan(lruvec, swappiness, &tier); - -- for (i = 0; i < ANON_AND_FILE; i++) { -+ for (i = !swappiness; i < ANON_AND_FILE; i++) { - if (tier < 0) - tier = get_tier_idx(lruvec, type); - -@@ -4908,7 +4757,6 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) - struct mem_cgroup *memcg = lruvec_memcg(lruvec); - struct pglist_data *pgdat = lruvec_pgdat(lruvec); - -- prepare_workingset_protection(pgdat, sc); - mem_cgroup_calculate_protection(NULL, memcg); - - if (mem_cgroup_below_min(NULL, memcg)) -@@ -6059,8 +5907,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - - prepare_scan_control(pgdat, sc); - -- prepare_workingset_protection(pgdat, sc); +- gfp_t flags = (req->base.flags & CRYPTO_TFM_REQ_MAY_SLEEP) ? 
+- GFP_KERNEL : GFP_ATOMIC; - - shrink_node_memcgs(pgdat, sc); +- /* assoc can be any length, so must be on heap */ +- assocmem = kmalloc(assoclen, flags); +- if (unlikely(!assocmem)) +- return -ENOMEM; +- assoc = assocmem; +- +- scatterwalk_map_and_copy(assoc, req->src, 0, assoclen, 0); +- } +- +- kernel_fpu_begin(); +- if (static_branch_likely(&gcm_use_avx2) && do_avx2) +- aesni_gcm_init_avx_gen4(aes_ctx, data, iv, hash_subkey, assoc, +- assoclen); +- else if (static_branch_likely(&gcm_use_avx) && do_avx) +- aesni_gcm_init_avx_gen2(aes_ctx, data, iv, hash_subkey, assoc, +- assoclen); +- else +- aesni_gcm_init(aes_ctx, data, iv, hash_subkey, assoc, assoclen); +- kernel_fpu_end(); +- +- if (!assocmem) +- scatterwalk_unmap(assoc); +- else +- kfree(assocmem); +- +- err = enc ? skcipher_walk_aead_encrypt(&walk, req, false) +- : skcipher_walk_aead_decrypt(&walk, req, false); +- +- while (walk.nbytes > 0) { +- kernel_fpu_begin(); +- if (static_branch_likely(&gcm_use_avx2) && do_avx2) { +- if (enc) +- aesni_gcm_enc_update_avx_gen4(aes_ctx, data, +- walk.dst.virt.addr, +- walk.src.virt.addr, +- walk.nbytes); +- else +- aesni_gcm_dec_update_avx_gen4(aes_ctx, data, +- walk.dst.virt.addr, +- walk.src.virt.addr, +- walk.nbytes); +- } else if (static_branch_likely(&gcm_use_avx) && do_avx) { +- if (enc) +- aesni_gcm_enc_update_avx_gen2(aes_ctx, data, +- walk.dst.virt.addr, +- walk.src.virt.addr, +- walk.nbytes); +- else +- aesni_gcm_dec_update_avx_gen2(aes_ctx, data, +- walk.dst.virt.addr, +- walk.src.virt.addr, +- walk.nbytes); +- } else if (enc) { +- aesni_gcm_enc_update(aes_ctx, data, walk.dst.virt.addr, +- walk.src.virt.addr, walk.nbytes); +- } else { +- aesni_gcm_dec_update(aes_ctx, data, walk.dst.virt.addr, +- walk.src.virt.addr, walk.nbytes); +- } +- kernel_fpu_end(); +- +- err = skcipher_walk_done(&walk, 0); +- } +- +- if (err) +- return err; +- +- kernel_fpu_begin(); +- if (static_branch_likely(&gcm_use_avx2) && do_avx2) +- aesni_gcm_finalize_avx_gen4(aes_ctx, data, auth_tag, +- auth_tag_len); +- else if (static_branch_likely(&gcm_use_avx) && do_avx) +- aesni_gcm_finalize_avx_gen2(aes_ctx, data, auth_tag, +- auth_tag_len); +- else +- aesni_gcm_finalize(aes_ctx, data, auth_tag, auth_tag_len); +- kernel_fpu_end(); +- +- return 0; +-} +- +-static int gcmaes_encrypt(struct aead_request *req, unsigned int assoclen, +- u8 *hash_subkey, u8 *iv, void *aes_ctx) +-{ +- struct crypto_aead *tfm = crypto_aead_reqtfm(req); +- unsigned long auth_tag_len = crypto_aead_authsize(tfm); +- u8 auth_tag[16]; +- int err; +- +- err = gcmaes_crypt_by_sg(true, req, assoclen, hash_subkey, iv, aes_ctx, +- auth_tag, auth_tag_len); +- if (err) +- return err; +- +- scatterwalk_map_and_copy(auth_tag, req->dst, +- req->assoclen + req->cryptlen, +- auth_tag_len, 1); +- return 0; +-} +- +-static int gcmaes_decrypt(struct aead_request *req, unsigned int assoclen, +- u8 *hash_subkey, u8 *iv, void *aes_ctx) +-{ +- struct crypto_aead *tfm = crypto_aead_reqtfm(req); +- unsigned long auth_tag_len = crypto_aead_authsize(tfm); +- u8 auth_tag_msg[16]; +- u8 auth_tag[16]; +- int err; +- +- err = gcmaes_crypt_by_sg(false, req, assoclen, hash_subkey, iv, aes_ctx, +- auth_tag, auth_tag_len); +- if (err) +- return err; +- +- /* Copy out original auth_tag */ +- scatterwalk_map_and_copy(auth_tag_msg, req->src, +- req->assoclen + req->cryptlen - auth_tag_len, +- auth_tag_len, 0); +- +- /* Compare generated tag with passed in tag. 
*/ +- if (crypto_memneq(auth_tag_msg, auth_tag, auth_tag_len)) { +- memzero_explicit(auth_tag, sizeof(auth_tag)); +- return -EBADMSG; +- } +- return 0; +-} +- +-static int helper_rfc4106_encrypt(struct aead_request *req) +-{ +- struct crypto_aead *tfm = crypto_aead_reqtfm(req); +- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); +- void *aes_ctx = &(ctx->aes_key_expanded); +- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); +- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); +- unsigned int i; +- __be32 counter = cpu_to_be32(1); +- +- /* Assuming we are supporting rfc4106 64-bit extended */ +- /* sequence numbers We need to have the AAD length equal */ +- /* to 16 or 20 bytes */ +- if (unlikely(req->assoclen != 16 && req->assoclen != 20)) +- return -EINVAL; +- +- /* IV below built */ +- for (i = 0; i < 4; i++) +- *(iv+i) = ctx->nonce[i]; +- for (i = 0; i < 8; i++) +- *(iv+4+i) = req->iv[i]; +- *((__be32 *)(iv+12)) = counter; +- +- return gcmaes_encrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, +- aes_ctx); +-} +- +-static int helper_rfc4106_decrypt(struct aead_request *req) +-{ +- __be32 counter = cpu_to_be32(1); +- struct crypto_aead *tfm = crypto_aead_reqtfm(req); +- struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm); +- void *aes_ctx = &(ctx->aes_key_expanded); +- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); +- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); +- unsigned int i; +- +- if (unlikely(req->assoclen != 16 && req->assoclen != 20)) +- return -EINVAL; +- +- /* Assuming we are supporting rfc4106 64-bit extended */ +- /* sequence numbers We need to have the AAD length */ +- /* equal to 16 or 20 bytes */ +- +- /* IV below built */ +- for (i = 0; i < 4; i++) +- *(iv+i) = ctx->nonce[i]; +- for (i = 0; i < 8; i++) +- *(iv+4+i) = req->iv[i]; +- *((__be32 *)(iv+12)) = counter; +- +- return gcmaes_decrypt(req, req->assoclen - 8, ctx->hash_subkey, iv, +- aes_ctx); +-} + #endif - flush_reclaim_state(sc); -@@ -6149,8 +5995,6 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) - */ - if (reclaimable) - pgdat->kswapd_failures = 0; -- else if (sc->clean_below_min && !sc->priority) -- invoke_oom(sc); + static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, +@@ -1216,11 +833,717 @@ DEFINE_XTS_ALG(vaes_avx10_256, "xts-aes-vaes-avx10_256", 700); + DEFINE_XTS_ALG(vaes_avx10_512, "xts-aes-vaes-avx10_512", 800); + #endif + ++/* The common part of the x86_64 AES-GCM key struct */ ++struct aes_gcm_key { ++ /* Expanded AES key and the AES key length in bytes */ ++ struct crypto_aes_ctx aes_key; ++ ++ /* RFC4106 nonce (used only by the rfc4106 algorithms) */ ++ u32 rfc4106_nonce; ++}; ++ ++/* Key struct used by the AES-NI implementations of AES-GCM */ ++struct aes_gcm_key_aesni { ++ /* ++ * Common part of the key. The assembly code requires 16-byte alignment ++ * for the round keys; we get this by them being located at the start of ++ * the struct and the whole struct being 16-byte aligned. ++ */ ++ struct aes_gcm_key base; ++ ++ /* ++ * Powers of the hash key H^8 through H^1. These are 128-bit values. ++ * They all have an extra factor of x^-1 and are byte-reversed. 16-byte ++ * alignment is required by the assembly code. ++ */ ++ u64 h_powers[8][2] __aligned(16); ++ ++ /* ++ * h_powers_xored[i] contains the two 64-bit halves of h_powers[i] XOR'd ++ * together. It's used for Karatsuba multiplication. 16-byte alignment ++ * is required by the assembly code. 
++ */ ++ u64 h_powers_xored[8] __aligned(16); ++ ++ /* ++ * H^1 times x^64 (and also the usual extra factor of x^-1). 16-byte ++ * alignment is required by the assembly code. ++ */ ++ u64 h_times_x64[2] __aligned(16); ++}; ++#define AES_GCM_KEY_AESNI(key) \ ++ container_of((key), struct aes_gcm_key_aesni, base) ++#define AES_GCM_KEY_AESNI_SIZE \ ++ (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) ++ ++/* Key struct used by the VAES + AVX10 implementations of AES-GCM */ ++struct aes_gcm_key_avx10 { ++ /* ++ * Common part of the key. The assembly code prefers 16-byte alignment ++ * for the round keys; we get this by them being located at the start of ++ * the struct and the whole struct being 64-byte aligned. ++ */ ++ struct aes_gcm_key base; ++ ++ /* ++ * Powers of the hash key H^16 through H^1. These are 128-bit values. ++ * They all have an extra factor of x^-1 and are byte-reversed. This ++ * array is aligned to a 64-byte boundary to make it naturally aligned ++ * for 512-bit loads, which can improve performance. (The assembly code ++ * doesn't *need* the alignment; this is just an optimization.) ++ */ ++ u64 h_powers[16][2] __aligned(64); ++ ++ /* Three padding blocks required by the assembly code */ ++ u64 padding[3][2]; ++}; ++#define AES_GCM_KEY_AVX10(key) \ ++ container_of((key), struct aes_gcm_key_avx10, base) ++#define AES_GCM_KEY_AVX10_SIZE \ ++ (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) ++ ++/* ++ * These flags are passed to the AES-GCM helper functions to specify the ++ * specific version of AES-GCM (RFC4106 or not), whether it's encryption or ++ * decryption, and which assembly functions should be called. Assembly ++ * functions are selected using flags instead of function pointers to avoid ++ * indirect calls (which are very expensive on x86) regardless of inlining. ++ */ ++#define FLAG_RFC4106 BIT(0) ++#define FLAG_ENC BIT(1) ++#define FLAG_AVX BIT(2) ++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) ++# define FLAG_AVX10_256 BIT(3) ++# define FLAG_AVX10_512 BIT(4) ++#else ++ /* ++ * This should cause all calls to the AVX10 assembly functions to be ++ * optimized out, avoiding the need to ifdef each call individually. ++ */ ++# define FLAG_AVX10_256 0 ++# define FLAG_AVX10_512 0 ++#endif ++ ++static inline struct aes_gcm_key * ++aes_gcm_key_get(struct crypto_aead *tfm, int flags) ++{ ++ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) ++ return PTR_ALIGN(crypto_aead_ctx(tfm), 64); ++ else ++ return PTR_ALIGN(crypto_aead_ctx(tfm), 16); ++} ++ ++asmlinkage void ++aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); ++asmlinkage void ++aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); ++asmlinkage void ++aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); ++asmlinkage void ++aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); ++ ++static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) ++{ ++ /* ++ * To make things a bit easier on the assembly side, the AVX10 ++ * implementations use the same key format. Therefore, a single ++ * function using 256-bit vectors would suffice here. However, it's ++ * straightforward to provide a 512-bit one because of how the assembly ++ * code is structured, and it works nicely because the total size of the ++ * key powers is a multiple of 512 bits. So we take advantage of that. ++ * ++ * A similar situation applies to the AES-NI implementations. 
++ */ ++ if (flags & FLAG_AVX10_512) ++ aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); ++ else if (flags & FLAG_AVX10_256) ++ aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); ++ else if (flags & FLAG_AVX) ++ aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); ++ else ++ aes_gcm_precompute_aesni(AES_GCM_KEY_AESNI(key)); ++} ++ ++asmlinkage void ++aes_gcm_aad_update_aesni(const struct aes_gcm_key_aesni *key, ++ u8 ghash_acc[16], const u8 *aad, int aadlen); ++asmlinkage void ++aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, ++ u8 ghash_acc[16], const u8 *aad, int aadlen); ++asmlinkage void ++aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, ++ u8 ghash_acc[16], const u8 *aad, int aadlen); ++ ++static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], ++ const u8 *aad, int aadlen, int flags) ++{ ++ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) ++ aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, ++ aad, aadlen); ++ else if (flags & FLAG_AVX) ++ aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, ++ aad, aadlen); ++ else ++ aes_gcm_aad_update_aesni(AES_GCM_KEY_AESNI(key), ghash_acc, ++ aad, aadlen); ++} ++ ++asmlinkage void ++aes_gcm_enc_update_aesni(const struct aes_gcm_key_aesni *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); ++asmlinkage void ++aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); ++asmlinkage void ++aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); ++asmlinkage void ++aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); ++ ++asmlinkage void ++aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); ++asmlinkage void ++aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); ++asmlinkage void ++aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); ++asmlinkage void ++aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); ++ ++/* __always_inline to optimize out the branches based on @flags */ ++static __always_inline void ++aes_gcm_update(const struct aes_gcm_key *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen, int flags) ++{ ++ if (flags & FLAG_ENC) { ++ if (flags & FLAG_AVX10_512) ++ aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else if (flags & FLAG_AVX10_256) ++ aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else if (flags & FLAG_AVX) ++ aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else ++ aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, ++ ghash_acc, src, dst, datalen); ++ } else { ++ if (flags & FLAG_AVX10_512) ++ aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else if (flags & 
FLAG_AVX10_256) ++ aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else if (flags & FLAG_AVX) ++ aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else ++ aes_gcm_dec_update_aesni(AES_GCM_KEY_AESNI(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ } ++} ++ ++asmlinkage void ++aes_gcm_enc_final_aesni(const struct aes_gcm_key_aesni *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen); ++asmlinkage void ++aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen); ++asmlinkage void ++aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen); ++ ++/* __always_inline to optimize out the branches based on @flags */ ++static __always_inline void ++aes_gcm_enc_final(const struct aes_gcm_key *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen, int flags) ++{ ++ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) ++ aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen); ++ else if (flags & FLAG_AVX) ++ aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen); ++ else ++ aes_gcm_enc_final_aesni(AES_GCM_KEY_AESNI(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen); ++} ++ ++asmlinkage bool __must_check ++aes_gcm_dec_final_aesni(const struct aes_gcm_key_aesni *key, ++ const u32 le_ctr[4], const u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen, ++ const u8 tag[16], int taglen); ++asmlinkage bool __must_check ++aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, ++ const u32 le_ctr[4], const u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen, ++ const u8 tag[16], int taglen); ++asmlinkage bool __must_check ++aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, ++ const u32 le_ctr[4], const u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen, ++ const u8 tag[16], int taglen); ++ ++/* __always_inline to optimize out the branches based on @flags */ ++static __always_inline bool __must_check ++aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], ++ u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, ++ u8 tag[16], int taglen, int flags) ++{ ++ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) ++ return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen, ++ tag, taglen); ++ else if (flags & FLAG_AVX) ++ return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen, ++ tag, taglen); ++ else ++ return aes_gcm_dec_final_aesni(AES_GCM_KEY_AESNI(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen, ++ tag, taglen); ++} ++ ++/* ++ * This is the Integrity Check Value (aka the authentication tag) length and can ++ * be 8, 12 or 16 bytes long. 
++ */ ++static int common_rfc4106_set_authsize(struct crypto_aead *aead, ++ unsigned int authsize) ++{ ++ switch (authsize) { ++ case 8: ++ case 12: ++ case 16: ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++static int generic_gcmaes_set_authsize(struct crypto_aead *tfm, ++ unsigned int authsize) ++{ ++ switch (authsize) { ++ case 4: ++ case 8: ++ case 12: ++ case 13: ++ case 14: ++ case 15: ++ case 16: ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++/* ++ * This is the setkey function for the x86_64 implementations of AES-GCM. It ++ * saves the RFC4106 nonce if applicable, expands the AES key, and precomputes ++ * powers of the hash key. ++ * ++ * To comply with the crypto_aead API, this has to be usable in no-SIMD context. ++ * For that reason, this function includes a portable C implementation of the ++ * needed logic. However, the portable C implementation is very slow, taking ++ * about the same time as encrypting 37 KB of data. To be ready for users that ++ * may set a key even somewhat frequently, we therefore also include a SIMD ++ * assembly implementation, expanding the AES key using AES-NI and precomputing ++ * the hash key powers using PCLMULQDQ or VPCLMULQDQ. ++ */ ++static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, ++ unsigned int keylen, int flags) ++{ ++ struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); ++ int err; ++ ++ if (flags & FLAG_RFC4106) { ++ if (keylen < 4) ++ return -EINVAL; ++ keylen -= 4; ++ key->rfc4106_nonce = get_unaligned_be32(raw_key + keylen); ++ } ++ ++ /* The assembly code assumes the following offsets. */ ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_enc) != 0); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, base.aes_key.key_length) != 480); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); ++ ++ if (likely(crypto_simd_usable())) { ++ err = aes_check_keylen(keylen); ++ if (err) ++ return err; ++ kernel_fpu_begin(); ++ aesni_set_key(&key->aes_key, raw_key, keylen); ++ aes_gcm_precompute(key, flags); ++ kernel_fpu_end(); ++ } else { ++ static const u8 x_to_the_minus1[16] __aligned(__alignof__(be128)) = { ++ [0] = 0xc2, [15] = 1 ++ }; ++ static const u8 x_to_the_63[16] __aligned(__alignof__(be128)) = { ++ [7] = 1, ++ }; ++ be128 h1 = {}; ++ be128 h; ++ int i; ++ ++ err = aes_expandkey(&key->aes_key, raw_key, keylen); ++ if (err) ++ return err; ++ ++ /* Encrypt the all-zeroes block to get the hash key H^1 */ ++ aes_encrypt(&key->aes_key, (u8 *)&h1, (u8 *)&h1); ++ ++ /* Compute H^1 * x^-1 */ ++ h = h1; ++ gf128mul_lle(&h, (const be128 *)x_to_the_minus1); ++ ++ /* Compute the needed key powers */ ++ if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { ++ struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); ++ ++ for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { ++ k->h_powers[i][0] = be64_to_cpu(h.b); ++ k->h_powers[i][1] = be64_to_cpu(h.a); ++ gf128mul_lle(&h, &h1); ++ } ++ memset(k->padding, 0, sizeof(k->padding)); ++ } else { ++ struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); ++ ++ for (i = 
ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { ++ k->h_powers[i][0] = be64_to_cpu(h.b); ++ k->h_powers[i][1] = be64_to_cpu(h.a); ++ k->h_powers_xored[i] = k->h_powers[i][0] ^ ++ k->h_powers[i][1]; ++ gf128mul_lle(&h, &h1); ++ } ++ gf128mul_lle(&h1, (const be128 *)x_to_the_63); ++ k->h_times_x64[0] = be64_to_cpu(h1.b); ++ k->h_times_x64[1] = be64_to_cpu(h1.a); ++ } ++ } ++ return 0; ++} ++ ++/* ++ * Initialize @ghash_acc, then pass all @assoclen bytes of associated data ++ * (a.k.a. additional authenticated data) from @sg_src through the GHASH update ++ * assembly function. kernel_fpu_begin() must have already been called. ++ */ ++static void gcm_process_assoc(const struct aes_gcm_key *key, u8 ghash_acc[16], ++ struct scatterlist *sg_src, unsigned int assoclen, ++ int flags) ++{ ++ struct scatter_walk walk; ++ /* ++ * The assembly function requires that the length of any non-last ++ * segment of associated data be a multiple of 16 bytes, so this ++ * function does the buffering needed to achieve that. ++ */ ++ unsigned int pos = 0; ++ u8 buf[16]; ++ ++ memset(ghash_acc, 0, 16); ++ scatterwalk_start(&walk, sg_src); ++ ++ while (assoclen) { ++ unsigned int len_this_page = scatterwalk_clamp(&walk, assoclen); ++ void *mapped = scatterwalk_map(&walk); ++ const void *src = mapped; ++ unsigned int len; ++ ++ assoclen -= len_this_page; ++ scatterwalk_advance(&walk, len_this_page); ++ if (unlikely(pos)) { ++ len = min(len_this_page, 16 - pos); ++ memcpy(&buf[pos], src, len); ++ pos += len; ++ src += len; ++ len_this_page -= len; ++ if (pos < 16) ++ goto next; ++ aes_gcm_aad_update(key, ghash_acc, buf, 16, flags); ++ pos = 0; ++ } ++ len = len_this_page; ++ if (unlikely(assoclen)) /* Not the last segment yet? */ ++ len = round_down(len, 16); ++ aes_gcm_aad_update(key, ghash_acc, src, len, flags); ++ src += len; ++ len_this_page -= len; ++ if (unlikely(len_this_page)) { ++ memcpy(buf, src, len_this_page); ++ pos = len_this_page; ++ } ++next: ++ scatterwalk_unmap(mapped); ++ scatterwalk_pagedone(&walk, 0, assoclen); ++ if (need_resched()) { ++ kernel_fpu_end(); ++ kernel_fpu_begin(); ++ } ++ } ++ if (unlikely(pos)) ++ aes_gcm_aad_update(key, ghash_acc, buf, pos, flags); ++} ++ ++ ++/* __always_inline to optimize out the branches based on @flags */ ++static __always_inline int ++gcm_crypt(struct aead_request *req, int flags) ++{ ++ struct crypto_aead *tfm = crypto_aead_reqtfm(req); ++ const struct aes_gcm_key *key = aes_gcm_key_get(tfm, flags); ++ unsigned int assoclen = req->assoclen; ++ struct skcipher_walk walk; ++ unsigned int nbytes; ++ u8 ghash_acc[16]; /* GHASH accumulator */ ++ u32 le_ctr[4]; /* Counter in little-endian format */ ++ int taglen; ++ int err; ++ ++ /* Initialize the counter and determine the associated data length. */ ++ le_ctr[0] = 2; ++ if (flags & FLAG_RFC4106) { ++ if (unlikely(assoclen != 16 && assoclen != 20)) ++ return -EINVAL; ++ assoclen -= 8; ++ le_ctr[1] = get_unaligned_be32(req->iv + 4); ++ le_ctr[2] = get_unaligned_be32(req->iv + 0); ++ le_ctr[3] = key->rfc4106_nonce; /* already byte-swapped */ ++ } else { ++ le_ctr[1] = get_unaligned_be32(req->iv + 8); ++ le_ctr[2] = get_unaligned_be32(req->iv + 4); ++ le_ctr[3] = get_unaligned_be32(req->iv + 0); ++ } ++ ++ /* Begin walking through the plaintext or ciphertext. 
*/ ++ if (flags & FLAG_ENC) ++ err = skcipher_walk_aead_encrypt(&walk, req, false); ++ else ++ err = skcipher_walk_aead_decrypt(&walk, req, false); ++ ++ /* ++ * Since the AES-GCM assembly code requires that at least three assembly ++ * functions be called to process any message (this is needed to support ++ * incremental updates cleanly), to reduce overhead we try to do all ++ * three calls in the same kernel FPU section if possible. We close the ++ * section and start a new one if there are multiple data segments or if ++ * rescheduling is needed while processing the associated data. ++ */ ++ kernel_fpu_begin(); ++ ++ /* Pass the associated data through GHASH. */ ++ gcm_process_assoc(key, ghash_acc, req->src, assoclen, flags); ++ ++ /* En/decrypt the data and pass the ciphertext through GHASH. */ ++ while ((nbytes = walk.nbytes) != 0) { ++ if (unlikely(nbytes < walk.total)) { ++ /* ++ * Non-last segment. In this case, the assembly ++ * function requires that the length be a multiple of 16 ++ * (AES_BLOCK_SIZE) bytes. The needed buffering of up ++ * to 16 bytes is handled by the skcipher_walk. Here we ++ * just need to round down to a multiple of 16. ++ */ ++ nbytes = round_down(nbytes, AES_BLOCK_SIZE); ++ aes_gcm_update(key, le_ctr, ghash_acc, ++ walk.src.virt.addr, walk.dst.virt.addr, ++ nbytes, flags); ++ le_ctr[0] += nbytes / AES_BLOCK_SIZE; ++ kernel_fpu_end(); ++ err = skcipher_walk_done(&walk, walk.nbytes - nbytes); ++ kernel_fpu_begin(); ++ } else { ++ /* Last segment: process all remaining data. */ ++ aes_gcm_update(key, le_ctr, ghash_acc, ++ walk.src.virt.addr, walk.dst.virt.addr, ++ nbytes, flags); ++ err = skcipher_walk_done(&walk, 0); ++ /* ++ * The low word of the counter isn't used by the ++ * finalize, so there's no need to increment it here. ++ */ ++ } ++ } ++ if (err) ++ goto out; ++ ++ /* Finalize */ ++ taglen = crypto_aead_authsize(tfm); ++ if (flags & FLAG_ENC) { ++ /* Finish computing the auth tag. */ ++ aes_gcm_enc_final(key, le_ctr, ghash_acc, assoclen, ++ req->cryptlen, flags); ++ ++ /* Store the computed auth tag in the dst scatterlist. */ ++ scatterwalk_map_and_copy(ghash_acc, req->dst, req->assoclen + ++ req->cryptlen, taglen, 1); ++ } else { ++ unsigned int datalen = req->cryptlen - taglen; ++ u8 tag[16]; ++ ++ /* Get the transmitted auth tag from the src scatterlist. */ ++ scatterwalk_map_and_copy(tag, req->src, req->assoclen + datalen, ++ taglen, 0); ++ /* ++ * Finish computing the auth tag and compare it to the ++ * transmitted one. The assembly function does the actual tag ++ * comparison. Here, just check the boolean result. 
++ */ ++ if (!aes_gcm_dec_final(key, le_ctr, ghash_acc, assoclen, ++ datalen, tag, taglen, flags)) ++ err = -EBADMSG; ++ } ++out: ++ kernel_fpu_end(); ++ return err; ++} ++ ++#define DEFINE_GCM_ALGS(suffix, flags, generic_driver_name, rfc_driver_name, \ ++ ctxsize, priority) \ ++ \ ++static int gcm_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ ++ unsigned int keylen) \ ++{ \ ++ return gcm_setkey(tfm, raw_key, keylen, (flags)); \ ++} \ ++ \ ++static int gcm_encrypt_##suffix(struct aead_request *req) \ ++{ \ ++ return gcm_crypt(req, (flags) | FLAG_ENC); \ ++} \ ++ \ ++static int gcm_decrypt_##suffix(struct aead_request *req) \ ++{ \ ++ return gcm_crypt(req, (flags)); \ ++} \ ++ \ ++static int rfc4106_setkey_##suffix(struct crypto_aead *tfm, const u8 *raw_key, \ ++ unsigned int keylen) \ ++{ \ ++ return gcm_setkey(tfm, raw_key, keylen, (flags) | FLAG_RFC4106); \ ++} \ ++ \ ++static int rfc4106_encrypt_##suffix(struct aead_request *req) \ ++{ \ ++ return gcm_crypt(req, (flags) | FLAG_RFC4106 | FLAG_ENC); \ ++} \ ++ \ ++static int rfc4106_decrypt_##suffix(struct aead_request *req) \ ++{ \ ++ return gcm_crypt(req, (flags) | FLAG_RFC4106); \ ++} \ ++ \ ++static struct aead_alg aes_gcm_algs_##suffix[] = { { \ ++ .setkey = gcm_setkey_##suffix, \ ++ .setauthsize = generic_gcmaes_set_authsize, \ ++ .encrypt = gcm_encrypt_##suffix, \ ++ .decrypt = gcm_decrypt_##suffix, \ ++ .ivsize = GCM_AES_IV_SIZE, \ ++ .chunksize = AES_BLOCK_SIZE, \ ++ .maxauthsize = 16, \ ++ .base = { \ ++ .cra_name = "__gcm(aes)", \ ++ .cra_driver_name = "__" generic_driver_name, \ ++ .cra_priority = (priority), \ ++ .cra_flags = CRYPTO_ALG_INTERNAL, \ ++ .cra_blocksize = 1, \ ++ .cra_ctxsize = (ctxsize), \ ++ .cra_module = THIS_MODULE, \ ++ }, \ ++}, { \ ++ .setkey = rfc4106_setkey_##suffix, \ ++ .setauthsize = common_rfc4106_set_authsize, \ ++ .encrypt = rfc4106_encrypt_##suffix, \ ++ .decrypt = rfc4106_decrypt_##suffix, \ ++ .ivsize = GCM_RFC4106_IV_SIZE, \ ++ .chunksize = AES_BLOCK_SIZE, \ ++ .maxauthsize = 16, \ ++ .base = { \ ++ .cra_name = "__rfc4106(gcm(aes))", \ ++ .cra_driver_name = "__" rfc_driver_name, \ ++ .cra_priority = (priority), \ ++ .cra_flags = CRYPTO_ALG_INTERNAL, \ ++ .cra_blocksize = 1, \ ++ .cra_ctxsize = (ctxsize), \ ++ .cra_module = THIS_MODULE, \ ++ }, \ ++} }; \ ++ \ ++static struct simd_aead_alg *aes_gcm_simdalgs_##suffix[2] \ ++ ++/* aes_gcm_algs_aesni */ ++DEFINE_GCM_ALGS(aesni, /* no flags */ 0, ++ "generic-gcm-aesni", "rfc4106-gcm-aesni", ++ AES_GCM_KEY_AESNI_SIZE, 400); ++ ++/* aes_gcm_algs_aesni_avx */ ++DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, ++ "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", ++ AES_GCM_KEY_AESNI_SIZE, 500); ++ ++#if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) ++/* aes_gcm_algs_vaes_avx10_256 */ ++DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, ++ "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", ++ AES_GCM_KEY_AVX10_SIZE, 700); ++ ++/* aes_gcm_algs_vaes_avx10_512 */ ++DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, ++ "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", ++ AES_GCM_KEY_AVX10_SIZE, 800); ++#endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ ++ + /* + * This is a list of CPU models that are known to suffer from downclocking when +- * zmm registers (512-bit vectors) are used. On these CPUs, the AES-XTS +- * implementation with zmm registers won't be used by default. An +- * implementation with ymm registers (256-bit vectors) will be used instead. ++ * zmm registers (512-bit vectors) are used. 
On these CPUs, the AES mode ++ * implementations with zmm registers won't be used by default. Implementations ++ * with ymm registers (256-bit vectors) will be used by default instead. + */ + static const struct x86_cpu_id zmm_exclusion_list[] = { + X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), +@@ -1236,7 +1559,7 @@ static const struct x86_cpu_id zmm_exclusion_list[] = { + {}, + }; + +-static int __init register_xts_algs(void) ++static int __init register_avx_algs(void) + { + int err; + +@@ -1246,6 +1569,11 @@ static int __init register_xts_algs(void) + &aes_xts_simdalg_aesni_avx); + if (err) + return err; ++ err = simd_register_aeads_compat(aes_gcm_algs_aesni_avx, ++ ARRAY_SIZE(aes_gcm_algs_aesni_avx), ++ aes_gcm_simdalgs_aesni_avx); ++ if (err) ++ return err; + #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) + if (!boot_cpu_has(X86_FEATURE_AVX2) || + !boot_cpu_has(X86_FEATURE_VAES) || +@@ -1269,23 +1597,42 @@ static int __init register_xts_algs(void) + &aes_xts_simdalg_vaes_avx10_256); + if (err) + return err; ++ err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_256, ++ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), ++ aes_gcm_simdalgs_vaes_avx10_256); ++ if (err) ++ return err; ++ ++ if (x86_match_cpu(zmm_exclusion_list)) { ++ int i; + +- if (x86_match_cpu(zmm_exclusion_list)) + aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; ++ for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) ++ aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; ++ } + + err = simd_register_skciphers_compat(&aes_xts_alg_vaes_avx10_512, 1, + &aes_xts_simdalg_vaes_avx10_512); + if (err) + return err; ++ err = simd_register_aeads_compat(aes_gcm_algs_vaes_avx10_512, ++ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), ++ aes_gcm_simdalgs_vaes_avx10_512); ++ if (err) ++ return err; + #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ + return 0; } - /* +-static void unregister_xts_algs(void) ++static void unregister_avx_algs(void) + { + if (aes_xts_simdalg_aesni_avx) + simd_unregister_skciphers(&aes_xts_alg_aesni_avx, 1, + &aes_xts_simdalg_aesni_avx); ++ if (aes_gcm_simdalgs_aesni_avx[0]) ++ simd_unregister_aeads(aes_gcm_algs_aesni_avx, ++ ARRAY_SIZE(aes_gcm_algs_aesni_avx), ++ aes_gcm_simdalgs_aesni_avx); + #if defined(CONFIG_AS_VAES) && defined(CONFIG_AS_VPCLMULQDQ) + if (aes_xts_simdalg_vaes_avx2) + simd_unregister_skciphers(&aes_xts_alg_vaes_avx2, 1, +@@ -1293,106 +1640,33 @@ static void unregister_xts_algs(void) + if (aes_xts_simdalg_vaes_avx10_256) + simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_256, 1, + &aes_xts_simdalg_vaes_avx10_256); ++ if (aes_gcm_simdalgs_vaes_avx10_256[0]) ++ simd_unregister_aeads(aes_gcm_algs_vaes_avx10_256, ++ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256), ++ aes_gcm_simdalgs_vaes_avx10_256); + if (aes_xts_simdalg_vaes_avx10_512) + simd_unregister_skciphers(&aes_xts_alg_vaes_avx10_512, 1, + &aes_xts_simdalg_vaes_avx10_512); ++ if (aes_gcm_simdalgs_vaes_avx10_512[0]) ++ simd_unregister_aeads(aes_gcm_algs_vaes_avx10_512, ++ ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512), ++ aes_gcm_simdalgs_vaes_avx10_512); + #endif + } + #else /* CONFIG_X86_64 */ +-static int __init register_xts_algs(void) ++static struct aead_alg aes_gcm_algs_aesni[0]; ++static struct simd_aead_alg *aes_gcm_simdalgs_aesni[0]; ++ ++static int __init register_avx_algs(void) + { + return 0; + } + +-static void unregister_xts_algs(void) ++static void unregister_avx_algs(void) + { + } + #endif /* !CONFIG_X86_64 */ + +-#ifdef CONFIG_X86_64 +-static int generic_gcmaes_set_key(struct crypto_aead *aead, const u8 *key, +- 
unsigned int key_len) +-{ +- struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(aead); +- +- return aes_set_key_common(&ctx->aes_key_expanded, key, key_len) ?: +- aes_gcm_derive_hash_subkey(&ctx->aes_key_expanded, +- ctx->hash_subkey); +-} +- +-static int generic_gcmaes_encrypt(struct aead_request *req) +-{ +- struct crypto_aead *tfm = crypto_aead_reqtfm(req); +- struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); +- void *aes_ctx = &(ctx->aes_key_expanded); +- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); +- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); +- __be32 counter = cpu_to_be32(1); +- +- memcpy(iv, req->iv, 12); +- *((__be32 *)(iv+12)) = counter; +- +- return gcmaes_encrypt(req, req->assoclen, ctx->hash_subkey, iv, +- aes_ctx); +-} +- +-static int generic_gcmaes_decrypt(struct aead_request *req) +-{ +- __be32 counter = cpu_to_be32(1); +- struct crypto_aead *tfm = crypto_aead_reqtfm(req); +- struct generic_gcmaes_ctx *ctx = generic_gcmaes_ctx_get(tfm); +- void *aes_ctx = &(ctx->aes_key_expanded); +- u8 ivbuf[16 + (AESNI_ALIGN - 8)] __aligned(8); +- u8 *iv = PTR_ALIGN(&ivbuf[0], AESNI_ALIGN); +- +- memcpy(iv, req->iv, 12); +- *((__be32 *)(iv+12)) = counter; +- +- return gcmaes_decrypt(req, req->assoclen, ctx->hash_subkey, iv, +- aes_ctx); +-} +- +-static struct aead_alg aesni_aeads[] = { { +- .setkey = common_rfc4106_set_key, +- .setauthsize = common_rfc4106_set_authsize, +- .encrypt = helper_rfc4106_encrypt, +- .decrypt = helper_rfc4106_decrypt, +- .ivsize = GCM_RFC4106_IV_SIZE, +- .maxauthsize = 16, +- .base = { +- .cra_name = "__rfc4106(gcm(aes))", +- .cra_driver_name = "__rfc4106-gcm-aesni", +- .cra_priority = 400, +- .cra_flags = CRYPTO_ALG_INTERNAL, +- .cra_blocksize = 1, +- .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx), +- .cra_alignmask = 0, +- .cra_module = THIS_MODULE, +- }, +-}, { +- .setkey = generic_gcmaes_set_key, +- .setauthsize = generic_gcmaes_set_authsize, +- .encrypt = generic_gcmaes_encrypt, +- .decrypt = generic_gcmaes_decrypt, +- .ivsize = GCM_AES_IV_SIZE, +- .maxauthsize = 16, +- .base = { +- .cra_name = "__gcm(aes)", +- .cra_driver_name = "__generic-gcm-aesni", +- .cra_priority = 400, +- .cra_flags = CRYPTO_ALG_INTERNAL, +- .cra_blocksize = 1, +- .cra_ctxsize = sizeof(struct generic_gcmaes_ctx), +- .cra_alignmask = 0, +- .cra_module = THIS_MODULE, +- }, +-} }; +-#else +-static struct aead_alg aesni_aeads[0]; +-#endif +- +-static struct simd_aead_alg *aesni_simd_aeads[ARRAY_SIZE(aesni_aeads)]; +- + static const struct x86_cpu_id aesni_cpu_id[] = { + X86_MATCH_FEATURE(X86_FEATURE_AES, NULL), + {} +@@ -1406,17 +1680,6 @@ static int __init aesni_init(void) + if (!x86_match_cpu(aesni_cpu_id)) + return -ENODEV; + #ifdef CONFIG_X86_64 +- if (boot_cpu_has(X86_FEATURE_AVX2)) { +- pr_info("AVX2 version of gcm_enc/dec engaged.\n"); +- static_branch_enable(&gcm_use_avx); +- static_branch_enable(&gcm_use_avx2); +- } else +- if (boot_cpu_has(X86_FEATURE_AVX)) { +- pr_info("AVX version of gcm_enc/dec engaged.\n"); +- static_branch_enable(&gcm_use_avx); +- } else { +- pr_info("SSE version of gcm_enc/dec engaged.\n"); +- } + if (boot_cpu_has(X86_FEATURE_AVX)) { + /* optimize performance of ctr mode encryption transform */ + static_call_update(aesni_ctr_enc_tfm, aesni_ctr_enc_avx_tfm); +@@ -1434,8 +1697,9 @@ static int __init aesni_init(void) + if (err) + goto unregister_cipher; + +- err = simd_register_aeads_compat(aesni_aeads, ARRAY_SIZE(aesni_aeads), +- aesni_simd_aeads); ++ err = simd_register_aeads_compat(aes_gcm_algs_aesni, ++ 
ARRAY_SIZE(aes_gcm_algs_aesni), ++ aes_gcm_simdalgs_aesni); + if (err) + goto unregister_skciphers; + +@@ -1447,22 +1711,22 @@ static int __init aesni_init(void) + goto unregister_aeads; + #endif /* CONFIG_X86_64 */ + +- err = register_xts_algs(); ++ err = register_avx_algs(); + if (err) +- goto unregister_xts; ++ goto unregister_avx; + + return 0; + +-unregister_xts: +- unregister_xts_algs(); ++unregister_avx: ++ unregister_avx_algs(); + #ifdef CONFIG_X86_64 + if (aesni_simd_xctr) + simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); + unregister_aeads: + #endif /* CONFIG_X86_64 */ +- simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), +- aesni_simd_aeads); +- ++ simd_unregister_aeads(aes_gcm_algs_aesni, ++ ARRAY_SIZE(aes_gcm_algs_aesni), ++ aes_gcm_simdalgs_aesni); + unregister_skciphers: + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); +@@ -1473,8 +1737,9 @@ static int __init aesni_init(void) + + static void __exit aesni_exit(void) + { +- simd_unregister_aeads(aesni_aeads, ARRAY_SIZE(aesni_aeads), +- aesni_simd_aeads); ++ simd_unregister_aeads(aes_gcm_algs_aesni, ++ ARRAY_SIZE(aes_gcm_algs_aesni), ++ aes_gcm_simdalgs_aesni); + simd_unregister_skciphers(aesni_skciphers, ARRAY_SIZE(aesni_skciphers), + aesni_simd_skciphers); + crypto_unregister_alg(&aesni_cipher_alg); +@@ -1482,7 +1747,7 @@ static void __exit aesni_exit(void) + if (boot_cpu_has(X86_FEATURE_AVX)) + simd_unregister_skciphers(&aesni_xctr, 1, &aesni_simd_xctr); + #endif /* CONFIG_X86_64 */ +- unregister_xts_algs(); ++ unregister_avx_algs(); + } + + late_initcall(aesni_init); -- -2.44.0 +2.46.0.rc1 -From 4833f48c9738d6bb475df2e4c16be2ea26a7d91d Mon Sep 17 00:00:00 2001 +From 3a6187f4ef69fa4f0bf82ee5138e23bd83b85691 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Wed, 3 Apr 2024 17:07:02 +0200 -Subject: [PATCH 6/8] fixes +Date: Mon, 15 Jul 2024 13:25:57 +0200 +Subject: [PATCH 06/11] fixes Signed-off-by: Peter Jung --- - .../ABI/testing/sysfs-driver-hid-asus | 85 + - arch/Kconfig | 4 +- - drivers/hid/Makefile | 2 + - drivers/hid/{hid-asus.c => hid-asus-core.c} | 193 +-- - drivers/hid/hid-asus-rog.c | 1468 +++++++++++++++++ - drivers/hid/hid-asus-rog.h | 482 ++++++ - drivers/hid/hid-asus.h | 58 + - drivers/hid/hid-ids.h | 1 + - 8 files changed, 2174 insertions(+), 119 deletions(-) - create mode 100644 Documentation/ABI/testing/sysfs-driver-hid-asus - rename drivers/hid/{hid-asus.c => hid-asus-core.c} (89%) - create mode 100644 drivers/hid/hid-asus-rog.c - create mode 100644 drivers/hid/hid-asus-rog.h - create mode 100644 drivers/hid/hid-asus.h + arch/Kconfig | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) -diff --git a/Documentation/ABI/testing/sysfs-driver-hid-asus b/Documentation/ABI/testing/sysfs-driver-hid-asus -new file mode 100644 -index 000000000000..df5b0c5b0702 ---- /dev/null -+++ b/Documentation/ABI/testing/sysfs-driver-hid-asus -@@ -0,0 +1,85 @@ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/gamepad_mode -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: Set the mode the ROG Ally xpad operates in: -+ - 1 = Game mode -+ - 2 = WASD mode -+ - 3 = Mouse mode -+ This setting applies instantly and applies settings that were previously changed -+ under that mode which are: -+ - deadzones -+ - anti-deadzones -+ - button mapping -+ - button turbo settings -+ - response curves -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/apply -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: 
Apply the settings that have been stored in attributes so far. Because there are -+ many individual settings across a dozen packets this separation is required to -+ prevent spamming the MCU when userspace applications apply many changes at once. -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/reset_btn_mapping -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: Reset a gamepad mode to its default button mapping. -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/deadzone -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: Set the inner and outer deadzones of joysticks and triggers. These settings are not -+ written to the MCU until `apply` is set. -+ - range 0-64 (corresponds to 0-100%) -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/deadzone_index -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: Descriptive labels for joystick deadzone array. -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/anti-deadzone -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: Set the joystick anti-deadzone feature: -+ - range 0-32 (corresponds to 0-50%) -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/calibration -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: Calibration values for the joysticks and trigger analogues. There are no default -+ values as the calibration is determined in userspace. -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/calibration_index -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: Descriptive labels for joystick and triggers calibration array. -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/rc_point -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: Set the joystick response curve. There are 4 points available with 1 being the lowest -+ point and 4 being the highest point. -+ - range 0-64 (corresponds to 0-100%) -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/axis__/rc_point_index -+Date: December 2023 -+Contact: linux-input@vger.kernel.org -+Description: Descriptive labels for joystick response curve points. -+ -+What: /sys/bus/usb/devices/1-3:1.0/0003:0B05:1ABE.0001/btn_