From 151a70a0edfc9d717642595753e2b40ef725f363 Mon Sep 17 00:00:00 2001
From: Jacob Martin
Date: Thu, 26 Sep 2024 13:40:42 -0500
Subject: [PATCH 001/352] UBUNTU: [Packaging] Initialize noble:linux-nvidia-adv

Ignore: yes
Signed-off-by: Jacob Martin
---
 Ubuntu.md                                     |   14 +-
 debian.nvidia-adv/changelog                   | 1190 +++++++++++++++++
 debian.nvidia-adv/config/README.rst           |  185 +++
 debian.nvidia-adv/config/annotations          |  182 +++
 .../control.d/flavour-control.stub            |  153 +++
 .../control.d/nvidia-adv-64k.inclusion-list   |  304 +++++
 .../control.d/nvidia-adv.inclusion-list       |  304 +++++
 debian.nvidia-adv/control.d/vars.nvidia-adv   |    6 +
 .../control.d/vars.nvidia-adv-64k             |    6 +
 debian.nvidia-adv/control.stub.in             |   97 ++
 debian.nvidia-adv/copyright                   |   29 +
 debian.nvidia-adv/dkms-versions               |    3 +
 debian.nvidia-adv/etc/update.conf             |    7 +
 debian.nvidia-adv/modprobe.d/common.conf      |    3 +
 debian.nvidia-adv/reconstruct                 |   49 +
 debian.nvidia-adv/rules.d/amd64.mk            |   20 +
 debian.nvidia-adv/rules.d/arm64.mk            |   21 +
 debian/debian.env                             |    2 +-
 18 files changed, 2567 insertions(+), 8 deletions(-)
 create mode 100644 debian.nvidia-adv/changelog
 create mode 100644 debian.nvidia-adv/config/README.rst
 create mode 100644 debian.nvidia-adv/config/annotations
 create mode 100644 debian.nvidia-adv/control.d/flavour-control.stub
 create mode 100644 debian.nvidia-adv/control.d/nvidia-adv-64k.inclusion-list
 create mode 100644 debian.nvidia-adv/control.d/nvidia-adv.inclusion-list
 create mode 100644 debian.nvidia-adv/control.d/vars.nvidia-adv
 create mode 100644 debian.nvidia-adv/control.d/vars.nvidia-adv-64k
 create mode 100644 debian.nvidia-adv/control.stub.in
 create mode 100644 debian.nvidia-adv/copyright
 create mode 100644 debian.nvidia-adv/dkms-versions
 create mode 100644 debian.nvidia-adv/etc/update.conf
 create mode 100644 debian.nvidia-adv/modprobe.d/common.conf
 create mode 100644 debian.nvidia-adv/reconstruct
 create mode 100644 debian.nvidia-adv/rules.d/amd64.mk
 create mode 100644 debian.nvidia-adv/rules.d/arm64.mk

diff --git a/Ubuntu.md b/Ubuntu.md
index 1d7bea1caf7fc..0d2d196d7f73f 100644
--- a/Ubuntu.md
+++ b/Ubuntu.md
@@ -1,8 +1,8 @@
-Name: linux
-Version: 6.1.0
-Series: 23.04 (lunar)
+Name: linux-nvidia-adv
+Version: 6.8.0
+Series: 24.04 (noble)
 Description:
-  This is the source code for the Ubuntu linux kernel for the 23.04 series. This
-  source tree is used to produce the flavours: generic, generic-64k, generic-lpae.
-  This kernel is configured to support the widest range of desktop, laptop and
-  server configurations.
+  This is the source code for the Ubuntu Nvidia Tech Preview linux kernel for
+  the 24.04 series. This source tree is used to produce the flavours: nvidia-adv,
+  nvidia-adv-64k. This kernel is configured for use with Nvidia Tech Preview
+  images.
diff --git a/debian.nvidia-adv/changelog b/debian.nvidia-adv/changelog
new file mode 100644
index 0000000000000..da61fe688060c
--- /dev/null
+++ b/debian.nvidia-adv/changelog
@@ -0,0 +1,1190 @@
+linux-nvidia-adv (6.8.0-1000.0) noble; urgency=medium
+
+  * Initialize n/linux-nvidia-adv.
+
+ -- Jacob Martin  Thu, 26 Sep 2024 13:41:25 -0500
+
+linux (6.8.0-44.44) noble; urgency=medium
+
+  * noble/linux: 6.8.0-44.44 -proposed tracker (LP: #2076647)
+
+  * Packaging resync (LP: #1786013)
+    - [Packaging] debian.master/dkms-versions -- update from kernel-versions
+      (main/2024.08.05)
+
+  * Disable PCI_DYNAMIC_OF_NODES in Ubuntu (LP: #2074376)
+    - [Config] Disable PCI_DYNAMIC_OF_NODES
+
+  * [SRU] Turbostat support for Arrow Lake H (LP: #2074372)
+    - tools/power turbostat: Enhance ARL/LNL support
+    - x86/cpu: Add model number for another Intel Arrow Lake mobile processor
+    - tools/power turbostat: Add ARL-H support
+
+  * Noble update: upstream stable patchset 2024-07-30 (LP: #2075154)
+    - fs/writeback: bail out if there is no more inodes for IO and queued once
+    - padata: Disable BH when taking works lock on MT path
+    - crypto: hisilicon/sec - Fix memory leak for sec resource release
+    - crypto: hisilicon/qm - Add the err memory release process to qm uninit
+    - io_uring/sqpoll: work around a potential audit memory leak
+    - rcutorture: Fix rcu_torture_one_read() pipe_count overflow comment
+    - rcutorture: Make stall-tasks directly exit when rcutorture tests end
+    - rcutorture: Fix invalid context warning when enable srcu barrier testing
+    - block/ioctl: prefer different overflow check
+    - ssb: Fix potential NULL pointer dereference in ssb_device_uevent()
+    - selftests/bpf: Prevent client connect before server bind in
+      test_tc_tunnel.sh
+    - selftests/bpf: Fix flaky test btf_map_in_map/lookup_update
+    - batman-adv: bypass empty buckets in batadv_purge_orig_ref()
+    - wifi: ath9k: work around memset overflow warning
+    - af_packet: avoid a false positive warning in packet_setsockopt()
+    - ACPI: x86: Add PNP_UART1_SKIP quirk for Lenovo Blade2 tablets
+    - drop_monitor: replace spin_lock by raw_spin_lock
+    - scsi: qedi: Fix crash while reading debugfs attribute
+    - net: sfp: add quirk for ATS SFP-GE-T 1000Base-TX module
+    - net/sched: fix false lockdep warning on qdisc root lock
+    - kselftest: arm64: Add a null pointer check
+    - net: dsa: realtek: keep default LED state in rtl8366rb
+    - netpoll: Fix race condition in netpoll_owner_active
+    - wifi: mt76: mt7921s: fix potential hung tasks during chip recovery
+    - HID: Add quirk for Logitech Casa touchpad
+    - HID: asus: fix more n-key report descriptors if n-key quirked
+    - ACPI: video: Add backlight=native quirk for Lenovo Slim 7 16ARH7
+    - Bluetooth: ath3k: Fix multiple issues reported by checkpatch.pl
+    - drm/amd/display: Exit idle optimizations before HDCP execution
+    - platform/x86: toshiba_acpi: Add quirk for buttons on Z830
+    - ASoC: Intel: sof_sdw: add JD2 quirk for HP Omen 14
+    - ASoC: Intel: sof_sdw: add quirk for Dell SKU 0C0F
+    - drm/lima: add mask irq callback to gp and pp
+    - drm/lima: mask irqs in timeout path before hard reset
+    - ALSA: hda/realtek: Add quirks for Lenovo 13X
+    - powerpc/pseries: Enforce hcall result buffer validity and size
+    - media: intel/ipu6: Fix build with !ACPI
+    - media: mtk-vcodec: potential null pointer deference in SCP
+    - powerpc/io: Avoid clang null pointer arithmetic warnings
+    - platform/x86: p2sb: Don't init until unassigned resources have been assigned
+    - power: supply: cros_usbpd: provide ID table for avoiding fallback match
+    - iommu/arm-smmu-v3: Free MSIs in case of ENOMEM
+    - ext4: fix uninitialized ratelimit_state->lock access in __ext4_fill_super()
+    - kprobe/ftrace: bail out if ftrace was killed
+    - usb: gadget: uvc: configfs: ensure guid to be valid before set
+    - f2fs: remove clear
SB_INLINECRYPT flag in default_options + - usb: misc: uss720: check for incompatible versions of the Belkin F5U002 + - Avoid hw_desc array overrun in dw-axi-dmac + - usb: dwc3: pci: Don't set "linux,phy_charger_detect" property on Lenovo Yoga + Tab2 1380 + - usb: typec: ucsi_glink: drop special handling for CCI_BUSY + - udf: udftime: prevent overflow in udf_disk_stamp_to_time() + - PCI/PM: Avoid D3cold for HP Pavilion 17 PC/1972 PCIe Ports + - f2fs: don't set RO when shutting down f2fs + - MIPS: Octeon: Add PCIe link status check + - serial: imx: Introduce timeout when waiting on transmitter empty + - serial: exar: adding missing CTI and Exar PCI ids + - usb: gadget: function: Remove usage of the deprecated ida_simple_xx() API + - tty: add the option to have a tty reject a new ldisc + - vfio/pci: Collect hot-reset devices to local buffer + - cpufreq: amd-pstate: fix memory leak on CPU EPP exit + - ACPI: EC: Install address space handler at the namespace root + - PCI: Do not wait for disconnected devices when resuming + - ALSA: hda: cs35l41: Possible null pointer dereference in + cs35l41_hda_unbind() + - ALSA: seq: ump: Fix missing System Reset message handling + - MIPS: Routerboard 532: Fix vendor retry check code + - mips: bmips: BCM6358: make sure CBR is correctly set + - tracing: Build event generation tests only as modules + - ALSA: hda/realtek: Remove Framework Laptop 16 from quirks + - ALSA/hda: intel-dsp-config: Document AVS as dsp_driver option + - ice: avoid IRQ collision to fix init failure on ACPI S3 resume + - btrfs: zoned: allocate dummy checksums for zoned NODATASUM writes + - net: mvpp2: use slab_build_skb for oversized frames + - cipso: fix total option length computation + - ALSA: hda: cs35l56: Component should be unbound before deconstruction + - ALSA: hda: tas2781: Component should be unbound before deconstruction + - bpf: Avoid splat in pskb_pull_reason + - ALSA: hda/realtek: Enable headset mic on IdeaPad 330-17IKB 81DM + - netrom: Fix a memory leak in nr_heartbeat_expiry() + - ipv6: prevent possible NULL deref in fib6_nh_init() + - ipv6: prevent possible NULL dereference in rt6_probe() + - xfrm6: check ip6_dst_idev() return value in xfrm6_get_saddr() + - netns: Make get_net_ns() handle zero refcount net + - qca_spi: Make interrupt remembering atomic + - net: lan743x: disable WOL upon resume to restore full data path operation + - net: lan743x: Support WOL at both the PHY and MAC appropriately + - net: phy: mxl-gpy: Remove interrupt mask clearing from config_init + - net/sched: act_api: fix possible infinite loop in tcf_idr_check_alloc() + - tipc: force a dst refcount before doing decryption + - sched: act_ct: add netns into the key of tcf_ct_flow_table + - ptp: fix integer overflow in max_vclocks_store + - selftests: openvswitch: Use bash as interpreter + - net: stmmac: No need to calculate speed divider when offload is disabled + - virtio_net: checksum offloading handling fix + - virtio_net: fixing XDP for fully checksummed packets handling + - octeontx2-pf: Add error handling to VLAN unoffload handling + - octeontx2-pf: Fix linking objects into multiple modules + - netfilter: ipset: Fix suspicious rcu_dereference_protected() + - seg6: fix parameter passing when calling NF_HOOK() in End.DX4 and End.DX6 + behaviors + - netfilter: move the sysctl nf_hooks_lwtunnel into the netfilter core + - ice: Fix VSI list rule with ICE_SW_LKUP_LAST type + - bnxt_en: Restore PTP tx_avail count in case of skb_pad() error + - net: usb: rtl8150 fix unintiatilzed variables in 
rtl8150_get_link_ksettings
+    - RDMA/bnxt_re: Fix the max msix vectors macro
+    - spi: cs42l43: Correct SPI root clock speed
+    - RDMA/rxe: Fix responder length checking for UD request packets
+    - regulator: core: Fix modpost error "regulator_get_regmap" undefined
+    - dmaengine: idxd: Fix possible Use-After-Free in irq_process_work_list
+    - dmaengine: ioatdma: Fix leaking on version mismatch
+    - dmaengine: ioatdma: Fix error path in ioat3_dma_probe()
+    - dmaengine: ioatdma: Fix kmemleak in ioat_pci_probe()
+    - dmaengine: fsl-edma: avoid linking both modules
+    - dmaengine: ioatdma: Fix missing kmem_cache_destroy()
+    - regulator: bd71815: fix ramp values
+    - thermal/drivers/mediatek/lvts_thermal: Return error in case of invalid efuse
+      data
+    - arm64: dts: imx8mp: Fix TC9595 input clock on DH i.MX8M Plus DHCOM SoM
+    - arm64: dts: freescale: imx8mp-venice-gw73xx-2x: fix BT shutdown GPIO
+    - arm64: dts: imx93-11x11-evk: Remove the 'no-sdio' property
+    - arm64: dts: freescale: imx8mm-verdin: enable hysteresis on slow input pin
+    - ACPICA: Revert "ACPICA: avoid Info: mapping multiple BARs. Your kernel is
+      fine."
+    - spi: spi-imx: imx51: revert burst length calculation back to bits_per_word
+    - io_uring/rsrc: fix incorrect assignment of iter->nr_segs in io_import_fixed
+    - firmware: psci: Fix return value from psci_system_suspend()
+    - RDMA/mlx5: Fix unwind flow as part of mlx5_ib_stage_init_init
+    - RDMA/mlx5: Add check for srq max_sge attribute
+    - RDMA/mana_ib: Ignore optional access flags for MRs
+    - ACPI: EC: Evaluate orphan _REG under EC device
+    - arm64: defconfig: enable the vf610 gpio driver
+    - ext4: avoid overflow when setting values via sysfs
+    - ext4: fix slab-out-of-bounds in ext4_mb_find_good_group_avg_frag_lists()
+    - net: stmmac: Assign configured channel value to EXTTS event
+    - net: usb: ax88179_178a: improve reset check
+    - net: do not leave a dangling sk pointer, when socket creation fails
+    - btrfs: retry block group reclaim without infinite loop
+    - scsi: ufs: core: Free memory allocated for model before reinit
+    - cifs: fix typo in module parameter enable_gcm_256
+    - LoongArch: Fix watchpoint setting error
+    - LoongArch: Trigger user-space watchpoints correctly
+    - LoongArch: Fix multiple hardware watchpoint issues
+    - KVM: Fix a data race on last_boosted_vcpu in kvm_vcpu_on_spin()
+    - KVM: arm64: Disassociate vcpus from redistributor region on teardown
+    - KVM: x86: Always sync PIR to IRR prior to scanning I/O APIC routes
+    - RDMA/rxe: Fix data copy for IB_SEND_INLINE
+    - RDMA/mlx5: Remove extra unlock on error path
+    - RDMA/mlx5: Follow rb_key.ats when creating new mkeys
+    - ovl: fix encoding fid for lower only root
+    - ALSA: hda/realtek: Limit mic boost on N14AP7
+    - ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 14AHP9
+    - drm/i915/mso: using joiner is not possible with eDP MSO
+    - drm/radeon: fix UBSAN warning in kv_dpm.c
+    - drm/amdgpu: fix UBSAN warning in kv_dpm.c
+    - dt-bindings: dma: fsl-edma: fix dma-channels constraints
+    - ocfs2: fix NULL pointer dereference in ocfs2_journal_dirty()
+    - ocfs2: fix NULL pointer dereference in ocfs2_abort_trigger()
+    - gcov: add support for GCC 14
+    - kcov: don't lose track of remote references during softirqs
+    - efi/x86: Free EFI memory map only when installing a new one.
+    - serial: 8250_dw: Revert "Move definitions to the shared header"
+    - mm: mmap: allow for the maximum number of bits for randomizing mmap_base by
+      default
+    - tcp: clear tp->retrans_stamp in tcp_rcv_fastopen_synack()
+    - mm/page_table_check: fix crash on ZONE_DEVICE
+    - i2c: ocores: set IACK bit after core is enabled
+    - dt-bindings: i2c: atmel,at91sam: correct path to i2c-controller schema
+    - dt-bindings: i2c: google,cros-ec-i2c-tunnel: correct path to i2c-controller
+      schema
+    - spi: stm32: qspi: Fix dual flash mode sanity test in stm32_qspi_setup()
+    - arm64: dts: imx8qm-mek: fix gpio number for reg_usdhc2_vmmc
+    - spi: stm32: qspi: Clamp stm32_qspi_get_mode() output to CCR_BUSWIDTH_4
+    - perf: script: add raw|disasm arguments to --insn-trace option
+    - nbd: Improve the documentation of the locking assumptions
+    - nbd: Fix signal handling
+    - tracing: Add MODULE_DESCRIPTION() to preemptirq_delay_test
+    - x86/cpu/vfm: Add new macros to work with (vendor/family/model) values
+    - x86/cpu: Fix x86_match_cpu() to match just X86_VENDOR_INTEL
+    - drm/amd/display: revert Exit idle optimizations before HDCP execution
+    - ASoC: Intel: sof-sdw: really remove FOUR_SPEAKER quirk
+    - net/sched: unregister lockdep keys in qdisc_create/qdisc_alloc error path
+    - kprobe/ftrace: fix build error due to bad function definition
+    - hid: asus: asus_report_fixup: fix potential read out of bounds
+    - Revert "mm: mmap: allow for the maximum number of bits for randomizing
+      mmap_base by default"
+    - platform/chrome: cros_usbpd_logger: provide ID table for avoiding fallback
+      match
+    - platform/chrome: cros_usbpd_notify: provide ID table for avoiding fallback
+      match
+    - ubsan: Avoid i386 UBSAN handler crashes with Clang
+    - arm64: defconfig: select INTERCONNECT_QCOM_SM6115 as built-in
+    - bpf: Avoid kfree_rcu() under lock in bpf_lpm_trie.
+ - devlink: use kvzalloc() to allocate devlink instance resources + - wifi: rtw89: 8852c: add quirk to set PCI BER for certain platforms + - clocksource: Make watchdog and suspend-timing multiplication overflow safe + - ACPI: resource: Do IRQ override on GMxBGxx (XMG APEX 17 M23) + - wifi: ath12k: add string type to search board data in board-2.bin for + WCN7850 + - wifi: ath12k: add firmware-2.bin support + - wifi: ath12k: fix kernel crash during resume + - arm64/sysreg: Update PIE permission encodings + - ACPI: resource: Skip IRQ override on Asus Vivobook Pro N6506MV + - wifi: ath12k: fix the problem that down grade phy mode operation + - bpf: avoid uninitialized warnings in verifier_global_subprogs.c + - selftests: net: fix timestamp not arriving in cmsg_time.sh + - net: ena: Add validation for completion descriptors consistency + - drm/amd/display: Workaround register access in idle race with cursor + - cgroup/cpuset: Make cpuset hotplug processing synchronous + - platform/x86: x86-android-tablets: Unregister devices in reverse order + - platform/x86: x86-android-tablets: Add Lenovo Yoga Tablet 2 Pro 1380F/L data + - ALSA: hda/realtek: Add quirks for HP Omen models using CS35L41 + - ext4: fold quota accounting into ext4_xattr_inode_lookup_create() + - ext4: do not create EA inode under buffer lock + - f2fs: fix to detect inconsistent nat entry during truncation + - usb: typec: ucsi_glink: rework quirks implementation + - xhci: remove XHCI_TRUST_TX_LENGTH quirk + - clk: Add a devm variant of clk_rate_exclusive_get() + - clk: Provide !COMMON_CLK dummy for devm_clk_rate_exclusive_get() + - i2c: lpi2c: Avoid calling clk_get_rate during transfer + - cxl: Add post-reset warning if reset results in loss of previously committed + HDM decoders + - OPP: Fix required_opp_tables for multiple genpds using same table + - wifi: iwlwifi: mvm: fix ROC version check + - wifi: mac80211: Recalc offload when monitor stop + - ice: fix 200G link speed message log + - ice: implement AQ download pkg retry + - bpf: Fix reg_set_min_max corruption of fake_reg + - ALSA: hda: cs35l41: Component should be unbound before deconstruction + - netdev-genl: fix error codes when outputting XDP features + - arm64: dts: freescale: imx8mm-verdin: Fix GPU speed + - phy: qcom-qmp: qserdes-txrx: Add missing registers offsets + - phy: qcom-qmp: pcs: Add missing v6 N4 register offsets + - phy: qcom: qmp-combo: Switch from V6 to V6 N4 register offsets + - powerpc/crypto: Add generated P8 asm to .gitignore + - spi: Exctract spi_dev_check_cs() helper + - spi: Fix SPI slave probe failure + - net: phy: dp83tg720: wake up PHYs in managed mode + - net: phy: dp83tg720: get master/slave configuration in link down state + - RDMA/mlx5: Ensure created mkeys always have a populated rb_key + - drm/amdgpu: fix locking scope when flushing tlb + - drm/amd/display: Remove redundant idle optimization check + - drm/amd/display: Attempt to avoid empty TUs when endpoint is DPIA + - ata: ahci: Do not enable LPM if no LPM states are supported by the HBA + - dmaengine: xilinx: xdma: Fix data synchronisation in xdma_channel_isr() + - net/tcp_ao: Don't leak ao_info on error-path + - mm: shmem: fix getting incorrect lruvec when replacing a shmem folio + - selftests: mptcp: print_test out of verify_listener_events + - selftests: mptcp: userspace_pm: fixed subtest names + - ima: Avoid blocking in RCU read-side critical section + - virt: guest_memfd: fix reference leak on hwpoisoned page + - thermal: int340x: processor_thermal: Support shared interrupts + - 
thermal: core: Change PM notifier priority to the minimum
+    - wifi: ath12k: check M3 buffer size as well whey trying to reuse it
+    - Upstream stable to v6.6.36, v6.9.7
+
+  * [SRU] Add Dynamic Tuning Technology (DTT) support for Lunar Lake
+    (LP: #2073961)
+    - thermal: int340x: processor_thermal: Add Lunar Lake-M PCI ID
+
+  * Kubuntu 24.04 freezes after plugging in ethernet cable (LP: #2073358)
+    - e1000e: move force SMBUS near the end of enable_ulp function
+    - e1000e: fix force smbus during suspend flow
+
+  * Noble update: upstream stable patchset 2024-07-25 (LP: #2074091)
+    - wifi: mac80211: mesh: Fix leak of mesh_preq_queue objects
+    - wifi: mac80211: Fix deadlock in ieee80211_sta_ps_deliver_wakeup()
+    - wifi: cfg80211: fully move wiphy work to unbound workqueue
+    - wifi: cfg80211: Lock wiphy in cfg80211_get_station
+    - wifi: cfg80211: pmsr: use correct nla_get_uX functions
+    - wifi: iwlwifi: mvm: don't initialize csa_work twice
+    - wifi: iwlwifi: mvm: revert gen2 TX A-MPDU size to 64
+    - wifi: iwlwifi: mvm: set properly mac header
+    - wifi: iwlwifi: dbg_ini: move iwl_dbg_tlv_free outside of debugfs ifdef
+    - wifi: iwlwifi: mvm: check n_ssids before accessing the ssids
+    - wifi: iwlwifi: mvm: don't read past the mfuart notifcation
+    - wifi: mac80211: correctly parse Spatial Reuse Parameter Set element
+    - scsi: ufs: mcq: Fix error output and clean up ufshcd_mcq_abort()
+    - RISC-V: KVM: No need to use mask when hart-index-bit is 0
+    - RISC-V: KVM: Fix incorrect reg_subtype labels in
+      kvm_riscv_vcpu_set_reg_isa_ext function
+    - ax25: Fix refcount imbalance on inbound connections
+    - ax25: Replace kfree() in ax25_dev_free() with ax25_dev_put()
+    - net/ncsi: Fix the multi thread manner of NCSI driver
+    - net: phy: micrel: fix KSZ9477 PHY issues after suspend/resume
+    - bpf: Fix a potential use-after-free in bpf_link_free()
+    - KVM: SEV-ES: Disallow SEV-ES guests when X86_FEATURE_LBRV is absent
+    - KVM: SEV-ES: Delegate LBR virtualization to the processor
+    - vmxnet3: disable rx data ring on dma allocation failure
+    - ipv6: ioam: block BH from ioam6_output()
+    - ipv6: sr: block BH in seg6_output_core() and seg6_input_core()
+    - net: tls: fix marking packets as decrypted
+    - bpf: Set run context for rawtp test_run callback
+    - octeontx2-af: Always allocate PF entries from low prioriy zone
+    - net/smc: avoid overwriting when adjusting sock bufsizes
+    - net: phy: Micrel KSZ8061: fix errata solution not taking effect problem
+    - net: sched: sch_multiq: fix possible OOB write in multiq_tune()
+    - vxlan: Fix regression when dropping packets due to invalid src addresses
+    - tcp: count CLOSE-WAIT sockets for TCP_MIB_CURRESTAB
+    - mptcp: count CLOSE-WAIT sockets for MPTCP_MIB_CURRESTAB
+    - net/mlx5: Stop waiting for PCI if pci channel is offline
+    - net/mlx5: Always stop health timer during driver removal
+    - net/mlx5: Fix tainted pointer delete is case of flow rules creation fail
+    - net/sched: taprio: always validate TCA_TAPRIO_ATTR_PRIOMAP
+    - ptp: Fix error message on failed pin verification
+    - ice: fix iteration of TLVs in Preserved Fields Area
+    - ice: remove af_xdp_zc_qps bitmap
+    - ice: add flag to distinguish reset from .ndo_bpf in XDP rings config
+    - net: wwan: iosm: Fix tainted pointer delete is case of region creation fail
+    - af_unix: Set sk->sk_state under unix_state_lock() for truly disconencted
+      peer.
+    - af_unix: Annodate data-races around sk->sk_state for writers.
+    - af_unix: Annotate data-race of sk->sk_state in unix_inq_len().
+ - af_unix: Annotate data-races around sk->sk_state in unix_write_space() and + poll(). + - af_unix: Annotate data-race of sk->sk_state in unix_stream_connect(). + - af_unix: Annotate data-races around sk->sk_state in sendmsg() and recvmsg(). + - af_unix: Annotate data-race of sk->sk_state in unix_stream_read_skb(). + - af_unix: Annotate data-races around sk->sk_state in UNIX_DIAG. + - af_unix: Annotate data-races around sk->sk_sndbuf. + - af_unix: Annotate data-race of net->unx.sysctl_max_dgram_qlen. + - af_unix: Use unix_recvq_full_lockless() in unix_stream_connect(). + - af_unix: Use skb_queue_empty_lockless() in unix_release_sock(). + - af_unix: Use skb_queue_len_lockless() in sk_diag_show_rqlen(). + - af_unix: Annotate data-race of sk->sk_shutdown in sk_diag_fill(). + - ipv6: fix possible race in __fib6_drop_pcpu_from() + - net: ethtool: fix the error condition in ethtool_get_phy_stats_ethtool() + - selftests/mm: log a consistent test name for check_compaction + - irqchip/riscv-intc: Allow large non-standard interrupt number + - irqchip/riscv-intc: Introduce Andes hart-level interrupt controller + - eventfs: Update all the eventfs_inodes from the events descriptor + - io_uring/rsrc: don't lock while !TASK_RUNNING + - io_uring: check for non-NULL file pointer in io_file_can_poll() + - USB: class: cdc-wdm: Fix CPU lockup caused by excessive log messages + - USB: xen-hcd: Traverse host/ when CONFIG_USB_XEN_HCD is selected + - usb: typec: tcpm: fix use-after-free case in tcpm_register_source_caps + - usb: typec: tcpm: Ignore received Hard Reset in TOGGLING state + - mei: me: release irq in mei_me_pci_resume error path + - tty: n_tty: Fix buffer offsets when lookahead is used + - serial: port: Don't block system suspend even if bytes are left to xmit + - landlock: Fix d_parent walk + - jfs: xattr: fix buffer overflow for invalid xattr + - xhci: Set correct transferred length for cancelled bulk transfers + - xhci: Apply reset resume quirk to Etron EJ188 xHCI host + - xhci: Handle TD clearing for multiple streams case + - xhci: Apply broken streams quirk to Etron EJ188 xHCI host + - thunderbolt: debugfs: Fix margin debugfs node creation condition + - scsi: core: Disable CDL by default + - scsi: mpi3mr: Fix ATA NCQ priority support + - scsi: mpt3sas: Avoid test/set_bit() operating in non-allocated memory + - scsi: sd: Use READ(16) when reading block zero on large capacity disks + - gve: Clear napi->skb before dev_kfree_skb_any() + - powerpc/uaccess: Fix build errors seen with GCC 13/14 + - HID: nvidia-shield: Add missing check for input_ff_create_memless + - cxl/test: Add missing vmalloc.h for tools/testing/cxl/test/mem.c + - cxl/region: Fix memregion leaks in devm_cxl_add_region() + - cachefiles: add output string to cachefiles_obj_[get|put]_ondemand_fd + - cachefiles: remove requests from xarray during flushing requests + - cachefiles: add spin_lock for cachefiles_ondemand_info + - cachefiles: fix slab-use-after-free in cachefiles_ondemand_get_fd() + - cachefiles: fix slab-use-after-free in cachefiles_ondemand_daemon_read() + - cachefiles: remove err_put_fd label in cachefiles_ondemand_daemon_read() + - cachefiles: never get a new anonymous fd if ondemand_id is valid + - cachefiles: defer exposing anon_fd until after copy_to_user() succeeds + - cachefiles: flush all requests after setting CACHEFILES_DEAD + - selftests/ftrace: Fix to check required event file + - clk: sifive: Do not register clkdevs for PRCI clocks + - NFSv4.1 enforce rootpath check in fs_location query + - SUNRPC: return 
proper error from gss_wrap_req_priv + - NFS: add barriers when testing for NFS_FSDATA_BLOCKED + - selftests/tracing: Fix event filter test to retry up to 10 times + - nvme: fix nvme_pr_* status code parsing + - drm/panel: sitronix-st7789v: Add check for of_drm_get_panel_orientation + - platform/x86: dell-smbios: Fix wrong token data in sysfs + - gpio: tqmx86: fix typo in Kconfig label + - gpio: tqmx86: introduce shadow register for GPIO output value + - gpio: tqmx86: store IRQ trigger type and unmask status separately + - gpio: tqmx86: fix broken IRQ_TYPE_EDGE_BOTH interrupt type + - HID: core: remove unnecessary WARN_ON() in implement() + - iommu/amd: Fix sysfs leak in iommu init + - iommu: Return right value in iommu_sva_bind_device() + - io_uring/io-wq: Use set_bit() and test_bit() at worker->flags + - io_uring/io-wq: avoid garbage value of 'match' in io_wq_enqueue() + - HID: logitech-dj: Fix memory leak in logi_dj_recv_switch_to_dj_mode() + - drm/vmwgfx: Refactor drm connector probing for display modes + - drm/vmwgfx: Filter modes which exceed graphics memory + - drm/vmwgfx: 3D disabled should not effect STDU memory limits + - drm/vmwgfx: Remove STDU logic from generic mode_valid function + - drm/vmwgfx: Don't memcmp equivalent pointers + - af_unix: Annotate data-race of sk->sk_state in unix_accept(). + - modpost: do not warn about missing MODULE_DESCRIPTION() for vmlinux.o + - net: sfp: Always call `sfp_sm_mod_remove()` on remove + - net: hns3: fix kernel crash problem in concurrent scenario + - net: hns3: add cond_resched() to hns3 ring buffer init process + - liquidio: Adjust a NULL pointer handling path in lio_vf_rep_copy_packet + - net: stmmac: dwmac-qcom-ethqos: Configure host DMA width + - drm/komeda: check for error-valued pointer + - drm/bridge/panel: Fix runtime warning on panel bridge release + - tcp: fix race in tcp_v6_syn_recv_sock() + - net dsa: qca8k: fix usages of device_get_named_child_node() + - geneve: Fix incorrect inner network header offset when innerprotoinherit is + set + - net/mlx5e: Fix features validation check for tunneled UDP (non-VXLAN) + packets + - Bluetooth: fix connection setup in l2cap_connect + - netfilter: nft_inner: validate mandatory meta and payload + - netfilter: ipset: Fix race between namespace cleanup and gc in the list:set + type + - x86/asm: Use %c/%n instead of %P operand modifier in asm templates + - x86/uaccess: Fix missed zeroing of ia32 u64 get_user() range checking + - scsi: ufs: core: Quiesce request queues before checking pending cmds + - net: pse-pd: Use EOPNOTSUPP error code instead of ENOTSUPP + - gve: ignore nonrelevant GSO type bits when processing TSO headers + - net: stmmac: replace priv->speed with the portTransmitRate from the tc-cbs + parameters + - block: sed-opal: avoid possible wrong address reference in + read_sed_opal_key() + - block: fix request.queuelist usage in flush + - nvmet-passthru: propagate status from id override functions + - net/ipv6: Fix the RT cache flush via sysctl using a previous delay + - net: bridge: mst: pass vlan group directly to br_mst_vlan_set_state + - net: bridge: mst: fix suspicious rcu usage in br_mst_set_state + - ionic: fix use after netif_napi_del() + - af_unix: Read with MSG_PEEK loops if the first unread byte is OOB + - bnxt_en: Adjust logging of firmware messages in case of released token in + __hwrm_send() + - misc: microchip: pci1xxxx: fix double free in the error handling of + gp_aux_bus_probe() + - ksmbd: move leading slash check to smb2_get_name() + - ksmbd: fix missing use of 
get_write in in smb2_set_ea() + - x86/boot: Don't add the EFI stub to targets, again + - iio: adc: ad9467: fix scan type sign + - iio: dac: ad5592r: fix temperature channel scaling value + - iio: invensense: fix odr switching to same value + - iio: imu: inv_icm42600: delete unneeded update watermark call + - drivers: core: synchronize really_probe() and dev_uevent() + - parisc: Try to fix random segmentation faults in package builds + - ACPI: x86: Force StorageD3Enable on more products + - drm/exynos/vidi: fix memory leak in .get_modes() + - drm/exynos: hdmi: report safe 640x480 mode as a fallback when no EDID found + - mptcp: ensure snd_una is properly initialized on connect + - mptcp: pm: inc RmAddr MIB counter once per RM_ADDR ID + - mptcp: pm: update add_addr counters after connect + - clkdev: Update clkdev id usage to allow for longer names + - irqchip/gic-v3-its: Fix potential race condition in its_vlpi_prop_update() + - x86/kexec: Fix bug with call depth tracking + - x86/amd_nb: Check for invalid SMN reads + - perf/core: Fix missing wakeup when waiting for context reference + - perf auxtrace: Fix multiple use of --itrace option + - riscv: fix overlap of allocated page and PTR_ERR + - tracing/selftests: Fix kprobe event name test for .isra. functions + - kheaders: explicitly define file modes for archived headers + - null_blk: Print correct max open zones limit in null_init_zoned_dev() + - sock_map: avoid race between sock_map_close and sk_psock_put + - dma-buf: handle testing kthreads creation failure + - vmci: prevent speculation leaks by sanitizing event in event_deliver() + - spmi: hisi-spmi-controller: Do not override device identifier + - knfsd: LOOKUP can return an illegal error value + - fs/proc: fix softlockup in __read_vmcore + - ocfs2: use coarse time for new created files + - ocfs2: fix races between hole punching and AIO+DIO + - PCI: rockchip-ep: Remove wrong mask on subsys_vendor_id + - dmaengine: axi-dmac: fix possible race in remove() + - remoteproc: k3-r5: Wait for core0 power-up before powering up core1 + - remoteproc: k3-r5: Do not allow core1 to power up before core0 via sysfs + - iio: adc: axi-adc: make sure AXI clock is enabled + - iio: invensense: fix interrupt timestamp alignment + - riscv: rewrite __kernel_map_pages() to fix sleeping in invalid context + - rtla/timerlat: Simplify "no value" printing on top + - rtla/auto-analysis: Replace \t with spaces + - drm/i915/gt: Disarm breadcrumbs if engines are already idle + - drm/shmem-helper: Fix BUG_ON() on mmap(PROT_WRITE, MAP_PRIVATE) + - drm/i915/dpt: Make DPT object unshrinkable + - drm/i915: Fix audio component initialization + - intel_th: pci: Add Meteor Lake-S support + - pmdomain: ti-sci: Fix duplicate PD referrals + - btrfs: zoned: fix use-after-free due to race with dev replace + - xfs: fix imprecise logic in xchk_btree_check_block_owner + - xfs: fix scrub stats file permissions + - xfs: fix SEEK_HOLE/DATA for regions with active COW extents + - xfs: shrink failure needs to hold AGI buffer + - xfs: ensure submit buffers on LSN boundaries in error handlers + - xfs: allow sunit mount option to repair bad primary sb stripe values + - xfs: don't use current->journal_info + - xfs: allow cross-linking special files without project quota + - swiotlb: Enforce page alignment in swiotlb_alloc() + - swiotlb: Reinstate page-alignment for mappings >= PAGE_SIZE + - swiotlb: extend buffer pre-padding to alloc_align_mask if necessary + - tick/nohz_full: Don't abuse smp_call_function_single() in + tick_setup_device() + - 
mm/huge_memory: don't unpoison huge_zero_folio + - serial: 8250_pxa: Configure tx_loadsz to match FIFO IRQ level + - Revert "fork: defer linking file vma until vma is fully initialized" + - remoteproc: k3-r5: Jump to error handling labels in start/stop errors + - greybus: Fix use-after-free bug in gb_interface_release due to race + condition. + - ima: Fix use-after-free on a dentry's dname.name + - serial: core: Add UPIO_UNKNOWN constant for unknown port type + - serial: port: Introduce a common helper to read properties + - serial: 8250_dw: Switch to use uart_read_port_properties() + - serial: 8250_dw: Replace ACPI device check by a quirk + - serial: 8250_dw: Don't use struct dw8250_data outside of 8250_dw + - usb-storage: alauda: Check whether the media is initialized + - misc: microchip: pci1xxxx: Fix a memory leak in the error handling of + gp_aux_bus_probe() + - i2c: at91: Fix the functionality flags of the slave-only interface + - i2c: designware: Fix the functionality flags of the slave-only interface + - zap_pid_ns_processes: clear TIF_NOTIFY_SIGNAL along with TIF_SIGPENDING + - wifi: ath11k: fix WCN6750 firmware crash caused by 17 num_vdevs + - cpufreq: amd-pstate: Unify computation of + {max,min,nominal,lowest_nonlinear}_freq + - cpufreq: amd-pstate: Add quirk for the pstate CPPC capabilities missing + - cpufreq: amd-pstate: remove global header file + - virtio_net: fix possible dim status unrecoverable + - net: ethernet: mtk_eth_soc: handle dma buffer size soc specific + - ice: fix reads from NVM Shadow RAM on E830 and E825-C devices + - ice: map XDP queues to vectors in ice_vsi_map_rings_to_vectors() + - x86/cpu: Get rid of an unnecessary local variable in get_cpu_address_sizes() + - x86/cpu: Provide default cache line size if not enumerated + - selftests/mm: ksft_exit functions do not return + - selftests/mm: compaction_test: fix bogus test success and reduce probability + of OOM-killer invocation + - .editorconfig: remove trim_trailing_whitespace option + - kcov, usb: disable interrupts in kcov_remote_start_usb_softirq + - ata: libata-scsi: Set the RMB bit only for removable media devices + - powerpc/85xx: fix compile error without CONFIG_CRASH_DUMP + - kselftest/alsa: Ensure _GNU_SOURCE is defined + - thermal: core: Do not fail cdev registration because of invalid initial + state + - Bluetooth: hci_sync: Fix not using correct handle + - net/sched: initialize noop_qdisc owner + - tcp: use signed arithmetic in tcp_rtx_probe0_timed_out() + - drm/nouveau: don't attempt to schedule hpd_work on headless cards + - drm/xe/xe_gt_idle: use GT forcewake domain assertion + - drm/xe: flush engine buffers before signalling user fence on all engines + - drm/xe: Remove mem_access from guc_pc calls + - drm/xe: move disable_c6 call + - bnxt_en: Cap the size of HWRM_PORT_PHY_QCFG forwarded response + - iio: imu: bmi323: Fix trigger notification in case of error + - iio: pressure: bmp280: Fix BMP580 temperature reading + - iio: temperature: mlx90635: Fix ERR_PTR dereference in mlx90635_probe() + - thermal: ACPI: Invalidate trip points with temperature of 0 or below + - x86/mm/numa: Use NUMA_NO_NODE when calling memblock_set_node() + - memblock: make memblock_set_node() also warn about use of MAX_NUMNODES + - perf script: Show also errors for --insn-trace option + - wifi: cfg80211: validate HE operation element parsing + - wifi: rtlwifi: Ignore IEEE80211_CONF_CHANGE_RETRY_LIMITS + - locking/atomic: scripts: fix ${atomic}_sub_and_test() kerneldoc + - ata: ahci: Do not apply Intel PCS quirk on 
Intel Alder Lake + - ata: libata-core: Add ATA_HORKAGE_NOLPM for Apacer AS340 + - ata: libata-core: Add ATA_HORKAGE_NOLPM for Crucial CT240BX500SSD1 + - ata: libata-core: Add ATA_HORKAGE_NOLPM for AMD Radeon S3 SSD + - kexec: fix the unexpected kexec_dprintk() macro + - ocfs2: update inode fsync transaction id in ocfs2_unlink and ocfs2_link + - dm-integrity: set discard_granularity to logical block size + - drm/bridge: aux-hpd-bridge: correct devm_drm_dp_hpd_bridge_add() stub + - iio: temperature: mcp9600: Fix temperature reading for negative values + - drm/mst: Fix NULL pointer dereference at drm_dp_add_payload_part2 + - riscv: force PAGE_SIZE linear mapping if debug_pagealloc is enabled + - drm/xe: Properly handle alloc_guc_id() failure + - wifi: iwlwifi: mvm: support iwl_dev_tx_power_cmd_v8 + - wifi: iwlwifi: mvm: fix a crash on 7265 + - mei: vsc: Fix wrong invocation of ACPI SID method + - Upstream stable to v6.6.35, v6.9.6 + + * [SRU] Add support for intel trace hub for last platforms (LP: #2073926) // + Noble update: upstream stable patchset 2024-07-25 (LP: #2074091) + - intel_th: pci: Add Granite Rapids support + - intel_th: pci: Add Granite Rapids SOC support + - intel_th: pci: Add Sapphire Rapids SOC support + - intel_th: pci: Add Lunar Lake support + + * Fix L2CAP/LE/CPU/BV-02-C bluetooth certification failure (LP: #2072858) // + Noble update: upstream stable patchset 2024-07-25 (LP: #2074091) + - Bluetooth: L2CAP: Fix rejecting L2CAP_CONN_PARAM_UPDATE_REQ + + * Noble update: upstream stable patchset 2024-07-22 (LP: #2073788) + - drm/i915/hwmon: Get rid of devm + - afs: Don't cross .backup mountpoint from backup volume + - erofs: avoid allocating DEFLATE streams before mounting + - vxlan: Fix regression when dropping packets due to invalid src addresses + - drm/sun4i: hdmi: Convert encoder to atomic + - drm/sun4i: hdmi: Move mode_set into enable + - f2fs: fix to do sanity check on i_xattr_nid in sanity_check_inode() + - media: lgdt3306a: Add a check against null-pointer-def + - drm/amdgpu: add error handle to avoid out-of-bounds + - wifi: rtw89: correct aSIFSTime for 6GHz band + - ata: pata_legacy: make legacy_exit() work again + - fsverity: use register_sysctl_init() to avoid kmemleak warning + - proc: Move fdinfo PTRACE_MODE_READ check into the inode .permission + operation + - platform/chrome: cros_ec: Handle events during suspend after resume + completion + - thermal/drivers/qcom/lmh: Check for SCM availability at probe + - soc: qcom: rpmh-rsc: Enhance check for VRM in-flight request + - ACPI: resource: Do IRQ override on TongFang GXxHRXx and GMxHGxx + - arm64: tegra: Correct Tegra132 I2C alias + - arm64: dts: qcom: qcs404: fix bluetooth device address + - md/raid5: fix deadlock that raid5d() wait for itself to clear + MD_SB_CHANGE_PENDING + - wifi: rtl8xxxu: Fix the TX power of RTL8192CU, RTL8723AU + - wifi: rtlwifi: rtl8192de: Fix 5 GHz TX power + - wifi: rtlwifi: rtl8192de: Fix low speed with WPA3-SAE + - wifi: rtlwifi: rtl8192de: Fix endianness issue in RX path + - arm64: dts: qcom: sc8280xp: add missing PCIe minimum OPP + - arm64: dts: hi3798cv200: fix the size of GICR + - arm64: dts: ti: verdin-am62: Set memory size to 2gb + - media: mc: Fix graph walk in media_pipeline_start + - media: mc: mark the media devnode as registered from the, start + - media: mxl5xx: Move xpt structures off stack + - media: v4l2-core: hold videodev_lock until dev reg, finishes + - media: v4l: async: Properly re-initialise notifier entry in unregister + - media: v4l: async: Don't set 
notifier's V4L2 device if registering fails + - media: v4l: async: Fix notifier list entry init + - mmc: core: Add mmc_gpiod_set_cd_config() function + - mmc: sdhci: Add support for "Tuning Error" interrupts + - mmc: sdhci-acpi: Sort DMI quirks alphabetically + - mmc: sdhci-acpi: Fix Lenovo Yoga Tablet 2 Pro 1380 sdcard slot not working + - mmc: sdhci-acpi: Disable write protect detection on Toshiba WT10-A + - mmc: sdhci-acpi: Add quirk to enable pull-up on the card-detect GPIO on Asus + T100TA + - drm/fbdev-generic: Do not set physical framebuffer address + - fbdev: savage: Handle err return when savagefb_check_var failed + - drm/amdgpu/atomfirmware: add intergrated info v2.3 table + - 9p: add missing locking around taking dentry fid list + - drm/amd: Fix shutdown (again) on some SMU v13.0.4/11 platforms + - Revert "drm/amdkfd: fix gfx_target_version for certain 11.0.3 devices" + - KVM: SVM: WARN on vNMI + NMI window iff NMIs are outright masked + - KVM: arm64: Fix AArch32 register narrowing on userspace write + - KVM: arm64: Allow AArch32 PSTATE.M to be restored as System mode + - KVM: arm64: AArch32: Fix spurious trapping of conditional instructions + - LoongArch: Add all CPUs enabled by fdt to NUMA node 0 + - LoongArch: Override higher address bits in JUMP_VIRT_ADDR + - clk: bcm: dvp: Assign ->num before accessing ->hws + - clk: bcm: rpi: Assign ->num before accessing ->hws + - clk: qcom: clk-alpha-pll: fix rate setting for Stromer PLLs + - crypto: ecdsa - Fix module auto-load on add-key + - crypto: ecrdsa - Fix module auto-load on add_key + - crypto: qat - Fix ADF_DEV_RESET_SYNC memory leak + - kbuild: Remove support for Clang's ThinLTO caching + - mm: fix race between __split_huge_pmd_locked() and GUP-fast + - filemap: add helper mapping_max_folio_size() + - iomap: fault in smaller chunks for non-large folio mappings + - i2c: acpi: Unbind mux adapters before delete + - HID: i2c-hid: elan: fix reset suspend current leakage + - scsi: core: Handle devices which return an unusually large VPD page count + - net/ipv6: Fix route deleting failure when metric equals 0 + - net/9p: fix uninit-value in p9_client_rpc() + - mm/ksm: fix ksm_pages_scanned accounting + - mm/ksm: fix ksm_zero_pages accounting + - kmsan: do not wipe out origin when doing partial unpoisoning + - tpm_tis: Do *not* flush uninitialized work + - intel_th: pci: Add Meteor Lake-S CPU support + - rtla/timerlat: Fix histogram report when a cpu count is 0 + - sparc64: Fix number of online CPUs + - mm/cma: drop incorrect alignment check in cma_init_reserved_mem + - mm/hugetlb: pass correct order_per_bit to cma_declare_contiguous_nid + - mm: /proc/pid/smaps_rollup: avoid skipping vma after getting mmap_lock again + - mm/vmalloc: fix vmalloc which may return null if called with __GFP_NOFAIL + - selftests/mm: compaction_test: fix incorrect write of zero to nr_hugepages + - selftests/mm: fix build warnings on ppc64 + - watchdog: rti_wdt: Set min_hw_heartbeat_ms to accommodate a safety margin + - bonding: fix oops during rmmod + - wifi: ath10k: fix QCOM_RPROC_COMMON dependency + - kdb: Fix buffer overflow during tab-complete + - kdb: Use format-strings rather than '\0' injection in kdb_read() + - kdb: Fix console handling when editing and tab-completing commands + - kdb: Merge identical case statements in kdb_read() + - kdb: Use format-specifiers rather than memset() for padding in kdb_read() + - sparc: move struct termio to asm/termios.h + - drm/amdkfd: handle duplicate BOs in reserve_bo_and_cond_vms + - ext4: Fixes len calculation 
in mpage_journal_page_buffers + - ext4: set type of ac_groups_linear_remaining to __u32 to avoid overflow + - ext4: fix mb_cache_entry's e_refcnt leak in ext4_xattr_block_cache_find() + - riscv: dts: starfive: Remove PMIC interrupt info for Visionfive 2 board + - ARM: dts: samsung: smdkv310: fix keypad no-autorepeat + - ARM: dts: samsung: smdk4412: fix keypad no-autorepeat + - ARM: dts: samsung: exynos4412-origen: fix keypad no-autorepeat + - parisc: Define HAVE_ARCH_HUGETLB_UNMAPPED_AREA + - parisc: Define sigset_t in parisc uapi header + - s390/ap: Fix crash in AP internal function modify_bitmap() + - s390/cpacf: Split and rework cpacf query functions + - s390/cpacf: Make use of invalid opcode produce a link error + - i3c: master: svc: fix invalidate IBI type and miss call client IBI handler + - genirq/irqdesc: Prevent use-after-free in irq_find_at_or_after() + - ASoC: SOF: ipc4-topology: Fix input format query of process modules without + base extension + - ALSA: ump: Don't clear bank selection after sending a program change + - ALSA: ump: Don't accept an invalid UMP protocol number + - EDAC/amd64: Convert PCIBIOS_* return codes to errnos + - EDAC/igen6: Convert PCIBIOS_* return codes to errnos + - nfs: fix undefined behavior in nfs_block_bits() + - NFS: Fix READ_PLUS when server doesn't support OP_READ_PLUS + - eventfs: Fix a possible null pointer dereference in eventfs_find_events() + - eventfs: Keep the directories from having the same inode number as files + - tracefs: Clear EVENT_INODE flag in tracefs_drop_inode() + - btrfs: fix crash on racing fsync and size-extending write into prealloc + - btrfs: fix leak of qgroup extent records after transaction abort + - ALSA: seq: Fix incorrect UMP type for system messages + - powerpc/bpf: enforce full ordering for ATOMIC operations with BPF_FETCH + - smb: client: fix deadlock in smb2_find_smb_tcon() + - smp: Provide 'setup_max_cpus' definition on UP too + - drm/xe/bb: assert width in xe_bb_create_job() + - crypto: starfive - Do not free stack buffer + - btrfs: qgroup: fix initialization of auto inherit array + - wifi: rtl8xxxu: enable MFP support with security flag of RX descriptor + - media: mgb4: Fix double debugfs remove + - media: ov2740: Fix LINK_FREQ and PIXEL_RATE control value reporting + - firmware: qcom_scm: disable clocks if qcom_scm_bw_enable() fails + - LoongArch: Fix built-in DTB detection + - LoongArch: Fix entry point in kernel image header + - clk: qcom: apss-ipq-pll: use stromer ops for IPQ5018 to fix boot failure + - net/tcp: Don't consider TCP_CLOSE in TCP_AO_ESTABLISHED + - selftests: net: lib: support errexit with busywait + - selftests: net: lib: avoid error removing empty netns name + - cpufreq: amd-pstate: Fix the inconsistency in max frequency units + - mm/memory-failure: fix handling of dissolved but not taken off from buddy + pages + - selftests/mm: compaction_test: fix bogus test success on Aarch64 + - irqchip/riscv-intc: Prevent memory leak when riscv_intc_init_common() fails + - Revert "perf record: Reduce memory for recording PERF_RECORD_LOST_SAMPLES + event" + - hwmon: (ltc2992) Fix memory leak in ltc2992_parse_dt() + - riscv: enable HAVE_ARCH_HUGE_VMAP for XIP kernel + - btrfs: qgroup: update rescan message levels and error codes + - btrfs: qgroup: fix qgroup id collision across mounts + - btrfs: cache folio size and shift in extent_buffer + - btrfs: protect folio::private when attaching extent buffer folios + - bpf: fix multi-uprobe PID filtering logic + - powerpc/64/bpf: fix tail calls for PCREL 
addressing + - nilfs2: fix potential kernel bug due to lack of writeback flag waiting + - nilfs2: fix nilfs_empty_dir() misjudgment and long loop on I/O errors + - Upstream stable to v6.6.34, v6.9.5 + + * Noble update: upstream stable patchset 2024-07-19 (LP: #2073603) + - perf record: Delete session after stopping sideband thread + - perf probe: Add missing libgen.h header needed for using basename() + - iio: core: Leave private pointer NULL when no private data supplied + - greybus: lights: check return of get_channel_from_mode + - phy: qcom: qmp-combo: fix duplicate return in qmp_v4_configure_dp_phy + - f2fs: multidev: fix to recognize valid zero block address + - f2fs: fix to wait on page writeback in __clone_blkaddrs() + - fpga: manager: add owner module and take its refcount + - fpga: bridge: add owner module and take its refcount + - counter: linux/counter.h: fix Excess kernel-doc description warning + - perf annotate: Get rid of duplicate --group option item + - usb: typec: ucsi: always register a link to USB PD device + - usb: typec: ucsi: simplify partner's PD caps registration + - perf stat: Do not fail on metrics on s390 z/VM systems + - soundwire: cadence: fix invalid PDI offset + - dmaengine: idma64: Add check for dma_set_max_seg_size + - firmware: dmi-id: add a release callback function + - perf annotate: Fix annotation_calc_lines() to pass correct address to + get_srcline() + - serial: max3100: Lock port->lock when calling uart_handle_cts_change() + - serial: max3100: Update uart_driver_registered on driver removal + - serial: max3100: Fix bitwise types + - greybus: arche-ctrl: move device table to its right location + - PCI: tegra194: Fix probe path for Endpoint mode + - serial: sc16is7xx: add proper sched.h include for sched_set_fifo() + - module: don't ignore sysfs_create_link() failures + - interconnect: qcom: qcm2290: Fix mas_snoc_bimc QoS port assignment + - arm64: dts: meson: fix S4 power-controller node + - perf tests: Make "test data symbol" more robust on Neoverse N1 + - perf tests: Apply attributes to all events in object code reading test + - perf record: Fix debug message placement for test consumption + - dt-bindings: PCI: rcar-pci-host: Add missing IOMMU properties + - perf bench uprobe: Remove lib64 from libc.so.6 binary path + - f2fs: compress: fix to relocate check condition in + f2fs_{release,reserve}_compress_blocks() + - f2fs: compress: fix to relocate check condition in + f2fs_ioc_{,de}compress_file() + - f2fs: fix to relocate check condition in f2fs_fallocate() + - f2fs: fix to check pinfile flag in f2fs_move_file_range() + - iio: adc: stm32: Fixing err code to not indicate success + - riscv: dts: starfive: visionfive 2: Remove non-existing TDM hardware + - coresight: etm4x: Fix unbalanced pm_runtime_enable() + - perf docs: Document bpf event modifier + - perf test shell arm_coresight: Increase buffer size for Coresight basic + tests + - iio: pressure: dps310: support negative temperature values + - iio: buffer-dmaengine: export buffer alloc and free functions + - iio: add the IIO backend framework + - [CONFIG] Update CONFIG_IIO_BACKEND + - iio: adc: ad9467: convert to backend framework + - [Config] Update CONFIG_AD9467 + - iio: adc: adi-axi-adc: move to backend framework + - [Config] Update CONFIG_ADI_AXI_ADC + - iio: adc: adi-axi-adc: only error out in major version mismatch + - coresight: etm4x: Do not hardcode IOMEM access for register restore + - coresight: etm4x: Do not save/restore Data trace control registers + - coresight: etm4x: Safe access 
for TRCQCLTR + - coresight: etm4x: Fix access to resource selector registers + - vfio/pci: fix potential memory leak in vfio_intx_enable() + - fpga: region: add owner module and take its refcount + - udf: Remove GFP_NOFS allocation in udf_expand_file_adinicb() + - udf: Convert udf_expand_file_adinicb() to use a folio + - microblaze: Remove gcc flag for non existing early_printk.c file + - microblaze: Remove early printk call from cpuinfo-static.c + - PCI: Wait for Link Training==0 before starting Link retrain + - perf intel-pt: Fix unassigned instruction op (discovered by MemorySanitizer) + - leds: pwm: Disable PWM when going to suspend + - ovl: remove upper umask handling from ovl_create_upper() + - PCI: of_property: Return error for int_map allocation failure + - VMCI: Fix an error handling path in vmci_guest_probe_device() + - dt-bindings: pinctrl: mediatek: mt7622: fix array properties + - pinctrl: qcom: pinctrl-sm7150: Fix sdc1 and ufs special pins regs + - watchdog: cpu5wdt.c: Fix use-after-free bug caused by cpu5wdt_trigger + - watchdog: bd9576: Drop "always-running" property + - watchdog: sa1100: Fix PTR_ERR_OR_ZERO() vs NULL check in sa1100dog_probe() + - dt-bindings: phy: qcom,sc8280xp-qmp-ufs-phy: fix msm899[68] power-domains + - dt-bindings: phy: qcom,usb-snps-femto-v2: use correct fallback for sc8180x + - dmaengine: idxd: Avoid unnecessary destruction of file_ida + - usb: gadget: u_audio: Fix race condition use of controls after free during + gadget unbind. + - usb: gadget: u_audio: Clear uac pointer when freed. + - stm class: Fix a double free in stm_register_device() + - ppdev: Add an error check in register_device + - i2c: cadence: Avoid fifo clear after start + - i2c: synquacer: Fix an error handling path in synquacer_i2c_probe() + - perf bench internals inject-build-id: Fix trap divide when collecting just + one DSO + - perf ui browser: Don't save pointer to stack memory + - extcon: max8997: select IRQ_DOMAIN instead of depending on it + - dt-bindings: spmi: hisilicon,hisi-spmi-controller: fix binding references + - PCI/EDR: Align EDR_PORT_DPC_ENABLE_DSM with PCI Firmware r3.3 + - PCI/EDR: Align EDR_PORT_LOCATE_DSM with PCI Firmware r3.3 + - f2fs: support printk_ratelimited() in f2fs_printk() + - f2fs: use BLKS_PER_SEG, BLKS_PER_SEC, and SEGS_PER_SEC + - f2fs: separate f2fs_gc_range() to use GC for a range + - f2fs: kill heap-based allocation + - f2fs: support file pinning for zoned devices + - f2fs: fix block migration when section is not aligned to pow2 + - perf ui browser: Avoid SEGV on title + - perf report: Avoid SEGV in report__setup_sample_type() + - perf thread: Fixes to thread__new() related to initializing comm + - perf symbols: Fix ownership of string in dso__load_vmlinux() + - f2fs: compress: fix to update i_compr_blocks correctly + - f2fs: deprecate io_bits + - f2fs: introduce get_available_block_count() for cleanup + - f2fs: compress: fix error path of inc_valid_block_count() + - f2fs: compress: fix to cover {reserve,release}_compress_blocks() w/ cp_rwsem + lock + - f2fs: fix to release node block count in error path of f2fs_new_node_page() + - f2fs: compress: don't allow unaligned truncation on released compress inode + - serial: sh-sci: protect invalidating RXDMA on shutdown + - libsubcmd: Fix parse-options memory leak + - perf daemon: Fix file leak in daemon_session__control + - f2fs: fix to add missing iput() in gc_data_segment() + - usb: fotg210: Add missing kernel doc description + - perf stat: Don't display metric header for non-leader uncore events + 
- perf tools: Use pmus to describe type from attribute + - perf tools: Add/use PMU reverse lookup from config to name + - perf pmu: Assume sysfs events are always the same case + - perf pmu: Count sys and cpuid JSON events separately + - LoongArch: Fix callchain parse error with kernel tracepoint events again + - s390/vdso64: filter out munaligned-symbols flag for vdso + - s390/vdso: Generate unwind information for C modules + - s390/vdso: Create .build-id links for unstripped vdso files + - s390/vdso: Use standard stack frame layout + - s390/ipl: Fix incorrect initialization of len fields in nvme reipl block + - s390/ipl: Fix incorrect initialization of nvme dump block + - s390/boot: Remove alt_stfle_fac_list from decompressor + - dt-bindings: PCI: rockchip,rk3399-pcie: Add missing maxItems to ep-gpios + - gpiolib: acpi: Fix failed in acpi_gpiochip_find() by adding parent node + match + - eventfs: Do not differentiate the toplevel events directory + - eventfs: Create eventfs_root_inode to store dentry + - eventfs/tracing: Add callback for release of an eventfs_inode + - eventfs: Free all of the eventfs_inode after RCU + - eventfs: Have "events" directory get permissions from its parent + - dt-bindings: adc: axi-adc: update bindings for backend framework + - dt-bindings: adc: axi-adc: add clocks property + - Input: ims-pcu - fix printf string overflow + - mmc: sdhci_am654: Add tuning algorithm for delay chain + - mmc: sdhci_am654: Write ITAPDLY for DDR52 timing + - mmc: sdhci_am654: Add OTAP/ITAP delay enable + - mmc: sdhci_am654: Add ITAPDLYSEL in sdhci_j721e_4bit_set_clock + - mmc: sdhci_am654: Fix ITAPDLY for HS400 timing + - Input: pm8xxx-vibrator - correct VIB_MAX_LEVELS calculation + - media: v4l: Don't turn on privacy LED if streamon fails + - media: ov2680: Clear the 'ret' variable on success + - media: ov2680: Allow probing if link-frequencies is absent + - media: ov2680: Do not fail if data-lanes property is absent + - drm/msm/dsi: Print dual-DSI-adjusted pclk instead of original mode pclk + - drm/msm/dpu: Always flush the slave INTF on the CTL + - drm/mediatek: dp: Fix mtk_dp_aux_transfer return value + - drm/meson: gate px_clk when setting rate + - um: Fix return value in ubd_init() + - um: vector: fix bpfflash parameter evaluation + - fs/ntfs3: Check 'folio' pointer for NULL + - fs/ntfs3: Use 64 bit variable to avoid 32 bit overflow + - fs/ntfs3: Use variable length array instead of fixed size + - drm/msm/dpu: Add callback function pointer check before its call + - drm/bridge: tc358775: fix support for jeida-18 and jeida-24 + - media: stk1160: fix bounds checking in stk1160_copy_video() + - Input: cyapa - add missing input core locking to suspend/resume functions + - drm/amdgpu: init microcode chip name from ip versions + - drm/amdgpu: Fix buffer size in gfx_v9_4_3_init_ cp_compute_microcode() and + rlc_microcode() + - media: mediatek: vcodec: fix possible unbalanced PM counter + - tools/arch/x86/intel_sdsi: Fix maximum meter bundle length + - tools/arch/x86/intel_sdsi: Fix meter_show display + - tools/arch/x86/intel_sdsi: Fix meter_certificate decoding + - platform/x86: thinkpad_acpi: Take hotkey_mutex during hotkey_exit() + - media: flexcop-usb: fix sanity check of bNumEndpoints + - powerpc/pseries: Add failure related checks for h_get_mpp and h_get_ppp + - um: Fix the -Wmissing-prototypes warning for __switch_mm + - um: Fix the -Wmissing-prototypes warning for get_thread_reg + - um: Fix the declaration of kasan_map_memory + - cxl/trace: Correct DPA field masks for 
general_media & dram events + - cxl/region: Fix cxlr_pmem leaks + - media: sunxi: a83-mips-csi2: also select GENERIC_PHY + - media: cec: cec-adap: always cancel work in cec_transmit_msg_fh + - media: cec: cec-api: add locking in cec_release() + - media: cec: core: avoid recursive cec_claim_log_addrs + - media: cec: core: avoid confusing "transmit timed out" message + - Revert "drm/bridge: ti-sn65dsi83: Fix enable error path" + - drm: zynqmp_dpsub: Always register bridge + - selftests/powerpc/dexcr: Add -no-pie to hashchk tests + - drm/msm/a6xx: Avoid a nullptr dereference when speedbin setting fails + - ASoC: tas2781: Fix a warning reported by robot kernel test + - null_blk: Fix the WARNING: modpost: missing MODULE_DESCRIPTION() + - ALSA: hda/cs_dsp_ctl: Use private_free for control cleanup + - ALSA: hda: cs35l56: Fix lifetime of cs_dsp instance + - ASoC: mediatek: mt8192: fix register configuration for tdm + - drm/nouveau: use tile_mode and pte_kind for VM_BIND bo allocations + - blk-cgroup: fix list corruption from resetting io stat + - blk-cgroup: fix list corruption from reorder of WRITE ->lqueued + - blk-cgroup: Properly propagate the iostat update up the hierarchy + - regulator: bd71828: Don't overwrite runtime voltages + - xen/x86: add extra pages to unpopulated-alloc if available + - perf/arm-dmc620: Fix lockdep assert in ->event_init() + - x86/kconfig: Select ARCH_WANT_FRAME_POINTERS again when + UNWINDER_FRAME_POINTER=y + - [Config] Update CONFIG_ARCH_WANT_FRAME_POINTERS + - net: Always descend into dsa/ folder with CONFIG_NET_DSA enabled + - ipv6: sr: fix missing sk_buff release in seg6_input_core + - selftests: net: kill smcrouted in the cleanup logic in amt.sh + - nfc: nci: Fix uninit-value in nci_rx_work + - ASoC: tas2552: Add TX path for capturing AUDIO-OUT data + - ASoC: tas2781: Fix wrong loading calibrated data sequence + - NFSv4: Fixup smatch warning for ambiguous return + - nfs: keep server info for remounts + - sunrpc: fix NFSACL RPC retry on soft mount + - rpcrdma: fix handling for RDMA_CM_EVENT_DEVICE_REMOVAL + - regulator: pickable ranges: don't always cache vsel + - regulator: tps6287x: Force writing VSEL bit + - af_unix: Update unix_sk(sk)->oob_skb under sk_receive_queue lock. + - ipv6: sr: fix memleak in seg6_hmac_init_algo + - regulator: tps6594-regulator: Correct multi-phase configuration + - tcp: Fix shift-out-of-bounds in dctcp_update_alpha(). + - pNFS/filelayout: fixup pNfs allocation modes + - openvswitch: Set the skbuff pkt_type for proper pmtud support. + - arm64: asm-bug: Add .align 2 to the end of __BUG_ENTRY + - rv: Update rv_en(dis)able_monitor doc to match kernel-doc + - net: lan966x: Remove ptp traps in case the ptp is not enabled. + - virtio: delete vq in vp_find_vqs_msix() when request_irq() fails + - i3c: master: svc: change ENXIO to EAGAIN when IBI occurs during start frame + - Revert "ixgbe: Manual AN-37 for troublesome link partners for X550 SFI" + - net: fec: avoid lock evasion when reading pps_enable + - tls: fix missing memory barrier in tls_init + - net: relax socket state check at accept time. 
+ - nfc: nci: Fix handling of zero-length payload packets in nci_rx_work() + - drivers/xen: Improve the late XenStore init protocol + - ice: Interpret .set_channels() input differently + - kasan, fortify: properly rename memintrinsics + - tracing/probes: fix error check in parse_btf_field() + - tpm_tis_spi: Account for SPI header when allocating TPM SPI xfer buffer + - netfilter: nfnetlink_queue: acquire rcu_read_lock() in + instance_destroy_rcu() + - netfilter: ipset: Add list flush to cancel_gc + - netfilter: nft_payload: restore vlan q-in-q match support + - spi: Don't mark message DMA mapped when no transfer in it is + - dma-mapping: benchmark: fix up kthread-related error handling + - dma-mapping: benchmark: fix node id validation + - dma-mapping: benchmark: handle NUMA_NO_NODE correctly + - nvme-multipath: fix io accounting on failover + - nvmet: fix ns enable/disable possible hang + - drm/amd/display: Enable colorspace property for MST connectors + - net: phy: micrel: set soft_reset callback to genphy_soft_reset for KSZ8061 + - net/mlx5: Lag, do bond only if slaves agree on roce state + - net/mlx5: Fix MTMP register capability offset in MCAM register + - net/mlx5: Use mlx5_ipsec_rx_status_destroy to correctly delete status rules + - net/mlx5e: Fix IPsec tunnel mode offload feature check + - net/mlx5e: Use rx_missed_errors instead of rx_dropped for reporting buffer + exhaustion + - net/mlx5e: Fix UDP GSO for encapsulated packets + - dma-buf/sw-sync: don't enable IRQ from sync_print_obj() + - bpf: Fix potential integer overflow in resolve_btfids + - ALSA: jack: Use guard() for locking + - ALSA: core: Remove debugfs at disconnection + - ALSA: hda/realtek: Adjust G814JZR to use SPI init for amp + - enic: Validate length of nl attributes in enic_set_vf_port + - af_unix: Annotate data-race around unix_sk(sk)->addr. + - af_unix: Read sk->sk_hash under bindlock during bind(). 
+ - Octeontx2-pf: Free send queue buffers incase of leaf to inner + - net: usb: smsc95xx: fix changing LED_SEL bit value updated from EEPROM + - ASoC: cs42l43: Only restrict 44.1kHz for the ASP + - bpf: Allow delete from sockmap/sockhash only if update is allowed + - net:fec: Add fec_enet_deinit() + - net: micrel: Fix lan8841_config_intr after getting out of sleep mode + - ice: fix accounting if a VLAN already exists + - selftests: mptcp: simult flows: mark 'unbalanced' tests as flaky + - selftests: mptcp: add ms units for tc-netem delay + - selftests: mptcp: join: mark 'fail' tests as flaky + - ALSA: seq: Fix missing bank setup between MIDI1/MIDI2 UMP conversion + - ALSA: seq: Don't clear bank selection at event -> UMP MIDI2 conversion + - net: ti: icssg-prueth: Fix start counter for ft1 filter + - netfilter: nft_payload: skbuff vlan metadata mangle support + - netfilter: tproxy: bail out if IP has been disabled on the device + - netfilter: nft_fib: allow from forward/input without iif selector + - net/sched: taprio: make q->picos_per_byte available to fill_sched_entry() + - net/sched: taprio: extend minimum interval restriction to entire cycle too + - kconfig: fix comparison to constant symbols, 'm', 'n' + - drm/i915/guc: avoid FIELD_PREP warning + - kheaders: use `command -v` to test for existence of `cpio` + - spi: stm32: Don't warn about spurious interrupts + - net: dsa: microchip: fix RGMII error in KSZ DSA driver + - net: ena: Reduce lines with longer column width boundary + - net: ena: Fix redundant device NUMA node override + - ipvlan: Dont Use skb->sk in ipvlan_process_v{4,6}_outbound + - ALSA: seq: Fix yet another spot for system message conversion + - powerpc/pseries/lparcfg: drop error message from guest name lookup + - drm/panel: sitronix-st7789v: fix timing for jt240mhqs_hwt_ek_e3 panel + - drm/panel: sitronix-st7789v: tweak timing for jt240mhqs_hwt_ek_e3 panel + - drm/panel: sitronix-st7789v: fix display size for jt240mhqs_hwt_ek_e3 panel + - hwmon: (intel-m10-bmc-hwmon) Fix multiplier for N6000 board power sensor + - hwmon: (shtc1) Fix property misspelling + - ALSA: seq: ump: Fix swapped song position pointer data + - ALSA: timer: Set lower bound of start tick time + - x86/efistub: Omit physical KASLR when memory reservations exist + - efi: libstub: only free priv.runtime_map when allocated + - x86/pci: Skip early E820 check for ECAM region + - KVM: x86: Don't advertise guest.MAXPHYADDR as host.MAXPHYADDR in CPUID + - genirq/cpuhotplug, x86/vector: Prevent vector leak during CPU offline + - platform/x86/intel/tpmi: Handle error from tpmi_process_info() + - platform/x86/intel-uncore-freq: Don't present root domain on error + - perf sched timehist: Fix -g/--call-graph option failure + - f2fs: write missing last sum blk of file pinning section + - f2fs: use f2fs_{err,info}_ratelimited() for cleanup + - SUNRPC: Fix loop termination condition in gss_free_in_token_pages() + - riscv: prevent pt_regs corruption for secondary idle threads + - riscv: stacktrace: fixed walk_stackframe() + - perf build: Fix out of tree build related to installation of sysreg-defs + - dt-bindings: pinctrl: qcom: update functions to match with driver + - usb: typec: ucsi: allow non-partner GET_PDOS for Qualcomm devices + - perf report: Fix PAI counter names for s390 virtual machines + - PCI: dwc: ep: Fix DBI access failure for drivers requiring refclk from host + - perf map: Remove kernel map before updating start and end addresses + - riscv: dts: starfive: visionfive 2: Remove non-existing I2S hardware 
+ - pinctrl: renesas: rzg2l: Limit 2.5V power supply to Ethernet interfaces + - riscv: Flush the instruction cache during SMP bringup + - usb: xhci: check if 'requested segments' exceeds ERST capacity + - spmi: pmic-arb: Replace three IS_ERR() calls by null pointer checks in + spmi_pmic_arb_probe() + - perf symbols: Remove map from list before updating addresses + - perf symbols: Update kcore map before merging in remaining symbols + - s390/ftrace: Use unwinder instead of __builtin_return_address() + - s390/stacktrace: Merge perf_callchain_user() and arch_stack_walk_user() + - s390/stacktrace: Skip first user stack frame + - s390/stacktrace: Improve detection of invalid instruction pointers + - s390/vdso: Introduce and use struct stack_frame_vdso_wrapper + - s390/stackstrace: Detect vdso stack frames + - s390/ap: Fix bind complete udev event sent after each AP bus scan + - ocfs2: correctly use ocfs2_find_next_zero_bit() + - mailbox: mtk-cmdq: Fix pm_runtime_get_sync() warning in mbox shutdown + - Input: ioc3kbd - add device table + - phy: qcom: qmp-combo: fix sm8650 voltage swing table + - media: ti: j721e-csi2rx: Fix races while restarting DMA + - drm/msm/dpu: Allow configuring multiple active DSC blocks + - drm: Make drivers depends on DRM_DW_HDMI + - [Config] Drivers now depend on DRM_DW_HDMI + - string: Prepare to merge strscpy_kunit.c into string_kunit.c + - string: Prepare to merge strcat KUnit tests into string_kunit.c + - drm/msm/adreno: fix CP cycles stat retrieval on a7xx + - printk: Fix LOG_CPU_MAX_BUF_SHIFT when BASE_SMALL is enabled + - powerpc/bpf/32: Fix failing test_bpf tests + - KVM: PPC: Book3S HV nestedv2: Cancel pending DEC exception + - KVM: PPC: Book3S HV nestedv2: Fix an error handling path in + gs_msg_ops_kvmhv_nestedv2_config_fill_info() + - KVM: arm64: Destroy mpidr_data for 'late' vCPU creation + - Bluetooth: ISO: Handle PA sync when no BIGInfo reports are generated + - Bluetooth: L2CAP: Fix div-by-zero in l2cap_le_flowctl_init() + - ubsan: Restore dependency on ARCH_HAS_UBSAN + - selftests: forwarding: Have RET track kselftest framework constants + - selftests: forwarding: Convert log_test() to recognize RET values + - selftests: net: Unify code of busywait() and slowwait() + - selftests/net: use tc rule to filter the na packet + - virtio_balloon: Give the balloon its own wakeup source + - riscv: cpufeature: Fix thead vector hwcap removal + - riscv: cpufeature: Fix extension subset checking + - riscv: selftests: Add hwprobe binaries to .gitignore + - idpf: Interpret .set_channels() input differently + - null_blk: fix null-ptr-dereference while configuring 'power' and + 'submit_queues' + - netfs: Fix setting of BDP_ASYNC from iocb flags + - cifs: Set zero_point in the copy_file_range() and remap_file_range() + - cifs: Fix missing set of remote_i_size + - selftests: net: lib: set 'i' as local + - nvme: fix multipath batched completion accounting + - netkit: Fix setting mac address in l2 mode + - netkit: Fix pkt_type override upon netkit pass verdict + - null_blk: Fix return value of nullb_device_power_store() + - idpf: don't enable NAPI and interrupts prior to allocating Rx buffers + - selftests: mptcp: join: mark 'fastclose' tests as flaky + - drm/xe: Add dbg messages on the suspend resume functions. 
+ - drm/xe: check pcode init status only on root gt of root tile
+ - drm/xe: Change pcode timeout to 50msec while polling again
+ - drm/xe: Only use reserved BCS instances for usm migrate exec queue
+ - sd: also set max_user_sectors when setting max_sectors
+ - block: stack max_user_sectors
+ - ipv6: introduce dst_rt6_info() helper
+ - inet: introduce dst_rtable() helper
+ - net: fix __dst_negative_advice() race
+ - ice: fix 200G PHY types to link speed mapping
+ - x86/topology/intel: Unlock CPUID before evaluating anything
+ - Upstream stable to v6.6.33, v6.9.4
+
+ * Reenable CONFIG_UBSAN for noble (LP: #2076650)
+ - ubsan: Remove CONFIG_UBSAN_SANITIZE_ALL
+ - [Config] Remove CONFIG_UBSAN_SANITIZE_ALL
+
+ * Dangling symlink to linux-lib-rust when Rust is disabled (LP: #2072592)
+ - [Packaging] Check do_lib_rust before linking Rust lib files
+
+ * kdump doesn't work with UEFI secure boot and kernel lockdown enabled on
+ ARM64 (LP: #2033007)
+ - [Config]: Enable CONFIG_KEXEC_IMAGE_VERIFY_SIG on arm64
+
+ * net/sched: Fix conntrack use-after-free (LP: #2073092)
+ - net/sched: Fix UAF when resolving a clash
+
+ * No sound on Huawei Matebook D14 AMD since Linux 6.8.0-38 [regression]
+ (LP: #2073049)
+ - ASoC: amd: acp: fix for acp platform device creation failure
+
+ * i915: Fixup regressions introduced with enabling single CCS engine
+ (LP: #2072755)
+ - drm/i915/gt: Fix CCS id's calculation for CCS mode setting
+
+ * [Ubuntu 24.04] FW1060.00 (NH1060_026) sosreport is running to Kernel OOPS
+ crash (LP: #2070358)
+ - nfsd: initialise nfsd_info.mutex early.
+
+ * 6.8 generic & amdpgu / polaris (LP: #2072428)
+ - drm/amdgpu: Adjust logic in amdgpu_device_partner_bandwidth()
+
+ * md: nvme over tcp with a striped underlying md raid device leads to data
+ corruption (LP: #2075110)
+ - md/md-bitmap: fix writing non bitmap pages
+
+ * Linux 6.8 fails to boot on ARM64 if any param is more than 146 chars
+ (LP: #2069534)
+ - SAUCE: arm64: v6.8: cmdline param >= 146 chars kills kernel
+
+ * CVE-2024-39484
+ - mmc: davinci: Don't strip remove function when driver is builtin
+
+ * CVE-2024-39292
+ - um: Add winch to winch_handlers before registering winch IRQ
+
+ * Miscellaneous upstream changes
+ - bnx2x: Fix multiple UBSAN array-index-out-of-bounds
+
+ -- Roxana Nicolescu Tue, 13 Aug 2024 12:20:36 +0200
diff --git a/debian.nvidia-adv/config/README.rst b/debian.nvidia-adv/config/README.rst
new file mode 100644
index 0000000000000..751ce7f3b284d
--- /dev/null
+++ b/debian.nvidia-adv/config/README.rst
@@ -0,0 +1,185 @@
+==================
+Config Annotations
+==================
+
+:Author: Andrea Righi
+
+Overview
+========
+
+Each Ubuntu kernel needs to maintain its own .config for each supported
+architecture and each flavour.
+
+Every time a new patch is applied or a kernel is rebased on top of a new
+one, we need to update the .config files accordingly (config options can
+be added, removed or renamed).
+
+So, we need to make sure that certain critical config options always
+match the desired value in order to have a functional kernel.
+
+State of the art
+================
+
+At the moment configs are maintained as a set of Kconfig chunks (inside
+`debian.<derivative>/config/`): a global one, plus per-arch / per-flavour
+chunks.
+
+In addition to that, we also need to maintain a file called
+'annotations'; the purpose of this file is to make sure that some
+critical config options are not silently removed or changed when the
+real .config is re-generated (for example after a rebase or after
+applying a new set of patches).
+
+The main problem with this approach is that we often end up with
+duplicate information, stored both in the Kconfig chunks *and* in the
+annotations file; at the same time, the overall .config information is
+spread across Kconfig chunks and annotations, making it hard to
+maintain, review and manage in general.
+
+Proposed solution
+=================
+
+The proposed solution is to store all the config information in the
+"annotations" format and get rid of the config chunks (the real
+.config files can then be produced by "compiling" the annotations).
+
+Implementation
+==============
+
+To help manage the annotations, a helper script is provided
+(`debian/scripts/misc/annotations`):
+
+```
+usage: annotations [-h] [--version] [--file FILE] [--arch ARCH] [--flavour FLAVOUR] [--config CONFIG]
+ (--query | --export | --import FILE | --update FILE | --check FILE)
+
+Manage Ubuntu kernel .config and annotations
+
+options:
+ -h, --help show this help message and exit
+ --version, -v show program's version number and exit
+ --file FILE, -f FILE Pass annotations or .config file to be parsed
+ --arch ARCH, -a ARCH Select architecture
+ --flavour FLAVOUR, -l FLAVOUR
+ Select flavour (default is "generic")
+ --config CONFIG, -c CONFIG
+ Select a specific config option
+
+Action:
+ --query, -q Query annotations
+ --export, -e Convert annotations to .config format
+ --import FILE, -i FILE
+ Import a full .config for a specific arch and flavour into annotations
+ --update FILE, -u FILE
+ Import a partial .config into annotations (only resync configs specified in FILE)
+ --check FILE, -k FILE
+ Validate kernel .config with annotations
+```
+
+This script can query config settings (per arch/flavour/config), export
+them to .config format (generating the real .config files) and check
+whether the final .config matches the rules defined in the annotations.
+
+Examples (`annotations` is defined as an alias to
+`debian/scripts/misc/annotations`):
+
+ - Show settings for `CONFIG_DEBUG_INFO_BTF` for the master kernel across
+ all the supported architectures and flavours:
+
+```
+$ annotations --query --config CONFIG_DEBUG_INFO_BTF
+{
+ "policy": {
+ "amd64": "y",
+ "arm64": "y",
+ "armhf": "n",
+ "ppc64el": "y",
+ "riscv64": "y",
+ "s390x": "y"
+ },
+ "note": "'Needs newer pahole for armhf'"
+}
+```
+
+ - Dump the kernel .config for arm64, flavour generic-64k:
+
+```
+$ annotations --arch arm64 --flavour generic-64k --export
+CONFIG_DEBUG_FS=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_COMPAT=y
+...
+```
+
+ - Update the annotations file with a new kernel .config for the amd64
+ flavour generic:
+
+```
+$ annotations --arch amd64 --flavour generic --import build/.config
+```
+
+Moreover, an additional kernelconfig command is provided
+(via a debian/rules target):
+ - `migrateconfigs`: automatically merge all the previous configs into
+ annotations (local changes still need to be committed)
+
+Annotations headers
+===================
+
+The main annotations file should contain a header defining the
+architectures and flavours that are supported.
+
+Here is the format of the header for the generic kernel:
+```
+# Menu: HEADER
+# FORMAT: 4
+# ARCH: amd64 arm64 armhf ppc64el riscv64 s390x
+# FLAVOUR: amd64-generic arm64-generic arm64-generic-64k armhf-generic armhf-generic-lpae ppc64el-generic riscv64-generic s390x-generic
+
+```
+
+Example header for a derivative (linux-aws):
+```
+# Menu: HEADER
+# FORMAT: 4
+# ARCH: amd64 arm64
+# FLAVOUR: amd64-aws arm64-aws
+# FLAVOUR_DEP: {'amd64-aws': 'amd64-generic', 'arm64-aws': 'arm64-generic'}
+
+include "../../debian.master/config/annotations"
+
+# Below you can define only the specific linux-aws configs that differ from the generic kernel
+
+```
+
+Pros and Cons
+=============
+
+ Pros:
+ - avoid duplicating information between .config files and annotations
+ - make it easy to define groups of config settings (for a specific
+ environment or feature, such as annotations.clouds, annotations.ubuntu,
+ annotations.snapd, etc.)
+ - config options are more accessible, easy to change and review
+ - we can easily document how config options are managed (and external
+ contributors won't be discouraged anymore when they need to change a
+ config option)
+
+ Cons:
+ - potential regressions: the new tool/scripts may have bugs, so we
+ could experience regressions caused by missed config changes
+ - the kernel team needs to understand the new process (even though
+ everything is transparent and the cranking process stays the same,
+ there might be corner cases that need to be addressed and resolved
+ manually)
+
+TODO
+====
+
+ - Migrate all flavour and arch definitions into annotations (rather
+ than having this information defined in multiple places inside
+ debian/scripts). Right now this information is only partially
+ migrated: arches and flavours must be defined in the header section
+ of annotations (so that the annotations tool can figure out the list
+ of supported arches and flavours), but they are still defined
+ elsewhere as well; ideally, arches and flavours would be defined in
+ only one place: annotations.
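+
+Checking a .config against annotations
+======================================
+
+As a final sanity check after re-generating a .config (e.g. during a
+rebase), the result can be validated against the annotations. This is a
+sketch; the .config path below is illustrative and depends on where the
+build placed it:
+
+```
+$ annotations --arch arm64 --flavour nvidia-adv --check build/.config
+```
+
+If the .config no longer satisfies the rules defined in the
+annotations, the check fails; either fix the generated config or update
+the annotations before committing.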
diff --git a/debian.nvidia-adv/config/annotations b/debian.nvidia-adv/config/annotations new file mode 100644 index 0000000000000..7d9fd0b373b69 --- /dev/null +++ b/debian.nvidia-adv/config/annotations @@ -0,0 +1,182 @@ +# Menu: HEADER +# FORMAT: 4 +# ARCH: amd64 arm64 +# FLAVOUR: amd64-nvidia-adv arm64-nvidia-adv arm64-nvidia-adv-64k +# FLAVOUR_DEP: {'amd64-nvidia-adv': 'amd64-generic', 'arm64-nvidia-adv': 'arm64-generic', 'arm64-nvidia-adv-64k': 'arm64-generic-64k'} + +include "../../debian.master/config/annotations" + +CONFIG_AAEON_IWMI_WDT policy<{'amd64': '-'}> +CONFIG_AAEON_IWMI_WDT note<'{Disable all Ubuntu ODM drivers}'> + +CONFIG_ARM64_ERRATUM_1902691 policy<{'arm64': 'y'}> +CONFIG_ARM64_ERRATUM_1902691 note<'{Required for Grace enablement}'> + +CONFIG_ARM64_ERRATUM_2038923 policy<{'arm64': 'y'}> +CONFIG_ARM64_ERRATUM_2038923 note<'{Required for Grace enablement}'> + +CONFIG_ARM64_ERRATUM_2064142 policy<{'arm64': 'y'}> +CONFIG_ARM64_ERRATUM_2064142 note<'{Required for Grace enablement}'> + +CONFIG_ARM64_ERRATUM_2119858 policy<{'arm64': 'y'}> +CONFIG_ARM64_ERRATUM_2119858 note<'{Required for Grace enablement}'> + +CONFIG_ARM64_ERRATUM_2139208 policy<{'arm64': 'y'}> +CONFIG_ARM64_ERRATUM_2139208 note<'{Required for Grace enablement}'> + +CONFIG_ARM64_ERRATUM_2224489 policy<{'arm64': 'y'}> +CONFIG_ARM64_ERRATUM_2224489 note<'{Required for Grace enablement}'> + +CONFIG_ARM64_ERRATUM_2253138 policy<{'arm64': 'y'}> +CONFIG_ARM64_ERRATUM_2253138 note<'{Required for Grace enablement}'> + +CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE policy<{'arm64': 'y'}> +CONFIG_ARM64_WORKAROUND_TRBE_OVERWRITE_FILL_MODE note<'{Required for Grace enablement}'> + +CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE policy<{'arm64': 'y'}> +CONFIG_ARM64_WORKAROUND_TRBE_WRITE_OUT_OF_RANGE note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT policy<{'arm64': 'm'}> +CONFIG_CORESIGHT note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_CATU policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_CATU note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_CPU_DEBUG policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_CPU_DEBUG note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_CPU_DEBUG_DEFAULT_ON policy<{'arm64': 'n'}> +CONFIG_CORESIGHT_CPU_DEBUG_DEFAULT_ON note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_CTI policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_CTI note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_CTI_INTEGRATION_REGS policy<{'arm64': 'n'}> +CONFIG_CORESIGHT_CTI_INTEGRATION_REGS note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_DUMMY policy<{'arm64': 'n'}> +CONFIG_CORESIGHT_DUMMY note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_LINKS_AND_SINKS policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_LINKS_AND_SINKS note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_LINK_AND_SINK_TMC policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_LINK_AND_SINK_TMC note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_SINK_ETBV10 policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_SINK_ETBV10 note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_SINK_TPIU policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_SINK_TPIU note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_SOURCE_ETM4X policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_SOURCE_ETM4X note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_STM policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_STM note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_TPDA policy<{'arm64': 'n'}> +CONFIG_CORESIGHT_TPDA note<'{Required for Grace enablement}'> + 
+CONFIG_CORESIGHT_TPDM policy<{'arm64': 'n'}> +CONFIG_CORESIGHT_TPDM note<'{Required for Grace enablement}'> + +CONFIG_CORESIGHT_TRBE policy<{'arm64': 'm'}> +CONFIG_CORESIGHT_TRBE note<'{Required for Grace enablement}'> + +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND policy<{'arm64': 'n'}> +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND note<'{required for nvidia workloads}'> + +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE policy<{'amd64': 'n', 'arm64': 'y'}> +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE note<'{required for nvidia workloads}'> + +CONFIG_DRM_NOUVEAU policy<{'amd64': 'n', 'arm64': 'm'}> +CONFIG_DRM_NOUVEAU note<'{Disable NOUVEAU driver}'> + +CONFIG_DRM_NOUVEAU_BACKLIGHT policy<{'amd64': '-', 'arm64': 'y'}> +CONFIG_DRM_NOUVEAU_BACKLIGHT note<'{Disable NOUVEAU driver}'> + +CONFIG_DRM_NOUVEAU_GSP_DEFAULT policy<{'amd64': '-', 'arm64': 'n'}> +CONFIG_DRM_NOUVEAU_GSP_DEFAULT note<'{Disable NOUVEAU driver}'> + +CONFIG_DRM_NOUVEAU_SVM policy<{'amd64': '-', 'arm64': 'n'}> +CONFIG_DRM_NOUVEAU_SVM note<'{Disable NOUVEAU driver}'> + +CONFIG_ETM4X_IMPDEF_FEATURE policy<{'arm64': 'n'}> +CONFIG_ETM4X_IMPDEF_FEATURE note<'{Required for Grace enablement}'> + +CONFIG_GPIO_AAEON policy<{'amd64': '-'}> +CONFIG_GPIO_AAEON note<'{Disable all Ubuntu ODM drivers}'> + +CONFIG_LEDS_AAEON policy<{'amd64': '-'}> +CONFIG_LEDS_AAEON note<'{Disable all Ubuntu ODM drivers}'> + +CONFIG_MFD_AAEON policy<{'amd64': '-'}> +CONFIG_MFD_AAEON note<'{Disable all Ubuntu ODM drivers}'> + +CONFIG_MTD policy<{'amd64': 'm', 'arm64': 'y'}> +CONFIG_MTD note<'boot essential on arm'> + +CONFIG_NOUVEAU_DEBUG policy<{'amd64': '-', 'arm64': '5'}> +CONFIG_NOUVEAU_DEBUG note<'{Disable NOUVEAU driver}'> + +CONFIG_NOUVEAU_DEBUG_DEFAULT policy<{'amd64': '-', 'arm64': '3'}> +CONFIG_NOUVEAU_DEBUG_DEFAULT note<'{Disable NOUVEAU driver}'> + +CONFIG_NOUVEAU_DEBUG_MMU policy<{'amd64': '-', 'arm64': 'n'}> +CONFIG_NOUVEAU_DEBUG_MMU note<'{Disable NOUVEAU driver}'> + +CONFIG_NOUVEAU_DEBUG_PUSH policy<{'amd64': '-', 'arm64': 'n'}> +CONFIG_NOUVEAU_DEBUG_PUSH note<'{Disable NOUVEAU driver}'> + +CONFIG_NR_CPUS policy<{'amd64': '8192', 'arm64': '512'}> +CONFIG_NR_CPUS note<'LP: #1864198'> + +CONFIG_PID_IN_CONTEXTIDR policy<{'arm64': 'y'}> +CONFIG_PID_IN_CONTEXTIDR note<'{Required for Grace enablement}'> + +CONFIG_PREEMPT_NONE policy<{'amd64': 'n', 'arm64': 'y'}> +CONFIG_PREEMPT_NONE note<'required for nvidia workloads'> + +CONFIG_PREEMPT_VOLUNTARY policy<{'amd64': 'y', 'arm64': 'n'}> +CONFIG_PREEMPT_VOLUNTARY note<'required for nvidia workloads'> + +CONFIG_RUST policy<{'amd64': 'n', 'arm64': '-'}> +CONFIG_RUST note<'Rust is disabled in derivatives'> + +CONFIG_RUST_IS_AVAILABLE policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_RUST_IS_AVAILABLE note<'Rust is disabled in derivatives'> + +CONFIG_SAMPLE_CORESIGHT_SYSCFG policy<{'arm64': 'n'}> +CONFIG_SAMPLE_CORESIGHT_SYSCFG note<'{Required for Grace enablement}'> + +CONFIG_SENSORS_AAEON policy<{'amd64': '-'}> +CONFIG_SENSORS_AAEON note<'{Disable all Ubuntu ODM drivers}'> + +CONFIG_SPI_TEGRA210_QUAD policy<{'arm64': 'y'}> +CONFIG_SPI_TEGRA210_QUAD note<'ensures the TPM is available before the IMA driver initializes'> + +CONFIG_TCG_TIS_SPI policy<{'amd64': 'm', 'arm64': 'y'}> +CONFIG_TCG_TIS_SPI note<'ensures the TPM is available before the IMA driver initializes'> + +CONFIG_UBUNTU_ODM_DRIVERS policy<{'amd64': 'n', 'arm64': 'n'}> +CONFIG_UBUNTU_ODM_DRIVERS note<'{Disable all Ubuntu ODM drivers}'> + +CONFIG_ULTRASOC_SMB policy<{'arm64': 'n'}> +CONFIG_ULTRASOC_SMB note<'{Required for Grace enablement}'> + + +# ---- Annotations without 
notes ----
+
+CONFIG_AX88796B_RUST_PHY policy<{'amd64': '-'}>
+CONFIG_BCH policy<{'amd64': 'm', 'arm64': 'y'}>
+CONFIG_BINDGEN_VERSION_TEXT policy<{'amd64': '-'}>
+CONFIG_CONSTRUCTORS policy<{'amd64': '-'}>
+CONFIG_EFI_CAPSULE_LOADER policy<{'amd64': 'm', 'arm64': 'y'}>
+CONFIG_MTD_NAND_CORE policy<{'amd64': 'm', 'arm64': 'y'}>
+CONFIG_RUSTC_VERSION_TEXT policy<{'amd64': '-'}>
+CONFIG_RUST_BUILD_ASSERT_ALLOW policy<{'amd64': '-'}>
+CONFIG_RUST_DEBUG_ASSERTIONS policy<{'amd64': '-'}>
+CONFIG_RUST_OVERFLOW_CHECKS policy<{'amd64': '-'}>
+CONFIG_RUST_PHYLIB_ABSTRACTIONS policy<{'amd64': '-'}>
+CONFIG_SAMPLES_RUST policy<{'amd64': '-'}>
diff --git a/debian.nvidia-adv/control.d/flavour-control.stub b/debian.nvidia-adv/control.d/flavour-control.stub
new file mode 100644
index 0000000000000..a7ba586e6909c
--- /dev/null
+++ b/debian.nvidia-adv/control.d/flavour-control.stub
@@ -0,0 +1,153 @@
+# Items that get replaced:
+# FLAVOUR
+# DESC
+# ARCH
+# SUPPORTED
+# TARGET
+# BOOTLOADER
+# =PROVIDES=
+#
+# Items marked with =FOO= are optional
+#
+# This file describes the template for packages that are created for each flavour
+# in debian/control.d/vars.*
+#
+# This file gets edited in a couple of places. See the debian/control.stub rule in
+# debian/rules. PKGVER, ABINUM, and SRCPKGNAME are all converted in the
+# process of creating debian/control.
+#
+# The flavour specific strings (ARCH, DESC, etc) are converted using values from the various
+# flavour files in debian/control.d/vars.*
+#
+# XXX: Leave the blank line before the first package!!
+
+Package: linux-image=SIGN-ME-PKG=-PKGVER-ABINUM-FLAVOUR
+Build-Profiles:
+Architecture: ARCH
+Section: kernel
+Priority: optional
+Provides: linux-image, fuse-module, =PROVIDES=${linux:rprovides}
+Depends: ${misc:Depends}, ${shlibs:Depends}, kmod, linux-base (>= 4.5ubuntu1~16.04.1), linux-modules-PKGVER-ABINUM-FLAVOUR
+Recommends: BOOTLOADER, initramfs-tools | linux-initramfs-tool
+Breaks: flash-kernel (<< 3.90ubuntu2) [arm64 armhf], s390-tools (<< 2.3.0-0ubuntu3) [s390x]
+Conflicts: linux-image=SIGN-PEER-PKG=-PKGVER-ABINUM-FLAVOUR
+Suggests: fdutils, SRCPKGNAME-tools, linux-headers-PKGVER-ABINUM-FLAVOUR, linux-modules-extra-PKGVER-ABINUM-FLAVOUR
+Description: Linux kernel image for version PKGVER on DESC
+ This package contains the=SIGN-ME-TXT= Linux kernel image for version PKGVER on
+ DESC.
+ .
+ Supports SUPPORTED processors.
+ .
+ TARGET
+ .
+ You likely do not want to install this package directly. Instead, install
+ the linux-FLAVOUR meta-package, which will ensure that upgrades work
+ correctly, and that supporting packages are also installed.
+
+Package: linux-modules-PKGVER-ABINUM-FLAVOUR
+Build-Profiles:
+Architecture: ARCH
+Section: kernel
+Priority: optional
+Depends: ${misc:Depends}, ${shlibs:Depends}
+Built-Using: ${linux:BuiltUsing}
+Description: Linux kernel modules for version PKGVER on DESC
+ Contains the corresponding System.map file, the modules built by the
+ packager, and scripts that try to ensure that the system is not left in an
+ unbootable state after an update.
+ .
+ Supports SUPPORTED processors.
+ .
+ TARGET
+ .
+ You likely do not want to install this package directly. Instead, install
+ the linux-FLAVOUR meta-package, which will ensure that upgrades work
+ correctly, and that supporting packages are also installed.
+
+Package: linux-modules-extra-PKGVER-ABINUM-FLAVOUR
+Build-Profiles:
+Architecture: ARCH
+Section: kernel
+Priority: optional
+Depends: ${misc:Depends}, ${shlibs:Depends}, linux-modules-PKGVER-ABINUM-FLAVOUR, wireless-regdb
+Description: Linux kernel extra modules for version PKGVER on DESC
+ This package contains the Linux kernel extra modules for version PKGVER on
+ DESC.
+ .
+ Also includes the corresponding System.map file, the modules built by the
+ packager, and scripts that try to ensure that the system is not left in an
+ unbootable state after an update.
+ .
+ Supports SUPPORTED processors.
+ .
+ TARGET
+ .
+ You likely do not want to install this package directly. Instead, install
+ the linux-FLAVOUR meta-package, which will ensure that upgrades work
+ correctly, and that supporting packages are also installed.
+
+Package: linux-headers-PKGVER-ABINUM-FLAVOUR
+Build-Profiles:
+Architecture: ARCH
+Section: devel
+Priority: optional
+Depends: ${misc:Depends}, SRCPKGNAME-headers-PKGVER-ABINUM, ${shlibs:Depends}
+Provides: linux-headers, linux-headers-3.0
+Description: Linux kernel headers for version PKGVER on DESC
+ This package provides kernel header files for version PKGVER on
+ DESC.
+ .
+ This is for sites that want the latest kernel headers. Please read
+ /usr/share/doc/linux-headers-PKGVER-ABINUM/debian.README.gz for details.
+
+Package: SRCPKGNAME-lib-rust-PKGVER-ABINUM-FLAVOUR
+Build-Profiles:
+Architecture: amd64
+Multi-Arch: foreign
+Section: devel
+Priority: optional
+Depends: ${misc:Depends}, coreutils
+Description: Rust library files related to Linux kernel version PKGVER
+ This package provides kernel library files for version PKGVER, which allow
+ out-of-tree kernel modules written in Rust to be compiled.
+
+Package: linux-image=SIGN-ME-PKG=-PKGVER-ABINUM-FLAVOUR-dbgsym
+Build-Profiles:
+Architecture: ARCH
+Section: devel
+Priority: optional
+Depends: ${misc:Depends}
+Provides: linux-debug
+Description: Linux kernel debug image for version PKGVER on DESC
+ This package provides the=SIGN-ME-TXT= kernel debug image for version PKGVER on
+ DESC.
+ .
+ This is for sites that wish to debug the kernel.
+ .
+ The kernel image contained in this package is NOT meant to boot from. It
+ is uncompressed, and unstripped. This package also includes the
+ unstripped modules.
+
+Package: linux-tools-PKGVER-ABINUM-FLAVOUR
+Build-Profiles:
+Architecture: ARCH
+Section: devel
+Priority: optional
+Depends: ${misc:Depends}, SRCPKGNAME-tools-PKGVER-ABINUM
+Description: Linux kernel version specific tools for version PKGVER-ABINUM
+ This package provides the architecture-dependent parts for kernel
+ version locked tools (such as perf and x86_energy_perf_policy) for
+ version PKGVER-ABINUM on
+ =HUMAN=.
+
+Package: linux-cloud-tools-PKGVER-ABINUM-FLAVOUR
+Build-Profiles:
+Architecture: ARCH
+Section: devel
+Priority: optional
+Depends: ${misc:Depends}, SRCPKGNAME-cloud-tools-PKGVER-ABINUM
+Description: Linux kernel version specific cloud tools for version PKGVER-ABINUM
+ This package provides the architecture-dependent parts for kernel
+ version locked tools for cloud for version PKGVER-ABINUM on
+ =HUMAN=.
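+
+# Example expansion (illustrative values, not substituted from this file):
+# with PKGVER=6.8.0, ABINUM=1001 and FLAVOUR=nvidia-adv, the stanzas above
+# produce binary packages such as linux-image-6.8.0-1001-nvidia-adv,
+# linux-modules-6.8.0-1001-nvidia-adv and linux-tools-6.8.0-1001-nvidia-adv.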
+ diff --git a/debian.nvidia-adv/control.d/nvidia-adv-64k.inclusion-list b/debian.nvidia-adv/control.d/nvidia-adv-64k.inclusion-list new file mode 100644 index 0000000000000..31b1d207b3fa5 --- /dev/null +++ b/debian.nvidia-adv/control.d/nvidia-adv-64k.inclusion-list @@ -0,0 +1,304 @@ +arch/*/{crypto,kernel,oprofile} +arch/*/kvm/kvm.ko +arch/powerpc/kvm/kvm-hv.ko +arch/powerpc/kvm/kvm-pr.ko +arch/powerpc/kvm/vfio.ko +arch/powerpc/platforms/powernv/opal-prd.ko +arch/s390/* +arch/x86/kvm/kvm-amd.ko +arch/x86/kvm/kvm-intel.ko +crypto/* +drivers/acpi/* +drivers/ata/acard-ahci.ko +drivers/ata/ahci.ko +drivers/ata/ahci_platform.ko +drivers/ata/ahci_tegra.ko +drivers/ata/ata_generic.ko +drivers/ata/libahci.ko +drivers/ata/libahci_platform.ko +drivers/block/brd.ko +drivers/block/cryptoloop.ko +drivers/block/floppy.ko +drivers/block/loop.ko +drivers/block/nbd.ko +drivers/block/rbd.ko +drivers/block/virtio_blk.ko +drivers/block/xen-blkfront.ko +drivers/bus/tegra-aconnect.ko +drivers/char/hangcheck-timer.ko +drivers/char/hw_random/powernv-rng.ko +drivers/char/hw_random/virtio-rng.ko +drivers/char/ipmi/* +drivers/char/ipmi/ipmi_msghandler.ko +drivers/char/lp.ko +drivers/char/nvram.ko +drivers/char/ppdev.ko +drivers/char/raw.ko +drivers/char/virtio_console.ko +drivers/clk/clk-max77686.ko +drivers/cpufreq/tegra186-cpufreq.ko +drivers/cpufreq/tegra194-cpufreq.ko +drivers/crypto/nx/* +drivers/crypto/vmx/vmx-crypto.ko +drivers/dma/tegra210-adma.ko +drivers/firmware/efi/* +drivers/firmware/iscsi_ibft.ko +drivers/gpio/gpio-max77620.ko +drivers/gpu/drm/ast/ast.ko +drivers/gpu/drm/bochs/bochs-drm.ko +drivers/gpu/drm/cirrus/cirrus.ko +drivers/gpu/drm/drm.ko +drivers/gpu/drm/drm_kms_helper.ko +drivers/gpu/drm/tegra/tegra-drm.ko +drivers/gpu/drm/ttm/ttm.ko +drivers/gpu/drm/vboxvideo/vboxvideo.ko +drivers/gpu/drm/virtio/virtio-gpu.ko +drivers/gpu/drm/vmwgfx/vmwgfx.ko +drivers/gpu/drm/xen/drm_xen_front.ko +drivers/gpu/host1x/host1x.ko +drivers/hid/hid-generic.ko +drivers/hid/hid-hyperv.ko +drivers/hid/hid.ko +drivers/hid/usbhid/usbhid.ko +drivers/hv/* +drivers/hwmon/ibmpowernv.ko +drivers/hwmon/pwm-fan.ko +drivers/hwtracing/coresight/* +drivers/i2c/busses/i2c-tegra-bpmp.ko +drivers/i2c/busses/i2c-tegra-bpmp.ko +drivers/i2c/busses/i2c-tegra.ko +drivers/infiniband/core/ib_addr.ko +drivers/infiniband/core/ib_cm.ko +drivers/infiniband/core/ib_core.ko +drivers/infiniband/core/ib_mad.ko +drivers/infiniband/core/ib_sa.ko +drivers/infiniband/core/iw_cm.ko +drivers/infiniband/core/rdma_cm.ko +drivers/infiniband/ulp/iser/ib_iser.ko +drivers/infiniband/ulp/isert/ib_isert.ko +drivers/input/evbug.ko +drivers/input/gameport/gameport.ko +drivers/input/input-leds.ko +drivers/input/joydev.ko +drivers/input/keyboard/gpio_keys.ko +drivers/input/misc/xen-kbdfront.ko +drivers/input/mouse/psmouse.ko +drivers/input/serio/hyperv-keyboard.ko +drivers/input/serio/serio_raw.ko +drivers/input/serio/serport.ko +drivers/input/touchscreen/usbtouchscreen.ko +drivers/leds/leds-powernv.ko +drivers/md/* +drivers/memory/tegra/tegra210-emc.ko +drivers/message/fusion* +drivers/misc/cxl/* +drivers/misc/eeprom/at24.ko +drivers/misc/vmw_balloon.ko +drivers/misc/vmw_vmci/vmw_vmci.ko +drivers/mmc/host/sdhci-tegra.ko +drivers/mtd/cmdlinepart.ko +drivers/mtd/devices/powernv_flash.ko +drivers/mtd/ofpart.ko +drivers/net/appletalk/ipddp.ko +drivers/net/bonding/bonding.ko +drivers/net/caif/caif_virtio.ko +drivers/net/dummy.ko +drivers/net/eql.ko +drivers/net/ethernet/8390/8390.ko +drivers/net/ethernet/8390/ne2k-pci.ko +drivers/net/ethernet/amazon/ena/ena.ko 
+drivers/net/ethernet/amd/pcnet32.ko +drivers/net/ethernet/broadcom/bnx2x/* +drivers/net/ethernet/broadcom/tg3.ko +drivers/net/ethernet/dec/tulip/* +drivers/net/ethernet/emulex/benet/* +drivers/net/ethernet/ibm/* +drivers/net/ethernet/intel/e1000/e1000.ko +drivers/net/ethernet/intel/e1000e/e1000e.ko +drivers/net/ethernet/intel/i40e/* +drivers/net/ethernet/intel/iavf/iavf.ko +drivers/net/ethernet/intel/igb/* +drivers/net/ethernet/intel/igbvf/igbvf.ko +drivers/net/ethernet/intel/ixgbe/* +drivers/net/ethernet/intel/ixgbevf/ixgbevf.ko +drivers/net/ethernet/mellanox/* +drivers/net/ethernet/netronome/nfp/nfp.ko +drivers/net/ethernet/realtek/8139cp.ko +drivers/net/ethernet/realtek/8139too.ko +drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.ko +drivers/net/ethernet/stmicro/stmmac/stmmac-platform.ko +drivers/net/ethernet/stmicro/stmmac/stmmac.ko +drivers/net/fddi/* +drivers/net/geneve.ko +drivers/net/hyperv/hv_netvsc.ko +drivers/net/ifb.ko +drivers/net/ipvlan/* +drivers/net/macvlan.ko +drivers/net/macvtap.ko +drivers/net/mii.ko +drivers/net/netconsole.ko +drivers/net/pcs/pcs-xpcs.ko +drivers/net/phy/marvell.ko +drivers/net/phy/phylink.ko +drivers/net/ppp/* +drivers/net/ppp/bsd_comp.ko +drivers/net/slip/* +drivers/net/veth.ko +drivers/net/virtio_net.ko +drivers/net/vmxnet3/vmxnet3.ko +drivers/net/vxlan.ko +drivers/net/wireguard/wireguard.ko +drivers/net/xen-netback/* +drivers/net/xen-netfront.ko +drivers/nvme/host/nvme.ko +drivers/nvmem/nvmem_core.ko +drivers/parport/parport.ko +drivers/parport/parport_pc.ko +drivers/pci/controller/dwc/pcie-tegra194.ko +drivers/pci/host/vmd.ko +drivers/phy/tegra/phy-tegra194-p2u.ko +drivers/pinctrl/pinctrl-max77620.ko +drivers/platform/x86/pvpanic.ko +drivers/pps/pps_core.ko +drivers/ptp/ptp.ko +drivers/pwm/pwm-tegra.ko +drivers/regulator/fixed.ko +drivers/regulator/max77620-regulator.ko +drivers/rtc/rtc-max77686.ko +drivers/rtc/rtc-tegra.ko +drivers/s390/* +drivers/s390/block/xpram.ko +drivers/scsi/BusLogic.ko +drivers/scsi/aacraid/* +drivers/scsi/cxlflash/* +drivers/scsi/device_handler/scsi_dh_alua.ko +drivers/scsi/device_handler/scsi_dh_emc.ko +drivers/scsi/device_handler/scsi_dh_hp_sw.ko +drivers/scsi/device_handler/scsi_dh_rdac.ko +drivers/scsi/hv_storvsc.ko +drivers/scsi/ibmvscsi/* +drivers/scsi/ipr.ko +drivers/scsi/iscsi_boot_sysfs.ko +drivers/scsi/iscsi_tcp.ko +drivers/scsi/libiscsi.ko +drivers/scsi/libiscsi_tcp.ko +drivers/scsi/libsas/* +drivers/scsi/lpfc/* +drivers/scsi/megaraid/* +drivers/scsi/mpt3sas/* +drivers/scsi/osd/libosd.ko +drivers/scsi/osd/osd.ko +drivers/scsi/qla1280.ko +drivers/scsi/qla2xxx/* +drivers/scsi/raid_class.ko +drivers/scsi/scsi_transport_fc.ko +drivers/scsi/scsi_transport_iscsi.ko +drivers/scsi/scsi_transport_sas.ko +drivers/scsi/scsi_transport_spi.ko +drivers/scsi/sd_mod.ko +drivers/scsi/sr_mod.ko +drivers/scsi/virtio_scsi.ko +drivers/scsi/vmw_pvscsi.ko +drivers/spi/spi-tegra114.ko +drivers/staging/media/tegra-video/tegra-video.ko +drivers/target/loopback/tcm_loop.ko +drivers/target/target_core*.ko +drivers/thermal/tegra/tegra-bpmp-thermal.ko +drivers/tty/serial/jsm/* +drivers/tty/serial/serial-tegra.ko +drivers/uio/uio.ko +drivers/uio/uio_pdrv_genirq.ko +drivers/usb/gadget/udc/tegra-xudc.ko +drivers/usb/host/* +drivers/usb/storage/uas.ko +drivers/usb/storage/usb-storage.ko +drivers/vfio/* +drivers/vhost/* +drivers/video/fbdev/* +drivers/video/vgastate.ko +drivers/virt/vboxguest/vboxguest.ko +drivers/virtio/* +drivers/watchdog/softdog.ko +drivers/xen/* +fs/9p/* +fs/aufs/aufs.ko +fs/autofs/autofs4.ko +fs/binfmt_misc.ko 
+fs/btrfs/* +fs/cachefiles/cachefiles.ko +fs/ceph/* +fs/smb/* +fs/configfs/* +fs/dlm/dlm.ko +fs/ecryptfs/* +fs/efivarfs/* +fs/exofs/libore.ko +fs/ext4/* +fs/fat/* +fs/fscache/* +fs/fuse/* +fs/isofs/* +fs/lockd/* +fs/nfs/* +fs/nfs_common/* +fs/nfsd/* +fs/nls/nls_cp437.ko +fs/nls/nls_iso8859-1.ko +fs/overlayfs/* +fs/shiftfs.ko +fs/squashfs/* +fs/udf/* +fs/ufs/* +fs/vboxsf/vboxsf.ko +fs/xfs/* +lib/* +net/6lowpan/* +net/802/* +net/8021q/* +net/9p/* +net/appletalk/* +net/atm/* +net/ax25/* +net/bpfilter/* +net/bridge/* +net/can/* +net/ceph/libceph.ko +net/core/* +net/dccp/* +net/decnet/* +net/ieee802154/* +net/ipv4/* +net/ipv6/* +net/ipx/* +net/key/* +net/lapb/* +net/llc/* +net/netfilter/* +net/netlink/netlink_diag.ko +net/netrom/* +net/openvswitch/* +net/packet/af_packet_diag.ko +net/phonet/* +net/rose/* +net/rxrpc/* +net/sched/* +net/sctp/* +net/sunrpc/auth_gss/auth_rpcgss.ko +net/sunrpc/auth_gss/rpcsec_gss_krb5.ko +net/sunrpc/sunrpc.ko +net/tipc/* +net/unix/unix_diag.ko +net/vmw_vsock/* +net/x25/* +net/xfrm/* +! find sound/core -name oss -prune -o -name *.ko -print +sound/drivers/pcsp/snd-pcsp.ko +sound/pci/hda/snd-hda-tegra.ko +sound/pci/snd-ens1370.ko +sound/soc/tegra/snd-soc-tegra186-dspk.ko +sound/soc/tegra/snd-soc-tegra210-admaif.ko +sound/soc/tegra/snd-soc-tegra210-ahub.ko +sound/soc/tegra/snd-soc-tegra210-dmic.ko +sound/soc/tegra/snd-soc-tegra210-i2s.ko +sound/soundcore.ko +ubuntu/ubuntu-host/ubuntu-host.ko diff --git a/debian.nvidia-adv/control.d/nvidia-adv.inclusion-list b/debian.nvidia-adv/control.d/nvidia-adv.inclusion-list new file mode 100644 index 0000000000000..31b1d207b3fa5 --- /dev/null +++ b/debian.nvidia-adv/control.d/nvidia-adv.inclusion-list @@ -0,0 +1,304 @@ +arch/*/{crypto,kernel,oprofile} +arch/*/kvm/kvm.ko +arch/powerpc/kvm/kvm-hv.ko +arch/powerpc/kvm/kvm-pr.ko +arch/powerpc/kvm/vfio.ko +arch/powerpc/platforms/powernv/opal-prd.ko +arch/s390/* +arch/x86/kvm/kvm-amd.ko +arch/x86/kvm/kvm-intel.ko +crypto/* +drivers/acpi/* +drivers/ata/acard-ahci.ko +drivers/ata/ahci.ko +drivers/ata/ahci_platform.ko +drivers/ata/ahci_tegra.ko +drivers/ata/ata_generic.ko +drivers/ata/libahci.ko +drivers/ata/libahci_platform.ko +drivers/block/brd.ko +drivers/block/cryptoloop.ko +drivers/block/floppy.ko +drivers/block/loop.ko +drivers/block/nbd.ko +drivers/block/rbd.ko +drivers/block/virtio_blk.ko +drivers/block/xen-blkfront.ko +drivers/bus/tegra-aconnect.ko +drivers/char/hangcheck-timer.ko +drivers/char/hw_random/powernv-rng.ko +drivers/char/hw_random/virtio-rng.ko +drivers/char/ipmi/* +drivers/char/ipmi/ipmi_msghandler.ko +drivers/char/lp.ko +drivers/char/nvram.ko +drivers/char/ppdev.ko +drivers/char/raw.ko +drivers/char/virtio_console.ko +drivers/clk/clk-max77686.ko +drivers/cpufreq/tegra186-cpufreq.ko +drivers/cpufreq/tegra194-cpufreq.ko +drivers/crypto/nx/* +drivers/crypto/vmx/vmx-crypto.ko +drivers/dma/tegra210-adma.ko +drivers/firmware/efi/* +drivers/firmware/iscsi_ibft.ko +drivers/gpio/gpio-max77620.ko +drivers/gpu/drm/ast/ast.ko +drivers/gpu/drm/bochs/bochs-drm.ko +drivers/gpu/drm/cirrus/cirrus.ko +drivers/gpu/drm/drm.ko +drivers/gpu/drm/drm_kms_helper.ko +drivers/gpu/drm/tegra/tegra-drm.ko +drivers/gpu/drm/ttm/ttm.ko +drivers/gpu/drm/vboxvideo/vboxvideo.ko +drivers/gpu/drm/virtio/virtio-gpu.ko +drivers/gpu/drm/vmwgfx/vmwgfx.ko +drivers/gpu/drm/xen/drm_xen_front.ko +drivers/gpu/host1x/host1x.ko +drivers/hid/hid-generic.ko +drivers/hid/hid-hyperv.ko +drivers/hid/hid.ko +drivers/hid/usbhid/usbhid.ko +drivers/hv/* +drivers/hwmon/ibmpowernv.ko +drivers/hwmon/pwm-fan.ko 
+drivers/hwtracing/coresight/* +drivers/i2c/busses/i2c-tegra-bpmp.ko +drivers/i2c/busses/i2c-tegra-bpmp.ko +drivers/i2c/busses/i2c-tegra.ko +drivers/infiniband/core/ib_addr.ko +drivers/infiniband/core/ib_cm.ko +drivers/infiniband/core/ib_core.ko +drivers/infiniband/core/ib_mad.ko +drivers/infiniband/core/ib_sa.ko +drivers/infiniband/core/iw_cm.ko +drivers/infiniband/core/rdma_cm.ko +drivers/infiniband/ulp/iser/ib_iser.ko +drivers/infiniband/ulp/isert/ib_isert.ko +drivers/input/evbug.ko +drivers/input/gameport/gameport.ko +drivers/input/input-leds.ko +drivers/input/joydev.ko +drivers/input/keyboard/gpio_keys.ko +drivers/input/misc/xen-kbdfront.ko +drivers/input/mouse/psmouse.ko +drivers/input/serio/hyperv-keyboard.ko +drivers/input/serio/serio_raw.ko +drivers/input/serio/serport.ko +drivers/input/touchscreen/usbtouchscreen.ko +drivers/leds/leds-powernv.ko +drivers/md/* +drivers/memory/tegra/tegra210-emc.ko +drivers/message/fusion* +drivers/misc/cxl/* +drivers/misc/eeprom/at24.ko +drivers/misc/vmw_balloon.ko +drivers/misc/vmw_vmci/vmw_vmci.ko +drivers/mmc/host/sdhci-tegra.ko +drivers/mtd/cmdlinepart.ko +drivers/mtd/devices/powernv_flash.ko +drivers/mtd/ofpart.ko +drivers/net/appletalk/ipddp.ko +drivers/net/bonding/bonding.ko +drivers/net/caif/caif_virtio.ko +drivers/net/dummy.ko +drivers/net/eql.ko +drivers/net/ethernet/8390/8390.ko +drivers/net/ethernet/8390/ne2k-pci.ko +drivers/net/ethernet/amazon/ena/ena.ko +drivers/net/ethernet/amd/pcnet32.ko +drivers/net/ethernet/broadcom/bnx2x/* +drivers/net/ethernet/broadcom/tg3.ko +drivers/net/ethernet/dec/tulip/* +drivers/net/ethernet/emulex/benet/* +drivers/net/ethernet/ibm/* +drivers/net/ethernet/intel/e1000/e1000.ko +drivers/net/ethernet/intel/e1000e/e1000e.ko +drivers/net/ethernet/intel/i40e/* +drivers/net/ethernet/intel/iavf/iavf.ko +drivers/net/ethernet/intel/igb/* +drivers/net/ethernet/intel/igbvf/igbvf.ko +drivers/net/ethernet/intel/ixgbe/* +drivers/net/ethernet/intel/ixgbevf/ixgbevf.ko +drivers/net/ethernet/mellanox/* +drivers/net/ethernet/netronome/nfp/nfp.ko +drivers/net/ethernet/realtek/8139cp.ko +drivers/net/ethernet/realtek/8139too.ko +drivers/net/ethernet/stmicro/stmmac/dwmac-dwc-qos-eth.ko +drivers/net/ethernet/stmicro/stmmac/stmmac-platform.ko +drivers/net/ethernet/stmicro/stmmac/stmmac.ko +drivers/net/fddi/* +drivers/net/geneve.ko +drivers/net/hyperv/hv_netvsc.ko +drivers/net/ifb.ko +drivers/net/ipvlan/* +drivers/net/macvlan.ko +drivers/net/macvtap.ko +drivers/net/mii.ko +drivers/net/netconsole.ko +drivers/net/pcs/pcs-xpcs.ko +drivers/net/phy/marvell.ko +drivers/net/phy/phylink.ko +drivers/net/ppp/* +drivers/net/ppp/bsd_comp.ko +drivers/net/slip/* +drivers/net/veth.ko +drivers/net/virtio_net.ko +drivers/net/vmxnet3/vmxnet3.ko +drivers/net/vxlan.ko +drivers/net/wireguard/wireguard.ko +drivers/net/xen-netback/* +drivers/net/xen-netfront.ko +drivers/nvme/host/nvme.ko +drivers/nvmem/nvmem_core.ko +drivers/parport/parport.ko +drivers/parport/parport_pc.ko +drivers/pci/controller/dwc/pcie-tegra194.ko +drivers/pci/host/vmd.ko +drivers/phy/tegra/phy-tegra194-p2u.ko +drivers/pinctrl/pinctrl-max77620.ko +drivers/platform/x86/pvpanic.ko +drivers/pps/pps_core.ko +drivers/ptp/ptp.ko +drivers/pwm/pwm-tegra.ko +drivers/regulator/fixed.ko +drivers/regulator/max77620-regulator.ko +drivers/rtc/rtc-max77686.ko +drivers/rtc/rtc-tegra.ko +drivers/s390/* +drivers/s390/block/xpram.ko +drivers/scsi/BusLogic.ko +drivers/scsi/aacraid/* +drivers/scsi/cxlflash/* +drivers/scsi/device_handler/scsi_dh_alua.ko +drivers/scsi/device_handler/scsi_dh_emc.ko 
+drivers/scsi/device_handler/scsi_dh_hp_sw.ko +drivers/scsi/device_handler/scsi_dh_rdac.ko +drivers/scsi/hv_storvsc.ko +drivers/scsi/ibmvscsi/* +drivers/scsi/ipr.ko +drivers/scsi/iscsi_boot_sysfs.ko +drivers/scsi/iscsi_tcp.ko +drivers/scsi/libiscsi.ko +drivers/scsi/libiscsi_tcp.ko +drivers/scsi/libsas/* +drivers/scsi/lpfc/* +drivers/scsi/megaraid/* +drivers/scsi/mpt3sas/* +drivers/scsi/osd/libosd.ko +drivers/scsi/osd/osd.ko +drivers/scsi/qla1280.ko +drivers/scsi/qla2xxx/* +drivers/scsi/raid_class.ko +drivers/scsi/scsi_transport_fc.ko +drivers/scsi/scsi_transport_iscsi.ko +drivers/scsi/scsi_transport_sas.ko +drivers/scsi/scsi_transport_spi.ko +drivers/scsi/sd_mod.ko +drivers/scsi/sr_mod.ko +drivers/scsi/virtio_scsi.ko +drivers/scsi/vmw_pvscsi.ko +drivers/spi/spi-tegra114.ko +drivers/staging/media/tegra-video/tegra-video.ko +drivers/target/loopback/tcm_loop.ko +drivers/target/target_core*.ko +drivers/thermal/tegra/tegra-bpmp-thermal.ko +drivers/tty/serial/jsm/* +drivers/tty/serial/serial-tegra.ko +drivers/uio/uio.ko +drivers/uio/uio_pdrv_genirq.ko +drivers/usb/gadget/udc/tegra-xudc.ko +drivers/usb/host/* +drivers/usb/storage/uas.ko +drivers/usb/storage/usb-storage.ko +drivers/vfio/* +drivers/vhost/* +drivers/video/fbdev/* +drivers/video/vgastate.ko +drivers/virt/vboxguest/vboxguest.ko +drivers/virtio/* +drivers/watchdog/softdog.ko +drivers/xen/* +fs/9p/* +fs/aufs/aufs.ko +fs/autofs/autofs4.ko +fs/binfmt_misc.ko +fs/btrfs/* +fs/cachefiles/cachefiles.ko +fs/ceph/* +fs/smb/* +fs/configfs/* +fs/dlm/dlm.ko +fs/ecryptfs/* +fs/efivarfs/* +fs/exofs/libore.ko +fs/ext4/* +fs/fat/* +fs/fscache/* +fs/fuse/* +fs/isofs/* +fs/lockd/* +fs/nfs/* +fs/nfs_common/* +fs/nfsd/* +fs/nls/nls_cp437.ko +fs/nls/nls_iso8859-1.ko +fs/overlayfs/* +fs/shiftfs.ko +fs/squashfs/* +fs/udf/* +fs/ufs/* +fs/vboxsf/vboxsf.ko +fs/xfs/* +lib/* +net/6lowpan/* +net/802/* +net/8021q/* +net/9p/* +net/appletalk/* +net/atm/* +net/ax25/* +net/bpfilter/* +net/bridge/* +net/can/* +net/ceph/libceph.ko +net/core/* +net/dccp/* +net/decnet/* +net/ieee802154/* +net/ipv4/* +net/ipv6/* +net/ipx/* +net/key/* +net/lapb/* +net/llc/* +net/netfilter/* +net/netlink/netlink_diag.ko +net/netrom/* +net/openvswitch/* +net/packet/af_packet_diag.ko +net/phonet/* +net/rose/* +net/rxrpc/* +net/sched/* +net/sctp/* +net/sunrpc/auth_gss/auth_rpcgss.ko +net/sunrpc/auth_gss/rpcsec_gss_krb5.ko +net/sunrpc/sunrpc.ko +net/tipc/* +net/unix/unix_diag.ko +net/vmw_vsock/* +net/x25/* +net/xfrm/* +! find sound/core -name oss -prune -o -name *.ko -print +sound/drivers/pcsp/snd-pcsp.ko +sound/pci/hda/snd-hda-tegra.ko +sound/pci/snd-ens1370.ko +sound/soc/tegra/snd-soc-tegra186-dspk.ko +sound/soc/tegra/snd-soc-tegra210-admaif.ko +sound/soc/tegra/snd-soc-tegra210-ahub.ko +sound/soc/tegra/snd-soc-tegra210-dmic.ko +sound/soc/tegra/snd-soc-tegra210-i2s.ko +sound/soundcore.ko +ubuntu/ubuntu-host/ubuntu-host.ko diff --git a/debian.nvidia-adv/control.d/vars.nvidia-adv b/debian.nvidia-adv/control.d/vars.nvidia-adv new file mode 100644 index 0000000000000..296aa654e8855 --- /dev/null +++ b/debian.nvidia-adv/control.d/vars.nvidia-adv @@ -0,0 +1,6 @@ +arch="amd64 arm64" +supported="Nvidia-Adv" +target="Geared toward desktop and server systems." 
+desc="=HUMAN= SMP" +bootloader="grub-pc [amd64] | grub-efi-amd64 [amd64] | grub-efi-ia32 [amd64] | grub [amd64] | lilo [amd64] | flash-kernel [arm64] | grub-efi-arm64 [arm64]" +provides="kvm-api-4, redhat-cluster-modules, ivtv-modules, virtualbox-guest-modules [amd64]" diff --git a/debian.nvidia-adv/control.d/vars.nvidia-adv-64k b/debian.nvidia-adv/control.d/vars.nvidia-adv-64k new file mode 100644 index 0000000000000..7833201bc0d7b --- /dev/null +++ b/debian.nvidia-adv/control.d/vars.nvidia-adv-64k @@ -0,0 +1,6 @@ +arch="arm64" +supported="Nvidia-Adv 64K pages" +target="Geared toward desktop and server systems." +desc="=HUMAN= SMP" +bootloader="grub-efi-arm64 [arm64] | flash-kernel [arm64]" +provides="kvm-api-4, redhat-cluster-modules, ivtv-modules" diff --git a/debian.nvidia-adv/control.stub.in b/debian.nvidia-adv/control.stub.in new file mode 100644 index 0000000000000..57a5f1b2a7c92 --- /dev/null +++ b/debian.nvidia-adv/control.stub.in @@ -0,0 +1,97 @@ +Source: SRCPKGNAME +Section: devel +Priority: optional +Maintainer: Ubuntu Kernel Team +Rules-Requires-Root: no +Standards-Version: 3.9.4.0 +Build-Depends: + debhelper-compat (= 10), + cpio, + kmod , + makedumpfile [amd64] , + libcap-dev , + libelf-dev , + libnewt-dev , + libiberty-dev , + default-jdk-headless , + java-common , + rsync [!i386] , + libdw-dev , + libpci-dev , + pkg-config , + python3 , + python3-dev , + python3-setuptools , + flex , + bison , + libunwind8-dev [amd64 arm64 armhf ppc64el] , + liblzma-dev , + openssl , + libssl-dev , + libaudit-dev , + bc , + gawk , + libudev-dev , + autoconf , + automake , + libtool , + uuid-dev , + libnuma-dev [amd64 arm64 ppc64el s390x] , + libtraceevent-dev [amd64 arm64 armhf ppc64el s390x riscv64] , + libtracefs-dev [amd64 arm64 armhf ppc64el s390x riscv64] , + dkms , + curl , + zstd , + pahole [amd64 arm64 armhf ppc64el s390x riscv64] | dwarves (>= 1.21) [amd64 arm64 armhf ppc64el s390x riscv64] , + clang-18 [amd64 arm64 armhf ppc64el riscv64 s390x], + rustc [amd64 arm64 armhf ppc64el riscv64 s390x], + rust-src [amd64 arm64 armhf ppc64el riscv64 s390x], + rustfmt [amd64 arm64 armhf ppc64el riscv64 s390x], + bindgen-0.65 [amd64 arm64 armhf ppc64el riscv64 s390x], + libstdc++-dev, +Build-Depends-Indep: + xmlto , + bzip2 , + sharutils , + asciidoc , + python3-docutils , +Vcs-Git: git://git.launchpad.net/~nvidia-kernel/+git/noble-linux-nvidia-adv +XS-Testsuite: autopkgtest +#XS-Testsuite-Depends: gcc-4.7 binutils + +Package: SRCPKGNAME-headers-PKGVER-ABINUM +Build-Profiles: +Architecture: all +Multi-Arch: foreign +Section: devel +Priority: optional +Depends: ${misc:Depends}, coreutils +Description: Header files related to Linux kernel version PKGVER + This package provides kernel header files for version PKGVER, for sites + that want the latest kernel headers. Please read + /usr/share/doc/SRCPKGNAME-headers-PKGVER-ABINUM/debian.README.gz for details + +Package: SRCPKGNAME-tools-PKGVER-ABINUM +Build-Profiles: +Architecture: amd64 armhf arm64 ppc64el s390x +Section: devel +Priority: optional +Depends: ${misc:Depends}, ${shlibs:Depends}, linux-tools-common +Description: Linux kernel version specific tools for version PKGVER-ABINUM + This package provides the architecture dependant parts for kernel + version locked tools (such as perf and x86_energy_perf_policy) for + version PKGVER-ABINUM on + =HUMAN=. + You probably want to install linux-tools-PKGVER-ABINUM-. 
+ +Package: SRCPKGNAME-cloud-tools-PKGVER-ABINUM +Build-Profiles: +Architecture: amd64 armhf +Section: devel +Priority: optional +Depends: ${misc:Depends}, ${shlibs:Depends}, linux-cloud-tools-common +Description: Linux kernel version specific cloud tools for version PKGVER-ABINUM + This package provides the architecture dependant parts for kernel + version locked tools for cloud tools for version PKGVER-ABINUM on + =HUMAN=. + You probably want to install linux-cloud-tools-PKGVER-ABINUM-. diff --git a/debian.nvidia-adv/copyright b/debian.nvidia-adv/copyright new file mode 100644 index 0000000000000..d1d04a6d66974 --- /dev/null +++ b/debian.nvidia-adv/copyright @@ -0,0 +1,29 @@ +This is the Ubuntu prepackaged version of the Linux kernel. +Linux was written by Linus Torvalds +and others. + +This package was put together by the Ubuntu Kernel Team, from +sources retrieved from upstream linux git. +The sources may be found at most Linux ftp sites, including +ftp://ftp.kernel.org/pub/linux/kernel/ + +This package is currently maintained by the +Ubuntu Kernel Team + +Linux is copyrighted by Linus Torvalds and others. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 dated June, 1991. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +On Ubuntu Linux systems, the complete text of the GNU General +Public License v2 can be found in `/usr/share/common-licenses/GPL-2'. diff --git a/debian.nvidia-adv/dkms-versions b/debian.nvidia-adv/dkms-versions new file mode 100644 index 0000000000000..c8180eb4882b7 --- /dev/null +++ b/debian.nvidia-adv/dkms-versions @@ -0,0 +1,3 @@ +zfs-linux 2.2.2-0ubuntu9 modulename=zfs debpath=pool/universe/z/%package%/zfs-dkms_%version%_all.deb arch=amd64 arch=arm64 arch=ppc64el arch=s390x rprovides=spl-modules rprovides=spl-dkms rprovides=zfs-modules rprovides=zfs-dkms +backport-iwlwifi-dkms 11510-0ubuntu1 modulename=iwlwifi debpath=pool/universe/b/%package%/backport-iwlwifi-dkms_%version%_all.deb arch=amd64 rprovides=iwlwifi-modules rprovides=backport-iwlwifi-dkms type=standalone +v4l2loopback 0.12.7-2ubuntu5 modulename=v4l2loopback debpath=pool/universe/v/%package%/v4l2loopback-dkms_%version%_all.deb arch=amd64 rprovides=v4l2loopback-modules rprovides=v4l2loopback-dkms diff --git a/debian.nvidia-adv/etc/update.conf b/debian.nvidia-adv/etc/update.conf new file mode 100644 index 0000000000000..7bdb111739342 --- /dev/null +++ b/debian.nvidia-adv/etc/update.conf @@ -0,0 +1,7 @@ +# WARNING: we do not create update.conf when we are not a +# derivative. Various cranky components make use of this. +# If we start unconditionally creating update.conf we need +# to fix at least cranky close and cranky rebase. 
+RELEASE_REPO=git://git.launchpad.net/~ubuntu-kernel/ubuntu/+source/linux/+git/noble +SOURCE_RELEASE_BRANCH=master-next +DEBIAN_MASTER=debian.master diff --git a/debian.nvidia-adv/modprobe.d/common.conf b/debian.nvidia-adv/modprobe.d/common.conf new file mode 100644 index 0000000000000..e0fbbd6e060d4 --- /dev/null +++ b/debian.nvidia-adv/modprobe.d/common.conf @@ -0,0 +1,3 @@ +# LP:1434842 -- disable OSS drivers by default to allow pulseaudio to emulate +blacklist snd-mixer-oss +blacklist snd-pcm-oss diff --git a/debian.nvidia-adv/reconstruct b/debian.nvidia-adv/reconstruct new file mode 100644 index 0000000000000..3e552dabffb20 --- /dev/null +++ b/debian.nvidia-adv/reconstruct @@ -0,0 +1,49 @@ +# Recreate any symlinks created since the orig. +chmod +x 'arch/mips/pci/pcie-octeon.c' +chmod +x 'debian/cloud-tools/hv_get_dhcp_info' +chmod +x 'debian/cloud-tools/hv_get_dns_info' +chmod +x 'debian/cloud-tools/hv_set_ifconfig' +chmod +x 'debian/rules' +chmod +x 'debian/scripts/checks/final-checks' +chmod +x 'debian/scripts/checks/module-signature-check' +chmod +x 'debian/scripts/control-create' +chmod +x 'debian/scripts/dkms-build' +chmod +x 'debian/scripts/dkms-build--nvidia-N' +chmod +x 'debian/scripts/dkms-build-configure--zfs' +chmod +x 'debian/scripts/file-downloader' +chmod +x 'debian/scripts/link-headers' +chmod +x 'debian/scripts/link-lib-rust' +chmod +x 'debian/scripts/misc/annotations' +chmod +x 'debian/scripts/misc/find-missing-sauce.sh' +chmod +x 'debian/scripts/misc/gen-auto-reconstruct' +chmod +x 'debian/scripts/misc/git-ubuntu-log' +chmod +x 'debian/scripts/misc/insert-changes' +chmod +x 'debian/scripts/misc/insert-ubuntu-changes' +chmod +x 'debian/scripts/misc/kernelconfig' +chmod +x 'debian/scripts/module-inclusion' +chmod +x 'debian/scripts/sign-module' +chmod +x 'debian/templates/extra.postinst.in' +chmod +x 'debian/templates/extra.postrm.in' +chmod +x 'debian/templates/headers.postinst.in' +chmod +x 'debian/templates/image.postinst.in' +chmod +x 'debian/templates/image.postrm.in' +chmod +x 'debian/templates/image.preinst.in' +chmod +x 'debian/templates/image.prerm.in' +chmod +x 'debian/tests-build/check-aliases' +chmod +x 'debian/tests/rebuild' +chmod +x 'debian/tests/ubuntu-regression-suite' +chmod +x 'drivers/watchdog/f71808e_wdt.c' +# Remove any files deleted from the orig. 
+rm -f 'arch/arm/kernel/pj4-cp0.c'
+rm -f 'arch/arm64/boot/dts/qcom/pm2250.dtsi'
+rm -f 'arch/loongarch/include/asm/qspinlock.h'
+rm -f 'arch/sparc/lib/cmpdi2.c'
+rm -f 'arch/sparc/lib/ucmpdi2.c'
+rm -f 'drivers/gpu/drm/gma500/psb_lid.c'
+rm -f 'include/linux/amd-pstate.h'
+rm -f 'include/linux/iio/adc/adi-axi-adc.h'
+rm -f 'net/bluetooth/a2mp.c'
+rm -f 'net/bluetooth/a2mp.h'
+rm -f 'net/bluetooth/amp.c'
+rm -f 'net/bluetooth/amp.h'
+exit 0
diff --git a/debian.nvidia-adv/rules.d/amd64.mk b/debian.nvidia-adv/rules.d/amd64.mk
new file mode 100644
index 0000000000000..d01da77ae747a
--- /dev/null
+++ b/debian.nvidia-adv/rules.d/amd64.mk
@@ -0,0 +1,20 @@
+human_arch = 64 bit x86
+build_arch = x86
+defconfig = defconfig
+flavours = nvidia-adv
+build_image = bzImage
+kernel_file = arch/$(build_arch)/boot/bzImage
+install_file = vmlinuz
+no_dumpfile = true
+
+vdso = vdso_install
+
+do_extras_package = true
+do_tools_usbip = true
+do_tools_cpupower = true
+do_tools_perf = true
+do_tools_perf_jvmti = true
+do_tools_perf_python = true
+do_tools_bpftool = true
+do_tools_rtla = true
+do_lib_rust = false
diff --git a/debian.nvidia-adv/rules.d/arm64.mk b/debian.nvidia-adv/rules.d/arm64.mk
new file mode 100644
index 0000000000000..d1529d9b93eef
--- /dev/null
+++ b/debian.nvidia-adv/rules.d/arm64.mk
@@ -0,0 +1,21 @@
+human_arch = ARMv8
+build_arch = arm64
+defconfig = defconfig
+flavours = nvidia-adv nvidia-adv-64k
+build_image = Image.gz
+kernel_file = arch/$(build_arch)/boot/Image.gz
+install_file = vmlinuz
+no_dumpfile = true
+
+vdso = vdso_install
+
+do_extras_package = true
+do_tools_usbip = true
+do_tools_cpupower = true
+do_tools_perf = true
+do_tools_perf_jvmti = true
+do_tools_perf_python = true
+do_tools_bpftool = true
+do_tools_rtla = true
+
+do_dtbs = true
diff --git a/debian/debian.env b/debian/debian.env
index be31a0c270197..c67ac54c94399 100644
--- a/debian/debian.env
+++ b/debian/debian.env
@@ -1 +1 @@
-DEBIAN=debian.master
+DEBIAN=debian.nvidia-adv

From bdc7d2345121289a63f08abf458d01589f437751 Mon Sep 17 00:00:00 2001
From: Jacob Martin
Date: Thu, 26 Sep 2024 14:21:04 -0500
Subject: [PATCH 002/352] UBUNTU: Start new release

Ignore: yes
Signed-off-by: Jacob Martin
---
 debian.nvidia-adv/changelog | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/debian.nvidia-adv/changelog b/debian.nvidia-adv/changelog
index da61fe688060c..5ef1ceec49d37 100644
--- a/debian.nvidia-adv/changelog
+++ b/debian.nvidia-adv/changelog
@@ -1,3 +1,11 @@
+linux-nvidia-adv (6.8.0-1001.1) UNRELEASED; urgency=medium
+
+  CHANGELOG: Do not edit directly. Autogenerated at release.
+  CHANGELOG: Use the printchanges target to see the current changes.
+  CHANGELOG: Use the insertchanges target to create the final log.
+
+ -- Jacob Martin  Thu, 26 Sep 2024 14:21:04 -0500
+
 linux-nvidia-adv (6.8.0-1000.0) noble; urgency=medium
 
   * Initialize n/linux-nvidia-adv.

From b9d06c171bf464eddeebbc3e18310ec7f4a6b91d Mon Sep 17 00:00:00 2001
From: Jacob Martin
Date: Fri, 27 Sep 2024 12:22:44 -0500
Subject: [PATCH 003/352] UBUNTU: Ubuntu-nvidia-adv-6.8.0-1001.1

Signed-off-by: Jacob Martin
---
 debian.nvidia-adv/changelog | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/debian.nvidia-adv/changelog b/debian.nvidia-adv/changelog
index 5ef1ceec49d37..5633195e75035 100644
--- a/debian.nvidia-adv/changelog
+++ b/debian.nvidia-adv/changelog
@@ -1,10 +1,29 @@
-linux-nvidia-adv (6.8.0-1001.1) UNRELEASED; urgency=medium
+linux-nvidia-adv (6.8.0-1001.1) noble; urgency=medium
 
-  CHANGELOG: Do not edit directly. Autogenerated at release.
-  CHANGELOG: Use the printchanges target to see the current changes.
-  CHANGELOG: Use the insertchanges target to create the final log.
+  [ Ubuntu: 6.8.0-45.45 ]
 
- -- Jacob Martin  Thu, 26 Sep 2024 14:21:04 -0500
+  * noble/linux: 6.8.0-45.45 -proposed tracker (LP: #2078100)
+  * Packaging resync (LP: #1786013)
+    - [Packaging] debian.master/dkms-versions -- update from kernel-versions
+      (main/2024.08.05)
+  * Noble update: upstream stable patchset 2024-08-09 (LP: #2076435) //
+    CVE-2024-41009
+    - bpf: Fix overrunning reservations in ringbuf
+  * CVE-2024-42160
+    - f2fs: check validation of fault attrs in f2fs_build_fault_attr()
+    - f2fs: Add inline to f2fs_build_fault_attr() stub
+  * Noble update: upstream stable patchset 2024-08-22 (LP: #2077600) //
+    CVE-2024-42224
+    - net: dsa: mv88e6xxx: Correct check for empty list
+  * Noble update: upstream stable patchset 2024-08-22 (LP: #2077600) //
+    CVE-2024-42154
+    - tcp_metrics: validate source addr length
+  * CVE-2024-42228
+    - drm/amdgpu: Using uninitialized value *size when calling amdgpu_vce_cs_reloc
+  * CVE-2024-42159
+    - scsi: mpi3mr: Sanitise num_phys
+
+ -- Jacob Martin  Fri, 27 Sep 2024 12:22:43 -0500
 
 linux-nvidia-adv (6.8.0-1000.0) noble; urgency=medium
 

From 867b82a8d6e0bd576a1a4f1f8fa051a74f7080d6 Mon Sep 17 00:00:00 2001
From: Brad Figg
Date: Fri, 5 Jan 2024 08:18:39 -0800
Subject: [PATCH 004/352] UBUNTU: [Packaging] dkms-versions standalone provides support

Add support for exposing rprovides data for standalone modules too.
Switch to exposing provides as a shared debian/substvars file and use
that in the templates.

Ignore: yes
Signed-off-by: Brad Figg
Signed-off-by: Ian May
---
 debian/control.d/flavour-module.stub | 1 +
 debian/rules                         | 4 ++++
 debian/rules.d/2-binary-arch.mk      | 3 +--
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/debian/control.d/flavour-module.stub b/debian/control.d/flavour-module.stub
index 2810f83bb361f..4aa9ddbe76b95 100644
--- a/debian/control.d/flavour-module.stub
+++ b/debian/control.d/flavour-module.stub
@@ -4,6 +4,7 @@ Build-Profiles:
 Architecture: ARCH
 Section: kernel
 Priority: optional
+Provides: ${MODULE:rprovides}
 Depends:
   ${misc:Depends},
   linux-image-PKGVER-ABINUM-FLAVOUR | linux-image-unsigned-PKGVER-ABINUM-FLAVOUR,
diff --git a/debian/rules b/debian/rules
index 43eae8d5aaa8a..90a2cba7f5ade 100755
--- a/debian/rules
+++ b/debian/rules
@@ -153,6 +153,10 @@ clean: debian/control debian/canonical-certs.pem debian/canonical-revoked-certs.
rm -f $(DROOT)/control.stub $(DEBIAN)/control.stub rm -f $(DROOT)/scripts/fix-filenames + # SUBSTVARS: rprovides for all DKMS packages + echo "linux:rprovides=$(foreach dkms,$(all_built-in_dkms_modules),$(foreach provides,$(dkms_$(dkms)_rprovides),$(provides)$(comma)))" >"debian/substvars" + echo "$(foreach dkms,$(all_standalone_dkms_modules),$(dkms):rprovides=$(foreach provides,$(dkms_$(dkms)_rprovides),$(provides)$(comma))=NL=)" | sed -e "s/=NL= */\n/g" >>"debian/substvars" + .PHONY: distclean distclean: clean rm -rf $(DROOT)/control debian/changelog \ diff --git a/debian/rules.d/2-binary-arch.mk b/debian/rules.d/2-binary-arch.mk index 89d1eb769ed90..cb9c2f3921351 100644 --- a/debian/rules.d/2-binary-arch.mk +++ b/debian/rules.d/2-binary-arch.mk @@ -524,7 +524,7 @@ define dh_all dh_shlibdeps -p$(1) $(shlibdeps_opts) dh_installdeb -p$(1) dh_installdebconf -p$(1) - $(lockme) dh_gencontrol -p$(1) -- -Vlinux:rprovides='$(rprovides)' $(2) + $(lockme) dh_gencontrol -p$(1) -- -Tdebian/substvars $(2) dh_md5sums -p$(1) dh_builddeb -p$(1) endef @@ -558,7 +558,6 @@ binary-%: pkgcloud = $(cloud_flavour_pkg_name)-$* $(foreach _m,$(all_dkms_modules), \ $(eval binary-%: enable_$(_m) = $$(filter true,$$(call custom_override,do_$(_m),$$*))) \ ) -binary-%: rprovides = $(foreach _m,$(all_built-in_dkms_modules),$(if $(enable_$(_m)),$(foreach _r,$(dkms_$(_m)_rprovides),$(_r)$(comma) ))) binary-%: target_flavour = $* binary-%: checks-% @echo Debug: $@ From 7162ac03683bd08ad64a0ec0dec6f85fcd0ce25f Mon Sep 17 00:00:00 2001 From: Ian May Date: Thu, 21 Mar 2024 17:12:26 -0500 Subject: [PATCH 005/352] UBUNTU: [Packaging] add versioning to dkms standalone rprovides When nvidia-fs-dkms is available as a dkms package, we want to default to using the signed modules if possible. Adding a version number for the nvidia-fs modules package enables the inbox modules to be selected over an equivalent dkms version. Ignore: yes Signed-off-by: Ian May --- debian/rules | 2 +- debian/rules.d/0-common-vars.mk | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/debian/rules b/debian/rules index 90a2cba7f5ade..c86e90aa6ad88 100755 --- a/debian/rules +++ b/debian/rules @@ -155,7 +155,7 @@ clean: debian/control debian/canonical-certs.pem debian/canonical-revoked-certs. 
 	# SUBSTVARS: rprovides for all DKMS packages
 	echo "linux:rprovides=$(foreach dkms,$(all_built-in_dkms_modules),$(foreach provides,$(dkms_$(dkms)_rprovides),$(provides)$(comma)))" >"debian/substvars"
-	echo "$(foreach dkms,$(all_standalone_dkms_modules),$(dkms):rprovides=$(foreach provides,$(dkms_$(dkms)_rprovides),$(provides)$(comma))=NL=)" | sed -e "s/=NL= */\n/g" >>"debian/substvars"
+	echo "$(foreach dkms,$(all_standalone_dkms_modules),$(dkms):rprovides=$(strip $(foreach provides,$(dkms_$(dkms)_rprovides),$(provides)$(comma)))=NL=)" | sed -e 's/~(/ (/g' -e 's/, (/ (/g' -e 's/=NL= */\n/g' >>"debian/substvars"
 
 .PHONY: distclean
 distclean: clean
diff --git a/debian/rules.d/0-common-vars.mk b/debian/rules.d/0-common-vars.mk
index 5cd38f6f1b6c9..df9e3413b2fb5 100644
--- a/debian/rules.d/0-common-vars.mk
+++ b/debian/rules.d/0-common-vars.mk
@@ -256,7 +256,8 @@ $(foreach _line,$(shell gawk '{ OFS = "!"; $$1 = $$1; print }' $(DEBIAN)/dkms-ve
 		, \
 			$(eval dkms_$(_m)_archs = any) \
 		) \
-		$(eval dkms_$(_m)_rprovides = $(patsubst rprovides=%,%,$(filter rprovides=%,$(_params)))) \
+		$(eval _rprovides_raw = $(filter rprovides=%,$(_params))) \
+		$(eval dkms_$(_m)_rprovides = $(patsubst rprovides=%,%,$(_rprovides_raw))) \
 		$(eval dkms_$(_m)_type = $(word 1,$(patsubst type=%,%,$(filter type=%,$(_params))) built-in)) \
 		$(eval all_$(dkms_$(_m)_type)_dkms_modules += $(_m)) \
 		$(if $(filter standalone,$(dkms_$(_m)_type)), \

From e62360192a58106e0869725b1a61d9ba6af54fee Mon Sep 17 00:00:00 2001
From: Sourab Gupta
Date: Mon, 30 Oct 2023 17:10:13 +0000
Subject: [PATCH 006/352] NVIDIA: SAUCE: Patch NFS driver to support GDS with 6.8 Kernel

BugLink: https://bugs.launchpad.net/bugs/2059814

With this change, the NFS driver is enabled to support GPUDirect
Storage (GDS). The change is around frwr_map and frwr_unmap in the NFS
driver, where the IO request is first intercepted to check for GDS
pages; if it is a GDS page, the request is served by the GDS driver
component, nvidia-fs, otherwise it is served by the standard NFS
driver code.
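
In outline, the mapping side of this interception works as follows (a
condensed sketch of the frwr_map() hunk in this patch; tracing and
pr_debug() calls are omitted):

	/*
	 * rpcrdma_nvfs_map_data() asks nvidia-fs to DMA-map the
	 * scatterlist. It returns -EIO on a mapping failure, sets
	 * is_nvfs_io when the pages are GPU memory, and returns 0 with
	 * is_nvfs_io == false when nvidia-fs is not loaded or requests
	 * the CPU path; in that case the standard mapping is used.
	 */
	dma_nents = rpcrdma_nvfs_map_data(ep->re_id->device->dma_device,
					  mr->mr_sg, i, mr->mr_dir,
					  &is_nvfs_io);
	if (dma_nents == -EIO)
		goto out_dmamap_err;
	if (!is_nvfs_io) {
		dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg,
					  mr->mr_nents, mr->mr_dir);
		if (!dma_nents)
			goto out_dmamap_err;
	}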
Signed-off-by: Sourab Gupta Acked-by: Brad Figg Acked-by: Ian May Signed-off-by: Ian May --- net/sunrpc/xprtrdma/Makefile | 2 + net/sunrpc/xprtrdma/frwr_ops.c | 35 ++++++++- net/sunrpc/xprtrdma/nvfs.h | 113 ++++++++++++++++++++++++++++ net/sunrpc/xprtrdma/nvfs_rpc_rdma.c | 51 +++++++++++++ net/sunrpc/xprtrdma/nvfs_rpc_rdma.h | 65 ++++++++++++++++ 5 files changed, 262 insertions(+), 4 deletions(-) create mode 100644 net/sunrpc/xprtrdma/nvfs.h create mode 100644 net/sunrpc/xprtrdma/nvfs_rpc_rdma.c create mode 100644 net/sunrpc/xprtrdma/nvfs_rpc_rdma.h diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 55b21bae866db..f831c236cdc9b 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -1,8 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 +ccflags-y += -DCONFIG_NVFS obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_sendto.o svc_rdma_recvfrom.o svc_rdma_rw.o \ svc_rdma_pcl.o module.o +rpcrdma-y += nvfs_rpc_rdma.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index ffbf99894970e..72c70237b1366 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -44,6 +44,11 @@ #include "xprt_rdma.h" #include +#ifdef CONFIG_NVFS +#define NVFS_FRWR +#include "nvfs.h" +#include "nvfs_rpc_rdma.h" +#endif static void frwr_cid_init(struct rpcrdma_ep *ep, struct rpcrdma_mr *mr) @@ -58,6 +63,13 @@ static void frwr_mr_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) { if (mr->mr_device) { trace_xprtrdma_mr_unmap(mr); +#ifdef CONFIG_NVFS + if (rpcrdma_nvfs_unmap_data(mr->mr_device->dma_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir)) + pr_debug("rpcrdma_nvfs_unmap_data device %s mr->mr_sg: %p , nents: %d\n", + mr->mr_device->name, mr->mr_sg, mr->mr_nents); + else +#endif ib_dma_unmap_sg(mr->mr_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); mr->mr_device = NULL; @@ -286,6 +298,9 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, int nsegs, bool writing, __be32 xid, struct rpcrdma_mr *mr) { +#ifdef CONFIG_NVFS + bool is_nvfs_io = false; +#endif struct rpcrdma_ep *ep = r_xprt->rx_ep; struct ib_reg_wr *reg_wr; int i, n, dma_nents; @@ -308,11 +323,23 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt, } mr->mr_dir = rpcrdma_data_dir(writing); mr->mr_nents = i; - - dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents, - mr->mr_dir); - if (!dma_nents) +#ifdef CONFIG_NVFS + dma_nents = rpcrdma_nvfs_map_data(ep->re_id->device->dma_device, + mr->mr_sg, i, mr->mr_dir, + &is_nvfs_io); + if (dma_nents == -EIO) { goto out_dmamap_err; + } else if (is_nvfs_io) { + pr_debug("rpcrdma_nvfs_map_data device %s mr->mr_sg: %p , nents: %d\n", + ep->re_id->device->name, mr->mr_sg, mr->mr_nents); + } else +#endif + { + dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents, + mr->mr_dir); + if (!dma_nents) + goto out_dmamap_err; + } mr->mr_device = ep->re_id->device; ibmr = mr->mr_ibmr; diff --git a/net/sunrpc/xprtrdma/nvfs.h b/net/sunrpc/xprtrdma/nvfs.h new file mode 100644 index 0000000000000..f85f0cc2f4b44 --- /dev/null +++ b/net/sunrpc/xprtrdma/nvfs.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef NVFS_H +#define NVFS_H + +#include +#include +#include +#include +#include +#include +#include + +#define REGSTR2(x) x##_register_nvfs_dma_ops +#define REGSTR(x) REGSTR2(x) + +#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops +#define UNREGSTR(x) UNREGSTR2(x) + +#define REGISTER_FUNC REGSTR(MODULE_PREFIX) +#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX) + +#define NVFS_IO_ERR -1 +#define NVFS_CPU_REQ -2 + +#define NVFS_HOLD_TIME_MS 1000 + +extern struct nvfs_dma_rw_ops *nvfs_ops; + +extern atomic_t nvfs_shutdown; + +DECLARE_PER_CPU(long, nvfs_n_ops); + +static inline long nvfs_count_ops(void) +{ + int i; + long sum = 0; + + for_each_possible_cpu(i) + sum += per_cpu(nvfs_n_ops, i); + return sum; +} + +static inline bool nvfs_get_ops(void) +{ + if (nvfs_ops && !atomic_read(&nvfs_shutdown)) { + this_cpu_inc(nvfs_n_ops); + return true; + } + return false; +} + +static inline void nvfs_put_ops(void) +{ + this_cpu_dec(nvfs_n_ops); +} + +struct nvfs_dma_rw_ops { + unsigned long long ft_bmap; // feature bitmap + + int (*nvfs_blk_rq_map_sg)(struct request_queue *q, + struct request *req, + struct scatterlist *sglist); + + int (*nvfs_dma_map_sg_attrs)(struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir, + unsigned long attrs); + + int (*nvfs_dma_unmap_sg)(struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir); + + bool (*nvfs_is_gpu_page)(struct page *page); + + unsigned int (*nvfs_gpu_index)(struct page *page); + + unsigned int (*nvfs_device_priority)(struct device *dev, unsigned int gpu_index); +}; + +// feature list for dma_ops, values indicate bit pos +enum ft_bits { + nvfs_ft_prep_sglist = 1ULL << 0, + nvfs_ft_map_sglist = 1ULL << 1, + nvfs_ft_is_gpu_page = 1ULL << 2, + nvfs_ft_device_priority = 1ULL << 3, +}; + +// check features for use in registration with vendor drivers +#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) ((ops)->ft_bmap & nvfs_ft_prep_sglist) +#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops) ((ops)->ft_bmap & nvfs_ft_map_sglist) +#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops) ((ops)->ft_bmap & nvfs_ft_is_gpu_page) +#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops) ((ops)->ft_bmap & nvfs_ft_device_priority) + +int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops); + +void UNREGISTER_FUNC(void); + +#endif /* NVFS_H */ diff --git a/net/sunrpc/xprtrdma/nvfs_rpc_rdma.c b/net/sunrpc/xprtrdma/nvfs_rpc_rdma.c new file mode 100644 index 0000000000000..8691ec73bca26 --- /dev/null +++ b/net/sunrpc/xprtrdma/nvfs_rpc_rdma.c @@ -0,0 +1,51 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for
+ * more details.
+ */
+
+#ifdef CONFIG_NVFS
+#define MODULE_PREFIX rpcrdma
+#include "nvfs.h"
+
+struct nvfs_dma_rw_ops *nvfs_ops;
+
+atomic_t nvfs_shutdown = ATOMIC_INIT(1);
+
+DEFINE_PER_CPU(long, nvfs_n_ops);
+
+// must have for compatibility
+#define NVIDIA_FS_COMPAT_FT(ops) \
+	((NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops)) && (NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops)))
+
+// protected via nvfs_module_mutex
+int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops)
+{
+	if (NVIDIA_FS_COMPAT_FT(ops)) {
+		nvfs_ops = ops;
+		atomic_set(&nvfs_shutdown, 0);
+		return 0;
+	}
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(REGISTER_FUNC);
+
+// protected via nvfs_module_mutex
+void UNREGISTER_FUNC(void)
+{
+	(void)atomic_cmpxchg(&nvfs_shutdown, 0, 1);
+	do {
+		msleep(NVFS_HOLD_TIME_MS);
+	} while (nvfs_count_ops());
+	nvfs_ops = NULL;
+}
+EXPORT_SYMBOL(UNREGISTER_FUNC);
+#endif
diff --git a/net/sunrpc/xprtrdma/nvfs_rpc_rdma.h b/net/sunrpc/xprtrdma/nvfs_rpc_rdma.h
new file mode 100644
index 0000000000000..971282cf1dc0b
--- /dev/null
+++ b/net/sunrpc/xprtrdma/nvfs_rpc_rdma.h
@@ -0,0 +1,65 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#ifndef NVFS_RPCRDMA_H
+#define NVFS_RPCRDMA_H
+
+#ifdef NVFS_FRWR
+static int rpcrdma_nvfs_map_data(struct device *dev, struct scatterlist *sg,
+				 int nents, enum dma_data_direction dma_dir,
+				 bool *is_nvfs_io)
+{
+	int count;
+
+	*is_nvfs_io = false;
+	count = 0;
+	if (nvfs_get_ops()) {
+		count = nvfs_ops->nvfs_dma_map_sg_attrs(dev,
+				sg,
+				nents,
+				dma_dir,
+				DMA_ATTR_NO_WARN);
+
+		if (unlikely(count == NVFS_IO_ERR)) {
+			nvfs_put_ops();
+			return -EIO;
+		}
+
+		if (unlikely(count == NVFS_CPU_REQ)) {
+			nvfs_put_ops();
+			return 0;
+		}
+		*is_nvfs_io = true;
+	}
+	return count;
+}
+#endif
+
+static bool rpcrdma_nvfs_unmap_data(struct device *dev, struct scatterlist *sg,
+				    int nents, enum dma_data_direction dma_dir)
+{
+	int count;
+
+	if (nvfs_ops != NULL) {
+		count = nvfs_ops->nvfs_dma_unmap_sg(dev, sg, nents,
+				dma_dir);
+		if (count > 0) {
+			nvfs_put_ops();
+			return true;
+		}
+	}
+	return false;
+}
+
+#endif /* NVFS_RPCRDMA_H */

From 8741c51c131f352ca47d4b040e622247119b2843 Mon Sep 17 00:00:00 2001
From: Sourab Gupta
Date: Mon, 30 Oct 2023 16:56:28 +0000
Subject: [PATCH 007/352] NVIDIA: SAUCE: NVMe/NVMeOF: Patch NVMe/NVMeOF driver to support GDS on Linux 6.8 Kernel

BugLink: https://bugs.launchpad.net/bugs/2059814

With this change, the NVMe and NVMeOF drivers are enabled to support
GPUDirect Storage (GDS).
The change is around nvme/nvme-rdma map_data() and unmap_data(), where
the IO request is first intercepted to check for GDS pages; if it is a
GDS page, the request is served by the GDS driver component, nvidia-fs,
otherwise it is served by the standard NVMe driver code.

Signed-off-by: Sourab Gupta
Acked-by: Brad Figg
Acked-by: Ian May
Signed-off-by: Ian May
---
 drivers/nvme/host/Makefile    |   4 +-
 drivers/nvme/host/nvfs-dma.c  |  53 +++++++++++++
 drivers/nvme/host/nvfs-dma.h  | 123 ++++++++++++++++++++++++++++
 drivers/nvme/host/nvfs-rdma.c |  52 ++++++++++++
 drivers/nvme/host/nvfs-rdma.h |  89 ++++++++++++++++++++
 drivers/nvme/host/nvfs.h      | 113 +++++++++++++++++++++++++++
 drivers/nvme/host/pci.c       |  17 +++++
 drivers/nvme/host/rdma.c      |  23 +++++++
 8 files changed, 473 insertions(+), 1 deletion(-)
 create mode 100644 drivers/nvme/host/nvfs-dma.c
 create mode 100644 drivers/nvme/host/nvfs-dma.h
 create mode 100644 drivers/nvme/host/nvfs-rdma.c
 create mode 100644 drivers/nvme/host/nvfs-rdma.h
 create mode 100644 drivers/nvme/host/nvfs.h

diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index 6414ec968f99a..2c87c46177e25 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 
 ccflags-y += -I$(src)
-
+ccflags-y += -DCONFIG_NVFS
 obj-$(CONFIG_NVME_CORE) += nvme-core.o
 obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
 obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
@@ -20,10 +20,12 @@ nvme-core-$(CONFIG_NVME_HWMON) += hwmon.o
 nvme-core-$(CONFIG_NVME_HOST_AUTH) += auth.o
 
 nvme-y += pci.o
+nvme-y += nvfs-dma.o
 
 nvme-fabrics-y += fabrics.o
 
 nvme-rdma-y += rdma.o
+nvme-rdma-y += nvfs-rdma.o
 
 nvme-fc-y += fc.o
diff --git a/drivers/nvme/host/nvfs-dma.c b/drivers/nvme/host/nvfs-dma.c
new file mode 100644
index 0000000000000..f79a3f1fac391
--- /dev/null
+++ b/drivers/nvme/host/nvfs-dma.c
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */ + +#ifdef CONFIG_NVFS +#define MODULE_PREFIX nvme_v1 +#include "nvfs.h" + +struct nvfs_dma_rw_ops *nvfs_ops; + +atomic_t nvfs_shutdown = ATOMIC_INIT(1); + +DEFINE_PER_CPU(long, nvfs_n_ops); + +// must have for compatability +#define NVIDIA_FS_COMPAT_FT(ops) \ + (NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) && NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops)) + +// protected via nvfs_module_mutex +int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops) +{ + if (NVIDIA_FS_COMPAT_FT(ops)) { + nvfs_ops = ops; + atomic_set(&nvfs_shutdown, 0); + return 0; + } else + return -EOPNOTSUPP; + + +} +EXPORT_SYMBOL(REGISTER_FUNC); + +// protected via nvfs_module_mutex +void UNREGISTER_FUNC(void) +{ + (void) atomic_cmpxchg(&nvfs_shutdown, 0, 1); + do{ + msleep(NVFS_HOLD_TIME_MS); + } while (nvfs_count_ops()); + nvfs_ops = NULL; +} +EXPORT_SYMBOL(UNREGISTER_FUNC); +#endif diff --git a/drivers/nvme/host/nvfs-dma.h b/drivers/nvme/host/nvfs-dma.h new file mode 100644 index 0000000000000..34959a409472e --- /dev/null +++ b/drivers/nvme/host/nvfs-dma.h @@ -0,0 +1,123 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef NVFS_DMA_H +#define NVFS_DMA_H + +static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd); + +static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, + struct request *req, struct nvme_rw_command *cmnd); + +static bool nvme_nvfs_unmap_data(struct nvme_dev *dev, struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + enum dma_data_direction dma_dir = rq_dma_dir(req); + + if (!iod || !iod->sgt.nents) + return false; + + if (iod->sgt.sgl && !is_pci_p2pdma_page(sg_page(iod->sgt.sgl)) && + !blk_integrity_rq(req) && + !iod->dma_len && + nvfs_ops != NULL) { + int count; + count = nvfs_ops->nvfs_dma_unmap_sg(dev->dev, iod->sgt.sgl, iod->sgt.nents, dma_dir); + if (!count) + return false; + + nvfs_put_ops(); + return true; + } + + return false; +} + +static blk_status_t nvme_nvfs_map_data(struct nvme_dev *dev, struct request *req, + struct nvme_command *cmnd, bool *is_nvfs_io) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct request_queue *q = req->q; + enum dma_data_direction dma_dir = rq_dma_dir(req); + blk_status_t ret = BLK_STS_RESOURCE; + int nr_mapped; + + nr_mapped = 0; + *is_nvfs_io = false; + + if (!blk_integrity_rq(req) && nvfs_get_ops()) { + iod->dma_len = 0; + iod->sgt.sgl = mempool_alloc(dev->iod_mempool, GFP_ATOMIC); + if (!iod->sgt.sgl) { + nvfs_put_ops(); + return BLK_STS_RESOURCE; + } + + sg_init_table(iod->sgt.sgl, blk_rq_nr_phys_segments(req)); + // associates bio pages to scatterlist + iod->sgt.orig_nents = nvfs_ops->nvfs_blk_rq_map_sg(q, req, iod->sgt.sgl); + if (!iod->sgt.orig_nents) { + mempool_free(iod->sgt.sgl, dev->iod_mempool); + nvfs_put_ops(); + return BLK_STS_IOERR; // reset to original ret + } + *is_nvfs_io = true; + + if (unlikely((iod->sgt.orig_nents == NVFS_IO_ERR))) { + pr_err("%s: failed to map sg_nents=:%d\n", __func__, iod->sgt.nents); + mempool_free(iod->sgt.sgl, 
dev->iod_mempool); + nvfs_put_ops(); + return BLK_STS_IOERR; + } + + nr_mapped = nvfs_ops->nvfs_dma_map_sg_attrs(dev->dev, + iod->sgt.sgl, + iod->sgt.orig_nents, + dma_dir, + DMA_ATTR_NO_WARN); + + + if (unlikely((nr_mapped == NVFS_IO_ERR))) { + mempool_free(iod->sgt.sgl, dev->iod_mempool); + nvfs_put_ops(); + pr_err("%s: failed to dma map sglist=:%d\n", __func__, iod->sgt.nents); + return BLK_STS_IOERR; + } + + if (unlikely(nr_mapped == NVFS_CPU_REQ)) { + mempool_free(iod->sgt.sgl, dev->iod_mempool); + nvfs_put_ops(); + BUG(); + } + + iod->sgt.nents = nr_mapped; + + if (nvme_pci_use_sgls(dev, req, iod->sgt.nents)) { // TBD: not tested on SGL mode supporting drive + ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw); + } else { + // push dma address to hw registers + ret = nvme_pci_setup_prps(dev, req, &cmnd->rw); + } + + if (ret != BLK_STS_OK) { + nvme_nvfs_unmap_data(dev, req); + mempool_free(iod->sgt.sgl, dev->iod_mempool); + } + return ret; + } + return ret; +} + +#endif /* NVFS_DMA_H */ diff --git a/drivers/nvme/host/nvfs-rdma.c b/drivers/nvme/host/nvfs-rdma.c new file mode 100644 index 0000000000000..2bdebc8693fbb --- /dev/null +++ b/drivers/nvme/host/nvfs-rdma.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifdef CONFIG_NVFS +#define MODULE_PREFIX nvme_rdma_v1 +#include "nvfs.h" + +struct nvfs_dma_rw_ops *nvfs_ops; + +atomic_t nvfs_shutdown = ATOMIC_INIT(1); + +DEFINE_PER_CPU(long, nvfs_n_ops); + +// must have for compatability +#define NVIDIA_FS_COMPAT_FT(ops) \ + (NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) && NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops)) + +// protected via nvfs_module_mutex +int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops) +{ + if (NVIDIA_FS_COMPAT_FT(ops)) { + nvfs_ops = ops; + atomic_set(&nvfs_shutdown, 0); + return 0; + } else + return -EOPNOTSUPP; + +} +EXPORT_SYMBOL(REGISTER_FUNC); + +// protected via nvfs_module_mutex +void UNREGISTER_FUNC(void) +{ + (void) atomic_cmpxchg(&nvfs_shutdown, 0, 1); + do{ + msleep(NVFS_HOLD_TIME_MS); + } while (nvfs_count_ops()); + nvfs_ops = NULL; +} +EXPORT_SYMBOL(UNREGISTER_FUNC); +#endif diff --git a/drivers/nvme/host/nvfs-rdma.h b/drivers/nvme/host/nvfs-rdma.h new file mode 100644 index 0000000000000..020fc83f73607 --- /dev/null +++ b/drivers/nvme/host/nvfs-rdma.h @@ -0,0 +1,89 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef NVFS_RDMA_H +#define NVFS_RDMA_H + +static bool nvme_rdma_nvfs_unmap_data(struct ib_device *ibdev, + struct request *rq) + +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + enum dma_data_direction dma_dir = rq_dma_dir(rq); + int count; + + if (!blk_integrity_rq(rq) && nvfs_ops != NULL) { + count = nvfs_ops->nvfs_dma_unmap_sg(ibdev->dma_device, req->data_sgl.sg_table.sgl, req->data_sgl.nents, + dma_dir); + if (count) { + nvfs_put_ops(); + sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); + return true; + } + } + return false; +} + +static int nvme_rdma_nvfs_map_data(struct ib_device *ibdev, struct request *rq, bool *is_nvfs_io, int* count) +{ + struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); + enum dma_data_direction dma_dir = rq_dma_dir(rq); + int ret = 0; + + *is_nvfs_io = false; + *count = 0; + if (!blk_integrity_rq(rq) && nvfs_get_ops()) { + + // associates bio pages to scatterlist + *count = nvfs_ops->nvfs_blk_rq_map_sg(rq->q, rq , req->data_sgl.sg_table.sgl); + if (!*count) { + nvfs_put_ops(); + return 0; // fall to cpu path + } + + *is_nvfs_io = true; + if (unlikely((*count == NVFS_IO_ERR))) { + nvfs_put_ops(); + pr_err("%s: failed to map sg_nents=:%d\n", __func__, req->data_sgl.nents); + return -EIO; + } + req->data_sgl.nents = *count; + + *count = nvfs_ops->nvfs_dma_map_sg_attrs(ibdev->dma_device, + req->data_sgl.sg_table.sgl, + req->data_sgl.nents, + dma_dir, + DMA_ATTR_NO_WARN); + + if (unlikely((*count == NVFS_IO_ERR))) { + nvfs_put_ops(); + return -EIO; + } + + if (unlikely(*count == NVFS_CPU_REQ)) { + nvfs_put_ops(); + BUG(); + return -EIO; + } + + return ret; + } else { + // Fall to CPU path + return 0; + } + + return ret; +} + +#endif diff --git a/drivers/nvme/host/nvfs.h b/drivers/nvme/host/nvfs.h new file mode 100644 index 0000000000000..cf6966771786a --- /dev/null +++ b/drivers/nvme/host/nvfs.h @@ -0,0 +1,113 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ */ + +#ifndef NVFS_H +#define NVFS_H + +#include +#include +#include +#include +#include +#include +#include + +#define REGSTR2(x) x##_register_nvfs_dma_ops +#define REGSTR(x) REGSTR2(x) + +#define UNREGSTR2(x) x##_unregister_nvfs_dma_ops +#define UNREGSTR(x) UNREGSTR2(x) + +#define REGISTER_FUNC REGSTR(MODULE_PREFIX) +#define UNREGISTER_FUNC UNREGSTR(MODULE_PREFIX) + +#define NVFS_IO_ERR -1 +#define NVFS_CPU_REQ -2 + +#define NVFS_HOLD_TIME_MS 1000 + +extern struct nvfs_dma_rw_ops *nvfs_ops; + +extern atomic_t nvfs_shutdown; + +DECLARE_PER_CPU(long, nvfs_n_ops); + +static inline long nvfs_count_ops(void) +{ + int i; + long sum = 0; + + for_each_possible_cpu(i) + sum += per_cpu(nvfs_n_ops, i); + return sum; +} + +static inline bool nvfs_get_ops(void) +{ + if (nvfs_ops && !atomic_read(&nvfs_shutdown)) { + this_cpu_inc(nvfs_n_ops); + return true; + } + return false; +} + +static inline void nvfs_put_ops(void) +{ + this_cpu_dec(nvfs_n_ops); +} + +struct nvfs_dma_rw_ops { + unsigned long long ft_bmap; // feature bitmap + + int (*nvfs_blk_rq_map_sg) (struct request_queue *q, + struct request *req, + struct scatterlist *sglist); + + int (*nvfs_dma_map_sg_attrs) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir, + unsigned long attrs); + + int (*nvfs_dma_unmap_sg) (struct device *device, + struct scatterlist *sglist, + int nents, + enum dma_data_direction dma_dir); + + bool (*nvfs_is_gpu_page) (struct page *page); + + unsigned int (*nvfs_gpu_index) (struct page *page); + + unsigned int (*nvfs_device_priority) (struct device *dev, unsigned int gpu_index); +}; + +// feature list for dma_ops, values indicate bit pos +enum ft_bits { + nvfs_ft_prep_sglist = 1ULL << 0, + nvfs_ft_map_sglist = 1ULL << 1, + nvfs_ft_is_gpu_page = 1ULL << 2, + nvfs_ft_device_priority = 1ULL << 3, +}; + +// check features for use in registration with vendor drivers +#define NVIDIA_FS_CHECK_FT_SGLIST_PREP(ops) ((ops)->ft_bmap & nvfs_ft_prep_sglist) +#define NVIDIA_FS_CHECK_FT_SGLIST_DMA(ops) ((ops)->ft_bmap & nvfs_ft_map_sglist) +#define NVIDIA_FS_CHECK_FT_GPU_PAGE(ops) ((ops)->ft_bmap & nvfs_ft_is_gpu_page) +#define NVIDIA_FS_CHECK_FT_DEVICE_PRIORITY(ops) ((ops)->ft_bmap & nvfs_ft_device_priority) + +int REGISTER_FUNC (struct nvfs_dma_rw_ops *ops); + +void UNREGISTER_FUNC (void); + +#endif /* NVFS_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 710043086dffa..7fb7024cfc3ff 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -31,6 +31,9 @@ #include "trace.h" #include "nvme.h" +#ifdef CONFIG_NVFS +#include "nvfs.h" +#endif #define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) #define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) @@ -537,6 +540,9 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req) } } +#ifdef CONFIG_NVFS +#include "nvfs-dma.h" +#endif static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); @@ -549,7 +555,12 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) WARN_ON_ONCE(!iod->sgt.nents); +#ifdef CONFIG_NVFS + if (!nvme_nvfs_unmap_data(dev, req)) + dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); +#else dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); +#endif if (iod->nr_allocations == 0) dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list, @@ -773,6 +784,12 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, blk_status_t ret = BLK_STS_RESOURCE; int rc; 
+#ifdef CONFIG_NVFS + bool is_nvfs_io = false; + ret = nvme_nvfs_map_data(dev, req, cmnd, &is_nvfs_io); + if (is_nvfs_io) + return ret; +#endif if (blk_rq_nr_phys_segments(req) == 1) { struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct bio_vec bv = req_bvec(req); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 20fdd40b1879f..fd74c37df9e09 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -27,6 +27,9 @@ #include "nvme.h" #include "fabrics.h" +#ifdef CONFIG_NVFS +#include "nvfs.h" +#endif #define NVME_RDMA_CM_TIMEOUT_MS 3000 /* 3 second */ @@ -1209,6 +1212,9 @@ static int nvme_rdma_inv_rkey(struct nvme_rdma_queue *queue, return ib_post_send(queue->qp, &wr, NULL); } +#ifdef CONFIG_NVFS +#include "nvfs-rdma.h" +#endif static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq) { struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); @@ -1220,6 +1226,11 @@ static void nvme_rdma_dma_unmap_req(struct ib_device *ibdev, struct request *rq) NVME_INLINE_METADATA_SG_CNT); } +#ifdef CONFIG_NVFS + if (nvme_rdma_nvfs_unmap_data(ibdev, rq)) + return; +#endif + ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl, req->data_sgl.nents, rq_dma_dir(rq)); sg_free_table_chained(&req->data_sgl.sg_table, NVME_INLINE_SG_CNT); @@ -1473,6 +1484,18 @@ static int nvme_rdma_dma_map_req(struct ib_device *ibdev, struct request *rq, if (ret) return -ENOMEM; +#ifdef CONFIG_NVFS + { + bool is_nvfs_io = false; + ret = nvme_rdma_nvfs_map_data(ibdev, rq, &is_nvfs_io, count); + if (is_nvfs_io) { + if (ret) + goto out_free_table; + return 0; + } + } +#endif + req->data_sgl.nents = blk_rq_map_sg(rq->q, rq, req->data_sgl.sg_table.sgl); From 4836858d0023667135b8b0de1c4a8673bbc9faa0 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 29 Jan 2024 13:46:35 +0100 Subject: [PATCH 008/352] arm64/mm: make set_ptes() robust when OAs cross 48-bit boundary BugLink: https://bugs.launchpad.net/bugs/2059316 Patch series "mm/memory: optimize fork() with PTE-mapped THP", v3. Now that the rmap overhaul[1] is upstream that provides a clean interface for rmap batching, let's implement PTE batching during fork when processing PTE-mapped THPs. This series is partially based on Ryan's previous work[2] to implement cont-pte support on arm64, but its a complete rewrite based on [1] to optimize all architectures independent of any such PTE bits, and to use the new rmap batching functions that simplify the code and prepare for further rmap accounting changes. We collect consecutive PTEs that map consecutive pages of the same large folio, making sure that the other PTE bits are compatible, and (a) adjust the refcount only once per batch, (b) call rmap handling functions only once per batch and (c) perform batch PTE setting/updates. While this series should be beneficial for adding cont-pte support on ARM64[2], it's one of the requirements for maintaining a total mapcount[3] for large folios with minimal added overhead and further changes[4] that build up on top of the total mapcount. Independent of all that, this series results in a speedup during fork with PTE-mapped THP, which is the default with THPs that are smaller than a PMD (for example, 16KiB to 1024KiB mTHPs for anonymous memory[5]). 
On an Intel Xeon Silver 4210R CPU, fork'ing with 1GiB of PTE-mapped folios of the same size (stddev < 1%) results in the following runtimes for fork() (shorter is better): Folio Size | v6.8-rc1 | New | Change ------------------------------------------ 4KiB | 0.014328 | 0.014035 | - 2% 16KiB | 0.014263 | 0.01196 | -16% 32KiB | 0.014334 | 0.01094 | -24% 64KiB | 0.014046 | 0.010444 | -26% 128KiB | 0.014011 | 0.010063 | -28% 256KiB | 0.013993 | 0.009938 | -29% 512KiB | 0.013983 | 0.00985 | -30% 1024KiB | 0.013986 | 0.00982 | -30% 2048KiB | 0.014305 | 0.010076 | -30% Note that these numbers are even better than the ones from v1 (verified over multiple reboots), even though there were only minimal code changes. Well, I removed a pte_mkclean() call for anon folios, maybe that also plays a role. But my experience is that fork() is extremely sensitive to code size, inlining, ... so I suspect we'll see on other architectures rather a change of -20% instead of -30%, and it will be easy to "lose" some of that speedup in the future by subtle code changes. Next up is PTE batching when unmapping. Only tested on x86-64. Compile-tested on most other architectures. [1] https://lkml.kernel.org/r/20231220224504.646757-1-david@redhat.com [2] https://lkml.kernel.org/r/20231218105100.172635-1-ryan.roberts@arm.com [3] https://lkml.kernel.org/r/20230809083256.699513-1-david@redhat.com [4] https://lkml.kernel.org/r/20231124132626.235350-1-david@redhat.com [5] https://lkml.kernel.org/r/20231207161211.2374093-1-ryan.roberts@arm.com This patch (of 15): Since the high bits [51:48] of an OA are not stored contiguously in the PTE, there is a theoretical bug in set_ptes(), which just adds PAGE_SIZE to the pte to get the pte with the next pfn. This works until the pfn crosses the 48-bit boundary, at which point we overflow into the upper attributes. Of course one could argue (and Matthew Wilcox has :) that we will never see a folio cross this boundary because we only allow naturally aligned power-of-2 allocation, so this would require a half-petabyte folio. So its only a theoretical bug. But its better that the code is robust regardless. I've implemented pte_next_pfn() as part of the fix, which is an opt-in core-mm interface. So that is now available to the core-mm, which will be needed shortly to support forthcoming fork()-batching optimizations. Link: https://lkml.kernel.org/r/20240129124649.189745-1-david@redhat.com Link: https://lkml.kernel.org/r/20240125173534.1659317-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240129124649.189745-2-david@redhat.com Fixes: 4a169d61c2ed ("arm64: implement the new page table range API") Closes: https://lore.kernel.org/linux-mm/fdaeb9a5-d890-499a-92c8-d171df43ad01@arm.com/ Signed-off-by: Ryan Roberts Signed-off-by: David Hildenbrand Reviewed-by: Catalin Marinas Reviewed-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Alexandre Ghiti Signed-off-by: Andrew Morton (cherry picked from commit 6e8f588708971e0626f5be808e8c4b6cdb86eb0b) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 79ce70fbb751c..52d0b0a763f16 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -341,6 +341,22 @@ static inline void __sync_cache_and_tags(pte_t pte, unsigned int nr_pages) mte_sync_tags(pte, nr_pages); } +/* + * Select all bits except the pfn + */ +static inline pgprot_t pte_pgprot(pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); +} + +#define pte_next_pfn pte_next_pfn +static inline pte_t pte_next_pfn(pte_t pte) +{ + return pfn_pte(pte_pfn(pte) + 1, pte_pgprot(pte)); +} + static inline void set_ptes(struct mm_struct *mm, unsigned long __always_unused addr, pte_t *ptep, pte_t pte, unsigned int nr) @@ -354,7 +370,7 @@ static inline void set_ptes(struct mm_struct *mm, if (--nr == 0) break; ptep++; - pte_val(pte) += PAGE_SIZE; + pte = pte_next_pfn(pte); } } #define set_ptes set_ptes @@ -433,16 +449,6 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE)); } -/* - * Select all bits except the pfn - */ -static inline pgprot_t pte_pgprot(pte_t pte) -{ - unsigned long pfn = pte_pfn(pte); - - return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); -} - #ifdef CONFIG_NUMA_BALANCING /* * See the comment in include/linux/pgtable.h From 402f7da3c286d6bd57bc8ed76e1b551de2693653 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:36 +0100 Subject: [PATCH 009/352] arm/pgtable: define PFN_PTE_SHIFT BugLink: https://bugs.launchpad.net/bugs/2059316 We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-3-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 12b884f2e09ab42d3879a3e2c703e7157691013c) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h index d657b84b6bf70..be91e376df79e 100644 --- a/arch/arm/include/asm/pgtable.h +++ b/arch/arm/include/asm/pgtable.h @@ -209,6 +209,8 @@ static inline void __sync_icache_dcache(pte_t pteval) extern void __sync_icache_dcache(pte_t pteval); #endif +#define PFN_PTE_SHIFT PAGE_SHIFT + void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval, unsigned int nr); #define set_ptes set_ptes From b15a89c717a894b5b648d1b80c7a248a1ebbf2e3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:37 +0100 Subject: [PATCH 010/352] nios2/pgtable: define PFN_PTE_SHIFT BugLink: https://bugs.launchpad.net/bugs/2059316 We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-4-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 3a6a6c3fbda8f50fc9f0e5fede8a0f70abdea033) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/nios2/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/nios2/include/asm/pgtable.h b/arch/nios2/include/asm/pgtable.h index 5144506dfa693..d052dfcbe8d3a 100644 --- a/arch/nios2/include/asm/pgtable.h +++ b/arch/nios2/include/asm/pgtable.h @@ -178,6 +178,8 @@ static inline void set_pte(pte_t *ptep, pte_t pteval) *ptep = pteval; } +#define PFN_PTE_SHIFT 0 + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { From 37036d51d070b22856a3d089a694f924c125892e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:38 +0100 Subject: [PATCH 011/352] powerpc/pgtable: define PFN_PTE_SHIFT BugLink: https://bugs.launchpad.net/bugs/2059316 We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-5-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit f7dc4d689e6fafe3d8424f600b924f2d59d1a3cf) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/powerpc/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 9224f23065fff..7a1ba8889aeae 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -41,6 +41,8 @@ struct mm_struct; #ifndef __ASSEMBLY__ +#define PFN_PTE_SHIFT PTE_RPN_SHIFT + void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); #define set_ptes set_ptes From f74c919482714332afa80589f0a487d1622ad3a6 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:39 +0100 Subject: [PATCH 012/352] riscv/pgtable: define PFN_PTE_SHIFT BugLink: https://bugs.launchpad.net/bugs/2059316 We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-6-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Alexandre Ghiti Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 57c254b2fb31f0160829f4bf1cb993a9e9c302a8) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/riscv/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 39c6bb8254683..67d69bf949e84 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -529,6 +529,8 @@ static inline void __set_pte_at(pte_t *ptep, pte_t pteval) set_pte(ptep, pteval); } +#define PFN_PTE_SHIFT _PAGE_PFN_SHIFT + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval, unsigned int nr) { From 0bfc2f1a07f4f45f15ef3ee99f1c0789f1844cc0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:40 +0100 Subject: [PATCH 013/352] s390/pgtable: define PFN_PTE_SHIFT BugLink: https://bugs.launchpad.net/bugs/2059316 We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-7-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 4555ac8b3c16f67f74c04ff71ce8c4a8fcee973a) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/s390/include/asm/pgtable.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index e74200b1b895a..2bbb4b653c6b6 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1326,6 +1326,8 @@ pgprot_t pgprot_writecombine(pgprot_t prot); #define pgprot_writethrough pgprot_writethrough pgprot_t pgprot_writethrough(pgprot_t prot); +#define PFN_PTE_SHIFT PAGE_SHIFT + /* * Set multiple PTEs to consecutive pages with a single call. All PTEs * are within the same folio, PMD and VMA. From aa571074c3affd03f88fb283a03af627548605e3 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:41 +0100 Subject: [PATCH 014/352] sparc/pgtable: define PFN_PTE_SHIFT BugLink: https://bugs.launchpad.net/bugs/2059316 We want to make use of pte_next_pfn() outside of set_ptes(). Let's simply define PFN_PTE_SHIFT, required by pte_next_pfn(). Link: https://lkml.kernel.org/r/20240129124649.189745-8-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit ce7a9de353da053e55a68e2441196114547e38d0) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/sparc/include/asm/pgtable_64.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index a8c871b7d7860..652af9d63fa29 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -929,6 +929,8 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, maybe_tlb_batch_add(mm, addr, ptep, orig, fullmm, PAGE_SHIFT); } +#define PFN_PTE_SHIFT PAGE_SHIFT + static inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { From 94c5d8163e547b1ec8f918c922e7fb5b78a6174d Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:42 +0100 Subject: [PATCH 015/352] mm/pgtable: make pte_next_pfn() independent of set_ptes() BugLink: https://bugs.launchpad.net/bugs/2059316 Let's provide pte_next_pfn(), independently of set_ptes(). This allows for using the generic pte_next_pfn() version in some arch-specific set_ptes() implementations, and prepares for reusing pte_next_pfn() in other context. Link: https://lkml.kernel.org/r/20240129124649.189745-9-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 6cdfa1d5d5d8285108495c33588c48cdda81b647) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- include/linux/pgtable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index f6d0e3513948a..351cd9dc7194f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef set_ptes #ifndef pte_next_pfn static inline pte_t pte_next_pfn(pte_t pte) @@ -221,6 +220,7 @@ static inline pte_t pte_next_pfn(pte_t pte) } #endif +#ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. * @mm: Address space to map the pages into. From 766ce932ca863ece00ecdfa317033ca48d4d8e93 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:43 +0100 Subject: [PATCH 016/352] arm/mm: use pte_next_pfn() in set_ptes() BugLink: https://bugs.launchpad.net/bugs/2059316 Let's use our handy helper now that it's available on all archs. Link: https://lkml.kernel.org/r/20240129124649.189745-10-david@redhat.com Signed-off-by: David Hildenbrand Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit e5ea320aec811c0e5cddefda17052579e0306415) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm/mm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 674ed71573a84..c24e29c0b9a48 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1814,6 +1814,6 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, if (--nr == 0) break; ptep++; - pte_val(pteval) += PAGE_SIZE; + pteval = pte_next_pfn(pteval); } } From 8e8dd0e093206b4220a058574d655f00603908ff Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:44 +0100 Subject: [PATCH 017/352] powerpc/mm: use pte_next_pfn() in set_ptes() BugLink: https://bugs.launchpad.net/bugs/2059316 Let's use our handy new helper. Note that the implementation is slightly different, but shouldn't really make a difference in practice. Link: https://lkml.kernel.org/r/20240129124649.189745-11-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Christophe Leroy Tested-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 802cc2ab33b0d8a013c216ca7f4caa9034bfc257) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/powerpc/mm/pgtable.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index a04ae4449a025..549a440ed7f65 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -220,10 +220,7 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, break; ptep++; addr += PAGE_SIZE; - /* - * increment the pfn. - */ - pte = pfn_pte(pte_pfn(pte) + 1, pte_pgprot((pte))); + pte = pte_next_pfn(pte); } } From 32b215e4ed336a036c2fbadeef1775694ac94e26 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:45 +0100 Subject: [PATCH 018/352] mm/memory: factor out copying the actual PTE in copy_present_pte() BugLink: https://bugs.launchpad.net/bugs/2059316 Let's prepare for further changes. Link: https://lkml.kernel.org/r/20240129124649.189745-12-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 23ed190868a65525b8941370630fbb215f12ebe8) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- mm/memory.c | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index d5753ed81fc71..e28b23ee6e786 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -930,6 +930,29 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma return 0; } +static inline void __copy_present_pte(struct vm_area_struct *dst_vma, + struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, + pte_t pte, unsigned long addr) +{ + struct mm_struct *src_mm = src_vma->vm_mm; + + /* If it's a COW mapping, write protect it both processes. */ + if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { + ptep_set_wrprotect(src_mm, addr, src_pte); + pte = pte_wrprotect(pte); + } + + /* If it's a shared mapping, mark it clean in the child. */ + if (src_vma->vm_flags & VM_SHARED) + pte = pte_mkclean(pte); + pte = pte_mkold(pte); + + if (!userfaultfd_wp(dst_vma)) + pte = pte_clear_uffd_wp(pte); + + set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); +} + /* * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page * is required to copy this pte. 
@@ -939,23 +962,23 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct folio **prealloc) { - struct mm_struct *src_mm = src_vma->vm_mm; - unsigned long vm_flags = src_vma->vm_flags; pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; page = vm_normal_page(src_vma, addr, pte); - if (page) - folio = page_folio(page); - if (page && folio_test_anon(folio)) { + if (unlikely(!page)) + goto copy_pte; + + folio = page_folio(page); + folio_get(folio); + if (folio_test_anon(folio)) { /* * If this page may have been pinned by the parent process, * copy the page immediately for the child so that we'll always * guarantee the pinned page won't be randomly replaced in the * future. */ - folio_get(folio); if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); @@ -963,34 +986,14 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, addr, rss, prealloc, page); } rss[MM_ANONPAGES]++; - } else if (page) { - folio_get(folio); + VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); + } else { folio_dup_file_rmap_pte(folio, page); rss[mm_counter_file(page)]++; } - /* - * If it's a COW mapping, write protect it both - * in the parent and the child - */ - if (is_cow_mapping(vm_flags) && pte_write(pte)) { - ptep_set_wrprotect(src_mm, addr, src_pte); - pte = pte_wrprotect(pte); - } - VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page)); - - /* - * If it's a shared mapping, mark it clean in - * the child - */ - if (vm_flags & VM_SHARED) - pte = pte_mkclean(pte); - pte = pte_mkold(pte); - - if (!userfaultfd_wp(dst_vma)) - pte = pte_clear_uffd_wp(pte); - - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); +copy_pte: + __copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, pte, addr); return 0; } From 29f98cb8532411b819e64a73909fcef3d75e933a Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:46 +0100 Subject: [PATCH 019/352] mm/memory: pass PTE to copy_present_pte() BugLink: https://bugs.launchpad.net/bugs/2059316 We already read it, let's just forward it. This patch is based on work by Ryan Roberts. [david@redhat.com: fix the hmm "exclusive_cow" selftest] Link: https://lkml.kernel.org/r/13f296b8-e882-47fd-b939-c2141dc28717@redhat.com Link: https://lkml.kernel.org/r/20240129124649.189745-13-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (cherry picked from commit 53723298ba436830fdf0744c19b57b2a18f44041) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- mm/memory.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e28b23ee6e786..5bc321a6fb50b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -959,10 +959,9 @@ static inline void __copy_present_pte(struct vm_area_struct *dst_vma, */ static inline int copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, - pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, - struct folio **prealloc) + pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, + int *rss, struct folio **prealloc) { - pte_t pte = ptep_get(src_pte); struct page *page; struct folio *folio; @@ -1094,6 +1093,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, progress += 8; continue; } + ptent = ptep_get(src_pte); + VM_WARN_ON_ONCE(!pte_present(ptent)); /* * Device exclusive entry restored, continue by copying @@ -1103,7 +1104,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } /* copy_present_pte() will clear `*prealloc' if consumed */ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, &prealloc); + ptent, addr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. From 388ef2edba67b8e67aba9c430ef85fd88feb5360 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:47 +0100 Subject: [PATCH 020/352] mm/memory: optimize fork() with PTE-mapped THP BugLink: https://bugs.launchpad.net/bugs/2059316 Let's implement PTE batching when consecutive (present) PTEs map consecutive pages of the same large folio, and all other PTE bits besides the PFNs are equal. We will optimize folio_pte_batch() separately, to ignore selected PTE bits. This patch is based on work by Ryan Roberts. Use __always_inline for __copy_present_ptes() and keep the handling for single PTEs completely separate from the multi-PTE case: we really want the compiler to optimize for the single-PTE case with small folios, to not degrade performance. Note that PTE batching will never exceed a single page table and will always stay within VMA boundaries. Further, processing PTE-mapped THP that maybe pinned and have PageAnonExclusive set on at least one subpage should work as expected, but there is room for improvement: We will repeatedly (1) detect a PTE batch (2) detect that we have to copy a page (3) fall back and allocate a single page to copy a single page. For now we won't care as pinned pages are a corner case, and we should rather look into maintaining only a single PageAnonExclusive bit for large folios. Link: https://lkml.kernel.org/r/20240129124649.189745-14-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Reviewed-by: Mike Rapoport (IBM) Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. 
Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton (backported from commit f8d937761d65c87e9987b88ea7beb7bddc333a0e) [ dannf: mm_counter_file() in v6.8 took a page instead of a folio ] Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- include/linux/pgtable.h | 31 +++++++++++ mm/memory.c | 112 +++++++++++++++++++++++++++++++++------- 2 files changed, 124 insertions(+), 19 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 351cd9dc7194f..aab227e12493f 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -650,6 +650,37 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef wrprotect_ptes +/** + * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same + * folio. + * @mm: Address space the pages are mapped into. + * @addr: Address the first page is mapped at. + * @ptep: Page table pointer for the first entry. + * @nr: Number of entries to write-protect. + * + * May be overridden by the architecture; otherwise, implemented as a simple + * loop over ptep_set_wrprotect(). + * + * Note that PTE bits in the PTE range besides the PFN can differ. For example, + * some PTEs might be write-protected. + * + * Context: The caller holds the page table lock. The PTEs map consecutive + * pages that belong to the same folio. The PTEs are all in the same PMD. + */ +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + for (;;) { + ptep_set_wrprotect(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} +#endif + /* * On some architectures hardware does not set page access bit when accessing * memory page, it is responsibility of software setting this bit. It brings diff --git a/mm/memory.c b/mm/memory.c index 5bc321a6fb50b..749562f5bf1e2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -930,15 +930,15 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma return 0; } -static inline void __copy_present_pte(struct vm_area_struct *dst_vma, +static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, - pte_t pte, unsigned long addr) + pte_t pte, unsigned long addr, int nr) { struct mm_struct *src_mm = src_vma->vm_mm; /* If it's a COW mapping, write protect it both processes. */ if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) { - ptep_set_wrprotect(src_mm, addr, src_pte); + wrprotect_ptes(src_mm, addr, src_pte, nr); pte = pte_wrprotect(pte); } @@ -950,26 +950,93 @@ static inline void __copy_present_pte(struct vm_area_struct *dst_vma, if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); - set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); + set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); +} + +/* + * Detect a PTE batch: consecutive (present) PTEs that map consecutive + * pages of the same folio. + * + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN. 
+ */ +static inline int folio_pte_batch(struct folio *folio, unsigned long addr, + pte_t *start_ptep, pte_t pte, int max_nr) +{ + unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); + const pte_t *end_ptep = start_ptep + max_nr; + pte_t expected_pte = pte_next_pfn(pte); + pte_t *ptep = start_ptep + 1; + + VM_WARN_ON_FOLIO(!pte_present(pte), folio); + + while (ptep != end_ptep) { + pte = ptep_get(ptep); + + if (!pte_same(pte, expected_pte)) + break; + + /* + * Stop immediately once we reached the end of the folio. In + * corner cases the next PFN might fall into a different + * folio. + */ + if (pte_pfn(pte) == folio_end_pfn) + break; + + expected_pte = pte_next_pfn(expected_pte); + ptep++; + } + + return ptep - start_ptep; } /* - * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page - * is required to copy this pte. + * Copy one present PTE, trying to batch-process subsequent PTEs that map + * consecutive pages of the same folio by copying them as well. + * + * Returns -EAGAIN if one preallocated page is required to copy the next PTE. + * Otherwise, returns the number of copied PTEs (at least 1). */ static inline int -copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, +copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr, - int *rss, struct folio **prealloc) + int max_nr, int *rss, struct folio **prealloc) { struct page *page; struct folio *folio; + int err, nr; page = vm_normal_page(src_vma, addr, pte); if (unlikely(!page)) goto copy_pte; folio = page_folio(page); + + /* + * If we likely have to copy, just don't bother with batching. Make + * sure that the common "small folio" case is as fast as possible + * by keeping the batching logic separate. + */ + if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr); + folio_ref_add(folio, nr); + if (folio_test_anon(folio)) { + if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, + nr, src_vma))) { + folio_ref_sub(folio, nr); + return -EAGAIN; + } + rss[MM_ANONPAGES] += nr; + VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); + } else { + folio_dup_file_rmap_ptes(folio, page, nr); + rss[mm_counter_file(page)] += nr; + } + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, + addr, nr); + return nr; + } + folio_get(folio); if (folio_test_anon(folio)) { /* @@ -981,8 +1048,9 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) { /* Page may be pinned, we have to copy. */ folio_put(folio); - return copy_present_page(dst_vma, src_vma, dst_pte, src_pte, - addr, rss, prealloc, page); + err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, + addr, rss, prealloc, page); + return err ? 
err : 1; } rss[MM_ANONPAGES]++; VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio); @@ -992,8 +1060,8 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, } copy_pte: - __copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, pte, addr); - return 0; + __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1); + return 1; } static inline struct folio *folio_prealloc(struct mm_struct *src_mm, @@ -1030,10 +1098,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *src_pte, *dst_pte; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; - int progress, ret = 0; + int progress, max_nr, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct folio *prealloc = NULL; + int nr; again: progress = 0; @@ -1064,6 +1133,8 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, arch_enter_lazy_mmu_mode(); do { + nr = 1; + /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. @@ -1102,9 +1173,10 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, */ WARN_ON_ONCE(ret != -ENOENT); } - /* copy_present_pte() will clear `*prealloc' if consumed */ - ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, - ptent, addr, rss, &prealloc); + /* copy_present_ptes() will clear `*prealloc' if consumed */ + max_nr = (end - addr) / PAGE_SIZE; + ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, + ptent, addr, max_nr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. @@ -1121,8 +1193,10 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, folio_put(prealloc); prealloc = NULL; } - progress += 8; - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + nr = ret; + progress += 8 * nr; + } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr, + addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(orig_src_pte, src_ptl); @@ -1143,7 +1217,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, prealloc = folio_prealloc(src_mm, src_vma, addr, false); if (!prealloc) return -ENOMEM; - } else if (ret) { + } else if (ret < 0) { VM_WARN_ON_ONCE(1); } From 2877a4c7dbfc475ccecd8c2f38725b3faa063fdb Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:48 +0100 Subject: [PATCH 021/352] mm/memory: ignore dirty/accessed/soft-dirty bits in folio_pte_batch() BugLink: https://bugs.launchpad.net/bugs/2059316 Let's always ignore the accessed/young bit: we'll always mark the PTE as old in our child process during fork, and upcoming users will similarly not care. Ignore the dirty bit only if we don't want to duplicate the dirty bit into the child process during fork. Maybe, we could just set all PTEs in the child dirty if any PTE is dirty. For now, let's keep the behavior unchanged, this can be optimized later if required. Ignore the soft-dirty bit only if the bit doesn't have any meaning in the src vma, and similarly won't have any in the copied dst vma. For now, we won't bother with the uffd-wp bit. Link: https://lkml.kernel.org/r/20240129124649.189745-15-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. 
Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton (cherry picked from commit 25365e10699aa0e320345d019194fbea9f37a4ae) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- mm/memory.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 749562f5bf1e2..bf77d63b3dd9c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -953,24 +953,44 @@ static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma, set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr); } +/* Flags for folio_pte_batch(). */ +typedef int __bitwise fpb_t; + +/* Compare PTEs after pte_mkclean(), ignoring the dirty bit. */ +#define FPB_IGNORE_DIRTY ((__force fpb_t)BIT(0)) + +/* Compare PTEs after pte_clear_soft_dirty(), ignoring the soft-dirty bit. */ +#define FPB_IGNORE_SOFT_DIRTY ((__force fpb_t)BIT(1)) + +static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) +{ + if (flags & FPB_IGNORE_DIRTY) + pte = pte_mkclean(pte); + if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) + pte = pte_clear_soft_dirty(pte); + return pte_mkold(pte); +} + /* * Detect a PTE batch: consecutive (present) PTEs that map consecutive * pages of the same folio. * - * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN. + * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, + * the accessed bit, dirty bit (with FPB_IGNORE_DIRTY) and soft-dirty bit + * (with FPB_IGNORE_SOFT_DIRTY). */ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr) + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte = pte_next_pfn(pte); + pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); pte_t *ptep = start_ptep + 1; VM_WARN_ON_FOLIO(!pte_present(pte), folio); while (ptep != end_ptep) { - pte = ptep_get(ptep); + pte = __pte_batch_clear_ignored(ptep_get(ptep), flags); if (!pte_same(pte, expected_pte)) break; @@ -1004,6 +1024,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma { struct page *page; struct folio *folio; + fpb_t flags = 0; int err, nr; page = vm_normal_page(src_vma, addr, pte); @@ -1018,7 +1039,12 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma * by keeping the batching logic separate. */ if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) { - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr); + if (src_vma->vm_flags & VM_SHARED) + flags |= FPB_IGNORE_DIRTY; + if (!vma_soft_dirty_enabled(src_vma)) + flags |= FPB_IGNORE_SOFT_DIRTY; + + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, From b8f39c8d3a939122520fdf602ac743a9ddf3b5ea Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 29 Jan 2024 13:46:49 +0100 Subject: [PATCH 022/352] mm/memory: ignore writable bit in folio_pte_batch() BugLink: https://bugs.launchpad.net/bugs/2059316 ... 
and conditionally return to the caller if any PTE except the first one is writable. fork() has to make sure to properly write-protect in case any PTE is writable. Other users (e.g., page unmaping) are expected to not care. Link: https://lkml.kernel.org/r/20240129124649.189745-16-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Ryan Roberts Cc: Albert Ou Cc: Alexander Gordeev Cc: Alexandre Ghiti Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christophe Leroy Cc: David S. Miller Cc: Dinh Nguyen Cc: Gerald Schaefer Cc: Heiko Carstens Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Naveen N. Rao Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Russell King (Oracle) Cc: Sven Schnelle Cc: Vasily Gorbik Cc: Will Deacon Cc: Mike Rapoport (IBM) Signed-off-by: Andrew Morton (cherry picked from commit d7c0e5f722ab229153c22efc836bf220479bdce6) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- mm/memory.c | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index bf77d63b3dd9c..32498c300fdf0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -968,7 +968,7 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) pte = pte_mkclean(pte); if (likely(flags & FPB_IGNORE_SOFT_DIRTY)) pte = pte_clear_soft_dirty(pte); - return pte_mkold(pte); + return pte_wrprotect(pte_mkold(pte)); } /* @@ -976,21 +976,32 @@ static inline pte_t __pte_batch_clear_ignored(pte_t pte, fpb_t flags) * pages of the same folio. * * All PTEs inside a PTE batch have the same PTE bits set, excluding the PFN, - * the accessed bit, dirty bit (with FPB_IGNORE_DIRTY) and soft-dirty bit - * (with FPB_IGNORE_SOFT_DIRTY). + * the accessed bit, writable bit, dirty bit (with FPB_IGNORE_DIRTY) and + * soft-dirty bit (with FPB_IGNORE_SOFT_DIRTY). + * + * If "any_writable" is set, it will indicate if any other PTE besides the + * first (given) PTE is writable. 
*/ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, - pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags) + pte_t *start_ptep, pte_t pte, int max_nr, fpb_t flags, + bool *any_writable) { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); pte_t *ptep = start_ptep + 1; + bool writable; + + if (any_writable) + *any_writable = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); while (ptep != end_ptep) { - pte = __pte_batch_clear_ignored(ptep_get(ptep), flags); + pte = ptep_get(ptep); + if (any_writable) + writable = !!pte_write(pte); + pte = __pte_batch_clear_ignored(pte, flags); if (!pte_same(pte, expected_pte)) break; @@ -1003,6 +1014,9 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, if (pte_pfn(pte) == folio_end_pfn) break; + if (any_writable) + *any_writable |= writable; + expected_pte = pte_next_pfn(expected_pte); ptep++; } @@ -1024,6 +1038,7 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma { struct page *page; struct folio *folio; + bool any_writable; fpb_t flags = 0; int err, nr; @@ -1044,7 +1059,8 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma if (!vma_soft_dirty_enabled(src_vma)) flags |= FPB_IGNORE_SOFT_DIRTY; - nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags); + nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags, + &any_writable); folio_ref_add(folio, nr); if (folio_test_anon(folio)) { if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page, @@ -1058,6 +1074,8 @@ copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma folio_dup_file_rmap_ptes(folio, page, nr); rss[mm_counter_file(page)] += nr; } + if (any_writable) + pte = pte_mkwrite(pte, src_vma); __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, nr); return nr; From 07ad4059963b8d5ff09eeedf4dd00914334d32cd Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:48 +0000 Subject: [PATCH 023/352] mm: clarify the spec for set_ptes() BugLink: https://bugs.launchpad.net/bugs/2059316 Patch series "Transparent Contiguous PTEs for User Mappings", v6. This is a series to opportunistically and transparently use contpte mappings (set the contiguous bit in ptes) for user memory when those mappings meet the requirements. The change benefits arm64, but there is some (very) minor refactoring for x86 to enable its integration with core-mm. It is part of a wider effort to improve performance by allocating and mapping variable-sized blocks of memory (folios). One aim is for the 4K kernel to approach the performance of the 16K kernel, but without breaking compatibility and without the associated increase in memory. Another aim is to benefit the 16K and 64K kernels by enabling 2M THP, since this is the contpte size for those kernels. We have good performance data that demonstrates both aims are being met (see below). Of course this is only one half of the change. We require the mapped physical memory to be the correct size and alignment for this to actually be useful (i.e. 64K for 4K pages, or 2M for 16K/64K pages). Fortunately folios are solving this problem for us. Filesystems that support it (XFS, AFS, EROFS, tmpfs, ...) will allocate large folios up to the PMD size today, and more filesystems are coming. And for anonymous memory, "multi-size THP" is now upstream. 
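To make "when those mappings meet the requirements" concrete, here is a rough
sketch of the eligibility invariant (illustrative only: this helper is
hypothetical, not code from the series, and the real arm64 implementation that
lands in the later patches also has to reason about dirty/young bits and
partial blocks). A contpte block is possible only when a naturally aligned run
of PTEs maps a naturally aligned, physically contiguous run of pages with
identical attributes:

	/*
	 * Hypothetical illustration, not part of this series. CONT_PTES is
	 * the number of PTEs covered by one contiguous-bit block (16 with
	 * 4K base pages, i.e. a 64K block). pte_advance_pfn() is the helper
	 * added a few patches later in this series.
	 */
	static bool cont_block_eligible(pte_t *ptep, unsigned long addr)
	{
		pte_t first = ptep_get(ptep);
		unsigned long pfn = pte_pfn(first);
		int i;

		/* Virtual and physical ranges must be naturally aligned. */
		if ((addr & (CONT_PTES * PAGE_SIZE - 1)) || (pfn & (CONT_PTES - 1)))
			return false;

		for (i = 1; i < CONT_PTES; i++) {
			/* Identical attribute bits, PFN advancing by one per entry. */
			if (!pte_same(ptep_get(ptep + i), pte_advance_pfn(first, i)))
				return false;
		}
		return true;
	}

Large folios satisfy the physical-contiguity half of this check by
construction, which is why the folio work above makes the optimization viable.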
Patch Layout
============

In this version, I've split the patches to better show each optimization:

  - 1-2:   mm prep: misc code and docs cleanups
  - 3-6:   mm,arm64,x86 prep: Add pte_advance_pfn() and make pte_next_pfn() a
           generic wrapper around it
  - 7-11:  arm64 prep: Refactor ptep helpers into new layer
  - 12:    functional contpte implementation
  - 13-18: various optimizations on top of the contpte implementation

Testing
=======

I've tested this series on both Ampere Altra (bare metal) and Apple M2 (VM):
  - mm selftests (inc new tests written for multi-size THP); no regressions
  - Speedometer JavaScript benchmark in Chromium web browser; no issues
  - Kernel compilation; no issues
  - Various tests under high memory pressure with swap enabled; no issues

Performance
===========

High Level Use Cases
~~~~~~~~~~~~~~~~~~~~

First some high level use cases (kernel compilation and speedometer JavaScript
benchmarks). These are running on Ampere Altra (I've seen similar improvements
on Android/Pixel 6).

baseline:                  mm-unstable (mTHP switched off)
mTHP:                      + enable 16K, 32K, 64K mTHP sizes "always"
mTHP + contpte:            + this series
mTHP + contpte + exefolio: + patch at [6], which this series supports

Kernel Compilation with -j8 (negative is faster):

| kernel                    | real-time | kern-time | user-time |
|---------------------------|-----------|-----------|-----------|
| baseline                  |      0.0% |      0.0% |      0.0% |
| mTHP                      |     -5.0% |    -39.1% |     -0.7% |
| mTHP + contpte            |     -6.0% |    -41.4% |     -1.5% |
| mTHP + contpte + exefolio |     -7.8% |    -43.1% |     -3.4% |

Kernel Compilation with -j80 (negative is faster):

| kernel                    | real-time | kern-time | user-time |
|---------------------------|-----------|-----------|-----------|
| baseline                  |      0.0% |      0.0% |      0.0% |
| mTHP                      |     -5.0% |    -36.6% |     -0.6% |
| mTHP + contpte            |     -6.1% |    -38.2% |     -1.6% |
| mTHP + contpte + exefolio |     -7.4% |    -39.2% |     -3.2% |

Speedometer (positive is faster):

| kernel                    | runs_per_min |
|:--------------------------|--------------|
| baseline                  |         0.0% |
| mTHP                      |         1.5% |
| mTHP + contpte            |         3.2% |
| mTHP + contpte + exefolio |         4.5% |

Micro Benchmarks
~~~~~~~~~~~~~~~~

The following microbenchmarks are intended to demonstrate that the performance
of fork() and munmap() does not regress. I'm showing results for order-0 (4K)
mappings, and for order-9 (2M) PTE-mapped THP. Thanks to David for sharing his
benchmarks.

baseline:       mm-unstable + batch zap [7] series
contpte-basic:  + patches 0-19; functional contpte implementation
contpte-batch:  + patches 20-23; implement new batched APIs
contpte-inline: + patch 24; __always_inline to help compiler
contpte-fold:   + patch 25; fold contpte mapping when sensible

Primary platform is Ampere Altra bare metal. I'm also showing results for M2 VM
(on top of MacOS) for reference, although experience suggests this might not be
the most reliable for performance numbers of this sort:

| FORK           |        order-0         |        order-9         |
| Ampere Altra   |------------------------|------------------------|
| (pte-map)      |       mean |     stdev |       mean |     stdev |
|----------------|------------|-----------|------------|-----------|
| baseline       |       0.0% |      2.7% |       0.0% |      0.2% |
| contpte-basic  |       6.3% |      1.4% |    1948.7% |      0.2% |
| contpte-batch  |       7.6% |      2.0% |      -1.9% |      0.4% |
| contpte-inline |       3.6% |      1.5% |      -1.0% |      0.2% |
| contpte-fold   |       4.6% |      2.1% |      -1.8% |      0.2% |

| MUNMAP         |        order-0         |        order-9         |
| Ampere Altra   |------------------------|------------------------|
| (pte-map)      |       mean |     stdev |       mean |     stdev |
|----------------|------------|-----------|------------|-----------|
| baseline       |       0.0% |      0.5% |       0.0% |      0.3% |
| contpte-basic  |       1.8% |      0.3% |    1104.8% |      0.1% |
| contpte-batch  |      -0.3% |      0.4% |       2.7% |      0.1% |
| contpte-inline |      -0.1% |      0.6% |       0.9% |      0.1% |
| contpte-fold   |       0.1% |      0.6% |       0.8% |      0.1% |

| FORK           |        order-0         |        order-9         |
| Apple M2 VM    |------------------------|------------------------|
| (pte-map)      |       mean |     stdev |       mean |     stdev |
|----------------|------------|-----------|------------|-----------|
| baseline       |       0.0% |      1.4% |       0.0% |      0.8% |
| contpte-basic  |       6.8% |      1.2% |     469.4% |      1.4% |
| contpte-batch  |      -7.7% |      2.0% |      -8.9% |      0.7% |
| contpte-inline |      -6.0% |      2.1% |      -6.0% |      2.0% |
| contpte-fold   |       5.9% |      1.4% |      -6.4% |      1.4% |

| MUNMAP         |        order-0         |        order-9         |
| Apple M2 VM    |------------------------|------------------------|
| (pte-map)      |       mean |     stdev |       mean |     stdev |
|----------------|------------|-----------|------------|-----------|
| baseline       |       0.0% |      0.6% |       0.0% |      0.4% |
| contpte-basic  |       1.6% |      0.6% |     233.6% |      0.7% |
| contpte-batch  |       1.9% |      0.3% |      -3.9% |      0.4% |
| contpte-inline |       2.2% |      0.8% |      -1.6% |      0.9% |
| contpte-fold   |       1.5% |      0.7% |      -1.7% |      0.7% |

Misc
~~~~

John Hubbard at Nvidia has indicated dramatic 10x performance improvements for
some workloads at [8], when using a 64K base page kernel.

[1] https://lore.kernel.org/linux-arm-kernel/20230622144210.2623299-1-ryan.roberts@arm.com/
[2] https://lore.kernel.org/linux-arm-kernel/20231115163018.1303287-1-ryan.roberts@arm.com/
[3] https://lore.kernel.org/linux-arm-kernel/20231204105440.61448-1-ryan.roberts@arm.com/
[4] https://lore.kernel.org/lkml/20231218105100.172635-1-ryan.roberts@arm.com/
[5] https://lore.kernel.org/linux-mm/633af0a7-0823-424f-b6ef-374d99483f05@arm.com/
[6] https://lore.kernel.org/lkml/08c16f7d-f3b3-4f22-9acc-da943f647dc3@arm.com/
[7] https://lore.kernel.org/linux-mm/20240214204435.167852-1-david@redhat.com/
[8] https://lore.kernel.org/linux-mm/c507308d-bdd4-5f9e-d4ff-e96e4520be85@nvidia.com/
[9] https://gitlab.arm.com/linux-arm/linux-rr/-/tree/features/granule_perf/contpte-lkml_v6

This patch (of 18):

set_ptes() spec implies that it can only be used to set a present pte because
it interprets the PFN field to increment it. However, set_pte_at() has been
implemented on top of set_ptes() since set_ptes() was introduced, and
set_pte_at() allows setting a pte to a not-present state.

So clarify the spec to state that when nr==1, the new state of the pte may be
present or not present. When nr>1, the new state of all ptes must be present.

While we are at it, tighten the spec to set requirements around the initial
state of ptes; when nr==1 it may be either present or not-present. But when
nr>1 all ptes must initially be not-present. All set_ptes() callsites already
conform to this requirement.
Stating it explicitly is useful because it allows for a simplification to the upcoming arm64 contpte implementation. Link: https://lkml.kernel.org/r/20240215103205.2607016-1-ryan.roberts@arm.com Link: https://lkml.kernel.org/r/20240215103205.2607016-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 6280d7317ccae19c776a3b6cf9848c964f958091) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- include/linux/pgtable.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index aab227e12493f..743af6b1e01ba 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -229,6 +229,10 @@ static inline pte_t pte_next_pfn(pte_t pte) * @pte: Page table entry for the first page. * @nr: Number of pages to map. * + * When nr==1, initial state of pte may be present or not present, and new state + * may be present or not present. When nr>1, initial state of all ptes must be + * not present, and new state must be present. + * * May be overridden by the architecture, or the architecture can define * set_pte() and PFN_PTE_SHIFT. * From c1cab95a62c5a112f6ea483031051a3b82bdcd4d Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:49 +0000 Subject: [PATCH 024/352] mm: thp: batch-collapse PMD with set_ptes() BugLink: https://bugs.launchpad.net/bugs/2059316 Refactor __split_huge_pmd_locked() so that a present PMD can be collapsed to PTEs in a single batch using set_ptes(). This should improve performance a little bit, but the real motivation is to remove the need for the arm64 backend to have to fold the contpte entries. Instead, since the ptes are set as a batch, the contpte blocks can be initially set up pre-folded (once the arm64 contpte support is added in the next few patches). This leads to noticeable performance improvement during split. Link: https://lkml.kernel.org/r/20240215103205.2607016-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 2bdba9868a4ffcb1492db7272f34b54387910177) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- mm/huge_memory.c | 58 +++++++++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d6686dd357410..bb38fdad92c26 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2562,15 +2562,16 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte); - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { - pte_t entry; - /* - * Note that NUMA hinting access restrictions are not - * transferred to avoid any possibility of altering - * permissions across VMAs. - */ - if (freeze || pmd_migration) { + + /* + * Note that NUMA hinting access restrictions are not transferred to + * avoid any possibility of altering permissions across VMAs. + */ + if (freeze || pmd_migration) { + for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { + pte_t entry; swp_entry_t swp_entry; + if (write) swp_entry = make_writable_migration_entry( page_to_pfn(page + i)); @@ -2589,25 +2590,32 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, entry = pte_swp_mksoft_dirty(entry); if (uffd_wp) entry = pte_swp_mkuffd_wp(entry); - } else { - entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot)); - if (write) - entry = pte_mkwrite(entry, vma); - if (!young) - entry = pte_mkold(entry); - /* NOTE: this may set soft-dirty too on some archs */ - if (dirty) - entry = pte_mkdirty(entry); - if (soft_dirty) - entry = pte_mksoft_dirty(entry); - if (uffd_wp) - entry = pte_mkuffd_wp(entry); + + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + set_pte_at(mm, addr, pte + i, entry); } - VM_BUG_ON(!pte_none(ptep_get(pte))); - set_pte_at(mm, addr, pte, entry); - pte++; + } else { + pte_t entry; + + entry = mk_pte(page, READ_ONCE(vma->vm_page_prot)); + if (write) + entry = pte_mkwrite(entry, vma); + if (!young) + entry = pte_mkold(entry); + /* NOTE: this may set soft-dirty too on some archs */ + if (dirty) + entry = pte_mkdirty(entry); + if (soft_dirty) + entry = pte_mksoft_dirty(entry); + if (uffd_wp) + entry = pte_mkuffd_wp(entry); + + for (i = 0; i < HPAGE_PMD_NR; i++) + VM_WARN_ON(!pte_none(ptep_get(pte + i))); + + set_ptes(mm, haddr, pte, entry, HPAGE_PMD_NR); } - pte_unmap(pte - 1); + pte_unmap(pte); if (!pmd_migration) folio_remove_rmap_pmd(folio, page, vma); From 1b9815bb5395a8d23830d088af8d9fe77d3ca371 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:50 +0000 Subject: [PATCH 025/352] mm: introduce pte_advance_pfn() and use for pte_next_pfn() BugLink: https://bugs.launchpad.net/bugs/2059316 The goal is to be able to advance a PTE by an arbitrary number of PFNs. So introduce a new API that takes a nr param. Define the default implementation here and allow for architectures to override. pte_next_pfn() becomes a wrapper around pte_advance_pfn(). Follow up commits will convert each overriding architecture's pte_next_pfn() to pte_advance_pfn(). 
Link: https://lkml.kernel.org/r/20240215103205.2607016-4-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 583ceaaa339960e673ac0029f323bb1c6ffc96d7) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- include/linux/pgtable.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 743af6b1e01ba..71c298cee66bf 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,14 +212,17 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif - #ifndef pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#ifndef pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } #endif +#define pte_next_pfn(pte) pte_advance_pfn(pte, 1) +#endif + #ifndef set_ptes /** * set_ptes - Map consecutive pages to a contiguous range of addresses. From e42922e08099a96d9237e4f5576029bb3379530c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:51 +0000 Subject: [PATCH 026/352] arm64/mm: convert pte_next_pfn() to pte_advance_pfn() BugLink: https://bugs.launchpad.net/bugs/2059316 Core-mm needs to be able to advance the pfn by an arbitrary amount, so override the new pte_advance_pfn() API to do so. Link: https://lkml.kernel.org/r/20240215103205.2607016-5-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit c1bd2b4028ae5b4d2ada64b31c40cc44cdf00972) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 52d0b0a763f16..b6d3e9e0a9462 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -351,10 +351,10 @@ static inline pgprot_t pte_pgprot(pte_t pte) return __pgprot(pte_val(pfn_pte(pfn, __pgprot(0))) ^ pte_val(pte)); } -#define pte_next_pfn pte_next_pfn -static inline pte_t pte_next_pfn(pte_t pte) +#define pte_advance_pfn pte_advance_pfn +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { - return pfn_pte(pte_pfn(pte) + 1, pte_pgprot(pte)); + return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } static inline void set_ptes(struct mm_struct *mm, @@ -370,7 +370,7 @@ static inline void set_ptes(struct mm_struct *mm, if (--nr == 0) break; ptep++; - pte = pte_next_pfn(pte); + pte = pte_advance_pfn(pte, 1); } } #define set_ptes set_ptes From d3d480c7d1b5cce829e8fdb745a583a2b645cfb8 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:52 +0000 Subject: [PATCH 027/352] x86/mm: convert pte_next_pfn() to pte_advance_pfn() BugLink: https://bugs.launchpad.net/bugs/2059316 Core-mm needs to be able to advance the pfn by an arbitrary amount, so override the new pte_advance_pfn() API to do so. Link: https://lkml.kernel.org/r/20240215103205.2607016-6-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 506b586769ecef8c83fff64de227e7fa84b7be42) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/x86/include/asm/pgtable.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 9d077bca6a103..b60b0c897b4cd 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -956,13 +956,13 @@ static inline int pte_same(pte_t a, pte_t b) return a.pte == b.pte; } -static inline pte_t pte_next_pfn(pte_t pte) +static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { if (__pte_needs_invert(pte_val(pte))) - return __pte(pte_val(pte) - (1UL << PFN_PTE_SHIFT)); - return __pte(pte_val(pte) + (1UL << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) - (nr << PFN_PTE_SHIFT)); + return __pte(pte_val(pte) + (nr << PFN_PTE_SHIFT)); } -#define pte_next_pfn pte_next_pfn +#define pte_advance_pfn pte_advance_pfn static inline int pte_present(pte_t a) { From 6e3937be5babeff7d0eb1d2028276046e7db1d4c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:53 +0000 Subject: [PATCH 028/352] mm: tidy up pte_next_pfn() definition BugLink: https://bugs.launchpad.net/bugs/2059316 Now that the all architecture overrides of pte_next_pfn() have been replaced with pte_advance_pfn(), we can simplify the definition of the generic pte_next_pfn() macro so that it is unconditionally defined. Link: https://lkml.kernel.org/r/20240215103205.2607016-7-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit fb23bf6bd288db3187c27b971e558a3e9f70ae96) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- include/linux/pgtable.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 71c298cee66bf..5e15e6d1f71ce 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,7 +212,6 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif -#ifndef pte_next_pfn #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { @@ -221,7 +220,6 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) #endif #define pte_next_pfn(pte) pte_advance_pfn(pte, 1) -#endif #ifndef set_ptes /** From 593b9b132e7e8ca09d11596dcca13db85f7c1881 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:54 +0000 Subject: [PATCH 029/352] arm64/mm: convert READ_ONCE(*ptep) to ptep_get(ptep) BugLink: https://bugs.launchpad.net/bugs/2059316 There are a number of places in the arch code that read a pte by using the READ_ONCE() macro. Refactor these call sites to instead use the ptep_get() helper, which itself is a READ_ONCE(). Generated code should be the same. 
This will benefit us when we shortly introduce the transparent contpte support. In this case, ptep_get() will become more complex so we now have all the code abstracted through it. Link: https://lkml.kernel.org/r/20240215103205.2607016-8-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 532736558e8ef2865eae1d84b52dda4422cac810) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 12 +++++++++--- arch/arm64/kernel/efi.c | 2 +- arch/arm64/mm/fault.c | 4 ++-- arch/arm64/mm/hugetlbpage.c | 6 +++--- arch/arm64/mm/kasan_init.c | 2 +- arch/arm64/mm/mmu.c | 12 ++++++------ arch/arm64/mm/pageattr.c | 4 ++-- arch/arm64/mm/trans_pgd.c | 2 +- 8 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index b6d3e9e0a9462..de034ca40bad1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -275,6 +275,12 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} + extern void __sync_icache_dcache(pte_t pteval); bool pgattr_change_is_safe(u64 old, u64 new); @@ -302,7 +308,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = READ_ONCE(*ptep); + old_pte = ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -904,7 +910,7 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -986,7 +992,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres { pte_t old_pte, pte; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index 0228001347bea..d0e08e93b2464 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 55f6455a82843..a254761fa1bd4 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -191,7 +191,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -214,7 +214,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 
8116ac599f801..55ee0bd3f6be2 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -492,7 +492,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(READ_ONCE(*ptep))) { + if (!pte_cont(ptep_get(ptep))) { ptep_set_wrprotect(mm, addr, ptep); return; } @@ -517,7 +517,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(READ_ONCE(*ptep))) + if (!pte_cont(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -550,7 +550,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 4c7ad574b946b..c2a9f4f6c7dd0 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -113,7 +113,7 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep))); + } while (ptep++, addr = next, addr != end && pte_none(ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 1ac7467d34c9c..720fff5b35feb 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -173,7 +173,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = READ_ONCE(*ptep); + pte_t old_pte = ptep_get(ptep); set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); @@ -182,7 +182,7 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - READ_ONCE(pte_val(*ptep)))); + pte_val(ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -854,7 +854,7 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); if (pte_none(pte)) continue; @@ -987,7 +987,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = READ_ONCE(*ptep); + pte = ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1006,7 +1006,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(READ_ONCE(ptep[i]))) + if (!pte_none(ptep_get(&ptep[i]))) return; } @@ -1475,7 +1475,7 @@ pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. 
*/ - if (pte_user_exec(READ_ONCE(*ptep))) + if (pte_user_exec(ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); } return ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0a62f458c5cb0..e0e35bd942222 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -36,7 +36,7 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = READ_ONCE(*ptep); + pte_t pte = ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); @@ -242,5 +242,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(READ_ONCE(*ptep)); + return pte_valid(ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index 7b14df3c64776..f71ab4704cce7 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = READ_ONCE(*src_ptep); + pte_t pte = ptep_get(src_ptep); if (pte_valid(pte)) { /* From d5bb0a340d8a2af1617e452561ff1013461d84bb Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:55 +0000 Subject: [PATCH 030/352] arm64/mm: convert set_pte_at() to set_ptes(..., 1) BugLink: https://bugs.launchpad.net/bugs/2059316 Since set_ptes() was introduced, set_pte_at() has been implemented as a generic macro around set_ptes(..., 1). So this change should continue to generate the same code. However, making this change prepares us for the transparent contpte support. It means we can reroute set_ptes() to __set_ptes(). Since set_pte_at() is a generic macro, there will be no equivalent __set_pte_at() to reroute to. Note that a couple of calls to set_pte_at() remain in the arch code. This is intentional, since those call sites are acting on behalf of core-mm and should continue to call into the public set_ptes() rather than the arch-private __set_ptes(). Link: https://lkml.kernel.org/r/20240215103205.2607016-9-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 659e193027910a5d3083e34b488ab459d2ef5082) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 2 +- arch/arm64/kernel/mte.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/mm/fault.c | 2 +- arch/arm64/mm/hugetlbpage.c | 10 +++++----- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index de034ca40bad1..9a2df85eb493e 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1084,7 +1084,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_pte_at() function. 
+ * On AArch64, the cache coherency is handled via the set_ptes() function. */ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index a41ef3213e1e9..59bfe2e96f8f3 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_pte_at() may zero or change the tags of the + * pages is tagged, set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 7a6e47e2c6f0e..f5f48e27970d8 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1073,7 +1073,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_pte_at() in the VMM but still overriding the + * set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a254761fa1bd4..3235e23309ec9 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -205,7 +205,7 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_pte_at(), the PTE is never changed from no-exec to exec here. + * like set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. 
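[Aside on why this patch generates identical code: core-mm already defines set_pte_at() in terms of set_ptes(); roughly, paraphrased from include/linux/pgtable.h and not part of this diff:

#define set_pte_at(mm, addr, ptep, pte) set_ptes(mm, addr, ptep, pte, 1)

so every set_pte_at() call converted here was already expanding to a single-entry set_ptes() before the conversion.]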
*/ diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 55ee0bd3f6be2..bf5449e2498d7 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -254,12 +254,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_pte_at(mm, addr, ptep, pte); + set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_pte_at(mm, addr, ptep, pte); + set_ptes(mm, addr, ptep, pte, 1); return; } @@ -270,7 +270,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -478,7 +478,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -507,7 +507,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); + set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, From f96da6d4198d08f05c6c2a548e51b47bc3ee104c Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:56 +0000 Subject: [PATCH 031/352] arm64/mm: convert ptep_clear() to ptep_get_and_clear() BugLink: https://bugs.launchpad.net/bugs/2059316 ptep_clear() is a generic wrapper around the arch-implemented ptep_get_and_clear(). We are about to convert ptep_get_and_clear() into a public version and private version (__ptep_get_and_clear()) to support the transparent contpte work. We won't have a private version of ptep_clear() so let's convert it to directly call ptep_get_and_clear(). Link: https://lkml.kernel.org/r/20240215103205.2607016-10-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit cbb0294fdd72a5f63ec59fad5c0a98d63bd572fc) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index bf5449e2498d7..f93d0b7ba9e87 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -236,7 +236,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - ptep_clear(mm, addr, ptep); + ptep_get_and_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } From 2dca5e9b4c61765f3db34e23ba11c63c53d15e51 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:57 +0000 Subject: [PATCH 032/352] arm64/mm: new ptep layer to manage contig bit BugLink: https://bugs.launchpad.net/bugs/2059316 Create a new layer for the in-table PTE manipulation APIs. For now, The existing API is prefixed with double underscore to become the arch-private API and the public API is just a simple wrapper that calls the private API. The public API implementation will subsequently be used to transparently manipulate the contiguous bit where appropriate. But since there are already some contig-aware users (e.g. hugetlb, kernel mapper), we must first ensure those users use the private API directly so that the future contig-bit manipulations in the public API do not interfere with those existing uses. The following APIs are treated this way: - ptep_get - set_pte - set_ptes - pte_clear - ptep_get_and_clear - ptep_test_and_clear_young - ptep_clear_flush_young - ptep_set_wrprotect - ptep_set_access_flags Link: https://lkml.kernel.org/r/20240215103205.2607016-11-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 5a00bfd6a52cf31e93d5f1b734087deb32a3cffa) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 83 +++++++++++++++++--------------- arch/arm64/kernel/efi.c | 4 +- arch/arm64/kernel/mte.c | 2 +- arch/arm64/kvm/guest.c | 2 +- arch/arm64/mm/fault.c | 12 ++--- arch/arm64/mm/fixmap.c | 4 +- arch/arm64/mm/hugetlbpage.c | 40 +++++++-------- arch/arm64/mm/kasan_init.c | 6 +-- arch/arm64/mm/mmu.c | 14 +++--- arch/arm64/mm/pageattr.c | 6 +-- arch/arm64/mm/trans_pgd.c | 6 +-- 11 files changed, 93 insertions(+), 86 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 9a2df85eb493e..7336d40a893a8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -93,7 +93,8 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) __pte(__phys_to_pte_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot)) #define pte_none(pte) (!pte_val(pte)) -#define pte_clear(mm,addr,ptep) set_pte(ptep, __pte(0)) +#define __pte_clear(mm, addr, ptep) \ + __set_pte(ptep, __pte(0)) #define pte_page(pte) (pfn_to_page(pte_pfn(pte))) /* @@ -137,7 +138,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) * so that we don't erroneously return false for pages that have been * remapped as PROT_NONE but are yet to be flushed from the TLB. * Note that we can't make any assumptions based on the state of the access - * flag, since ptep_clear_flush_young() elides a DSB when invalidating the + * flag, since __ptep_clear_flush_young() elides a DSB when invalidating the * TLB. */ #define pte_accessible(mm, pte) \ @@ -261,7 +262,7 @@ static inline pte_t pte_mkdevmap(pte_t pte) return set_pte_bit(pte, __pgprot(PTE_DEVMAP | PTE_SPECIAL)); } -static inline void set_pte(pte_t *ptep, pte_t pte) +static inline void __set_pte(pte_t *ptep, pte_t pte) { WRITE_ONCE(*ptep, pte); @@ -275,8 +276,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } } -#define ptep_get ptep_get -static inline pte_t ptep_get(pte_t *ptep) +static inline pte_t __ptep_get(pte_t *ptep) { return READ_ONCE(*ptep); } @@ -308,7 +308,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, if (!IS_ENABLED(CONFIG_DEBUG_VM)) return; - old_pte = ptep_get(ptep); + old_pte = __ptep_get(ptep); if (!pte_valid(old_pte) || !pte_valid(pte)) return; @@ -317,7 +317,7 @@ static inline void __check_safe_pte_update(struct mm_struct *mm, pte_t *ptep, /* * Check for potential race with hardware updates of the pte - * (ptep_set_access_flags safely changes valid ptes without going + * (__ptep_set_access_flags safely changes valid ptes without going * through an invalid entry). 
*/ VM_WARN_ONCE(!pte_young(pte), @@ -363,23 +363,22 @@ static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) return pfn_pte(pte_pfn(pte) + nr, pte_pgprot(pte)); } -static inline void set_ptes(struct mm_struct *mm, - unsigned long __always_unused addr, - pte_t *ptep, pte_t pte, unsigned int nr) +static inline void __set_ptes(struct mm_struct *mm, + unsigned long __always_unused addr, + pte_t *ptep, pte_t pte, unsigned int nr) { page_table_check_ptes_set(mm, ptep, pte, nr); __sync_cache_and_tags(pte, nr); for (;;) { __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); if (--nr == 0) break; ptep++; pte = pte_advance_pfn(pte, 1); } } -#define set_ptes set_ptes /* * Huge pte definitions. @@ -546,7 +545,7 @@ static inline void __set_pte_at(struct mm_struct *mm, { __sync_cache_and_tags(pte, nr); __check_safe_pte_update(mm, ptep, pte); - set_pte(ptep, pte); + __set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, @@ -860,8 +859,7 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -extern int ptep_set_access_flags(struct vm_area_struct *vma, +extern int __ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty); @@ -871,7 +869,8 @@ static inline int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t entry, int dirty) { - return ptep_set_access_flags(vma, address, (pte_t *)pmdp, pmd_pte(entry), dirty); + return __ptep_set_access_flags(vma, address, (pte_t *)pmdp, + pmd_pte(entry), dirty); } static inline int pud_devmap(pud_t pud) @@ -905,12 +904,13 @@ static inline bool pud_user_accessible_page(pud_t pud) /* * Atomic pte/pmd modifications. 
*/ -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(pte_t *ptep) +static inline int __ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) { pte_t old_pte, pte; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_mkold(pte); @@ -921,18 +921,10 @@ static inline int __ptep_test_and_clear_young(pte_t *ptep) return pte_young(pte); } -static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, - unsigned long address, - pte_t *ptep) -{ - return __ptep_test_and_clear_young(ptep); -} - -#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -static inline int ptep_clear_flush_young(struct vm_area_struct *vma, +static inline int __ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { - int young = ptep_test_and_clear_young(vma, address, ptep); + int young = __ptep_test_and_clear_young(vma, address, ptep); if (young) { /* @@ -955,12 +947,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - return ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); + return __ptep_test_and_clear_young(vma, address, (pte_t *)pmdp); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, +static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0)); @@ -984,15 +975,15 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware + * __ptep_set_wrprotect - mark read-only while trasferring potential hardware * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. */ -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) { pte_t old_pte, pte; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); @@ -1006,7 +997,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres static inline void pmdp_set_wrprotect(struct mm_struct *mm, unsigned long address, pmd_t *pmdp) { - ptep_set_wrprotect(mm, address, (pte_t *)pmdp); + __ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #define pmdp_establish pmdp_establish @@ -1084,7 +1075,7 @@ static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio) #endif /* CONFIG_ARM64_MTE */ /* - * On AArch64, the cache coherency is handled via the set_ptes() function. + * On AArch64, the cache coherency is handled via the __set_ptes() function. 
*/ static inline void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -1136,6 +1127,22 @@ extern pte_t ptep_modify_prot_start(struct vm_area_struct *vma, extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); + +#define ptep_get __ptep_get +#define set_pte __set_pte +#define set_ptes __set_ptes +#define pte_clear __pte_clear +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +#define ptep_get_and_clear __ptep_get_and_clear +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +#define ptep_test_and_clear_young __ptep_test_and_clear_young +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +#define ptep_clear_flush_young __ptep_clear_flush_young +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +#define ptep_set_wrprotect __ptep_set_wrprotect +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +#define ptep_set_access_flags __ptep_set_access_flags + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index d0e08e93b2464..9afcc690fe73c 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -103,7 +103,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) { struct set_perm_data *spd = data; const efi_memory_desc_t *md = spd->md; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); @@ -111,7 +111,7 @@ static int __init set_permissions(pte_t *ptep, unsigned long addr, void *data) pte = set_pte_bit(pte, __pgprot(PTE_PXN)); else if (system_supports_bti_kernel() && spd->has_bti) pte = set_pte_bit(pte, __pgprot(PTE_GP)); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 59bfe2e96f8f3..dcdcccd40891c 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -67,7 +67,7 @@ int memcmp_pages(struct page *page1, struct page *page2) /* * If the page content is identical but at least one of the pages is * tagged, return non-zero to avoid KSM merging. If only one of the - * pages is tagged, set_ptes() may zero or change the tags of the + * pages is tagged, __set_ptes() may zero or change the tags of the * other page via mte_sync_tags(). */ if (page_mte_tagged(page1) || page_mte_tagged(page2)) diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index f5f48e27970d8..5f75b7effb8ee 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -1073,7 +1073,7 @@ int kvm_vm_ioctl_mte_copy_tags(struct kvm *kvm, } else { /* * Only locking to serialise with a concurrent - * set_ptes() in the VMM but still overriding the + * __set_ptes() in the VMM but still overriding the * tags, hence ignoring the return value. */ try_page_mte_tagging(page); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 3235e23309ec9..9a1c66183d168 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -191,7 +191,7 @@ static void show_pte(unsigned long addr) if (!ptep) break; - pte = ptep_get(ptep); + pte = __ptep_get(ptep); pr_cont(", pte=%016llx", pte_val(pte)); pte_unmap(ptep); } while(0); @@ -205,16 +205,16 @@ static void show_pte(unsigned long addr) * * It needs to cope with hardware update of the accessed/dirty state by other * agents in the system and can safely skip the __sync_icache_dcache() call as, - * like set_ptes(), the PTE is never changed from no-exec to exec here. 
+ * like __set_ptes(), the PTE is never changed from no-exec to exec here. * * Returns whether or not the PTE actually changed. */ -int ptep_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, - pte_t entry, int dirty) +int __ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) { pteval_t old_pteval, pteval; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_same(pte, entry)) return 0; diff --git a/arch/arm64/mm/fixmap.c b/arch/arm64/mm/fixmap.c index c0a3301203bdf..bfc02568805ae 100644 --- a/arch/arm64/mm/fixmap.c +++ b/arch/arm64/mm/fixmap.c @@ -121,9 +121,9 @@ void __set_fixmap(enum fixed_addresses idx, ptep = fixmap_pte(addr); if (pgprot_val(flags)) { - set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); + __set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); } else { - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index f93d0b7ba9e87..c3db949560f91 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -152,14 +152,14 @@ pte_t huge_ptep_get(pte_t *ptep) { int ncontig, i; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if (!pte_present(orig_pte) || !pte_cont(orig_pte)) return orig_pte; ncontig = num_contig_ptes(page_size(pte_page(orig_pte)), &pgsize); for (i = 0; i < ncontig; i++, ptep++) { - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); if (pte_dirty(pte)) orig_pte = pte_mkdirty(orig_pte); @@ -184,11 +184,11 @@ static pte_t get_clear_contig(struct mm_struct *mm, unsigned long pgsize, unsigned long ncontig) { - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); unsigned long i; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) { - pte_t pte = ptep_get_and_clear(mm, addr, ptep); + pte_t pte = __ptep_get_and_clear(mm, addr, ptep); /* * If HW_AFDBM is enabled, then the HW could turn on @@ -236,7 +236,7 @@ static void clear_flush(struct mm_struct *mm, unsigned long i, saddr = addr; for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - ptep_get_and_clear(mm, addr, ptep); + __ptep_get_and_clear(mm, addr, ptep); flush_tlb_range(&vma, saddr, addr); } @@ -254,12 +254,12 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, if (!pte_present(pte)) { for (i = 0; i < ncontig; i++, ptep++, addr += pgsize) - set_ptes(mm, addr, ptep, pte, 1); + __set_ptes(mm, addr, ptep, pte, 1); return; } if (!pte_cont(pte)) { - set_ptes(mm, addr, ptep, pte, 1); + __set_ptes(mm, addr, ptep, pte, 1); return; } @@ -270,7 +270,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, @@ -400,7 +400,7 @@ void huge_pte_clear(struct mm_struct *mm, unsigned long addr, ncontig = num_contig_ptes(sz, &pgsize); for (i = 0; i < ncontig; i++, addr += pgsize, ptep++) - pte_clear(mm, addr, ptep); + __pte_clear(mm, addr, ptep); } pte_t huge_ptep_get_and_clear(struct mm_struct *mm, @@ -408,10 +408,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, { int ncontig; size_t pgsize; - pte_t orig_pte = ptep_get(ptep); + pte_t orig_pte = __ptep_get(ptep); if 
(!pte_cont(orig_pte)) - return ptep_get_and_clear(mm, addr, ptep); + return __ptep_get_and_clear(mm, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -431,11 +431,11 @@ static int __cont_access_flags_changed(pte_t *ptep, pte_t pte, int ncontig) { int i; - if (pte_write(pte) != pte_write(ptep_get(ptep))) + if (pte_write(pte) != pte_write(__ptep_get(ptep))) return 1; for (i = 0; i < ncontig; i++) { - pte_t orig_pte = ptep_get(ptep + i); + pte_t orig_pte = __ptep_get(ptep + i); if (pte_dirty(pte) != pte_dirty(orig_pte)) return 1; @@ -459,7 +459,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pte_t orig_pte; if (!pte_cont(pte)) - return ptep_set_access_flags(vma, addr, ptep, pte, dirty); + return __ptep_set_access_flags(vma, addr, ptep, pte, dirty); ncontig = find_num_contig(mm, addr, ptep, &pgsize); dpfn = pgsize >> PAGE_SHIFT; @@ -478,7 +478,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, hugeprot = pte_pgprot(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); return 1; } @@ -492,8 +492,8 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(ptep_get(ptep))) { - ptep_set_wrprotect(mm, addr, ptep); + if (!pte_cont(__ptep_get(ptep))) { + __ptep_set_wrprotect(mm, addr, ptep); return; } @@ -507,7 +507,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, pfn = pte_pfn(pte); for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) - set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); + __set_ptes(mm, addr, ptep, pfn_pte(pfn, hugeprot), 1); } pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, @@ -517,7 +517,7 @@ pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(ptep_get(ptep))) + if (!pte_cont(__ptep_get(ptep))) return ptep_clear_flush(vma, addr, ptep); ncontig = find_num_contig(mm, addr, ptep, &pgsize); @@ -550,7 +550,7 @@ pte_t huge_ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr * when the permission changes from executable to non-executable * in cases where cpu is affected with errata #2645198. */ - if (pte_user_exec(ptep_get(ptep))) + if (pte_user_exec(__ptep_get(ptep))) return huge_ptep_clear_flush(vma, addr, ptep); } return huge_ptep_get_and_clear(vma->vm_mm, addr, ptep); diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index c2a9f4f6c7dd0..9ee16cfce587f 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -112,8 +112,8 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, if (!early) memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE); next = addr + PAGE_SIZE; - set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (ptep++, addr = next, addr != end && pte_none(ptep_get(ptep))); + __set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); + } while (ptep++, addr = next, addr != end && pte_none(__ptep_get(ptep))); } static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, @@ -271,7 +271,7 @@ static void __init kasan_init_shadow(void) * so we should make sure that it maps the zero page read-only. 
*/ for (i = 0; i < PTRS_PER_PTE; i++) - set_pte(&kasan_early_shadow_pte[i], + __set_pte(&kasan_early_shadow_pte[i], pfn_pte(sym_to_pfn(kasan_early_shadow_page), PAGE_KERNEL_RO)); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 720fff5b35feb..104bfcdcd43ef 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -173,16 +173,16 @@ static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = ptep_get(ptep); + pte_t old_pte = __ptep_get(ptep); - set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); + __set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), - pte_val(ptep_get(ptep)))); + pte_val(__ptep_get(ptep)))); phys += PAGE_SIZE; } while (ptep++, addr += PAGE_SIZE, addr != end); @@ -854,12 +854,12 @@ static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = ptep_get(ptep); + pte = __ptep_get(ptep); if (pte_none(pte)) continue; WARN_ON(!pte_present(pte)); - pte_clear(&init_mm, addr, ptep); + __pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr + PAGE_SIZE); if (free_mapped) free_hotplug_page_range(pte_page(pte), @@ -987,7 +987,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, do { ptep = pte_offset_kernel(pmdp, addr); - pte = ptep_get(ptep); + pte = __ptep_get(ptep); /* * This is just a sanity check here which verifies that @@ -1006,7 +1006,7 @@ static void free_empty_pte_table(pmd_t *pmdp, unsigned long addr, */ ptep = pte_offset_kernel(pmdp, 0UL); for (i = 0; i < PTRS_PER_PTE; i++) { - if (!pte_none(ptep_get(&ptep[i]))) + if (!pte_none(__ptep_get(&ptep[i]))) return; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index e0e35bd942222..0e270a1c51e64 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -36,12 +36,12 @@ bool can_set_direct_map(void) static int change_page_range(pte_t *ptep, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = ptep_get(ptep); + pte_t pte = __ptep_get(ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); - set_pte(ptep, pte); + __set_pte(ptep, pte); return 0; } @@ -242,5 +242,5 @@ bool kernel_page_present(struct page *page) return true; ptep = pte_offset_kernel(pmdp, addr); - return pte_valid(ptep_get(ptep)); + return pte_valid(__ptep_get(ptep)); } diff --git a/arch/arm64/mm/trans_pgd.c b/arch/arm64/mm/trans_pgd.c index f71ab4704cce7..5139a28130c08 100644 --- a/arch/arm64/mm/trans_pgd.c +++ b/arch/arm64/mm/trans_pgd.c @@ -33,7 +33,7 @@ static void *trans_alloc(struct trans_pgd_info *info) static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = ptep_get(src_ptep); + pte_t pte = __ptep_get(src_ptep); if (pte_valid(pte)) { /* @@ -41,7 +41,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. 
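[Why the conversions above matter can be seen from a hedged sketch of the two call paths; pte_mkcont()/pte_mknoncont() set and clear PTE_CONT, and the stripping behaviour lands with the contpte patch later in this series:

/* hugetlb manages the contiguous bit itself, so it writes verbatim: */
__set_ptes(mm, addr, ptep, pte_mkcont(pte), 1);	/* PTE_CONT preserved */

/* the contpte-aware public API strips the bit and decides folding on
 * its own, which would silently undo an explicitly built contig entry: */
set_ptes(mm, addr, ptep, pte_mkcont(pte), 1);	/* PTE_CONT cleared */]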
*/ - set_pte(dst_ptep, pte_mkwrite_novma(pte)); + __set_pte(dst_ptep, pte_mkwrite_novma(pte)); } else if ((debug_pagealloc_enabled() || is_kfence_address((void *)addr)) && !pte_none(pte)) { /* @@ -55,7 +55,7 @@ static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); + __set_pte(dst_ptep, pte_mkpresent(pte_mkwrite_novma(pte))); } } From d1d4c487ade80a17498907c173d3d26e9c8ee535 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:58 +0000 Subject: [PATCH 033/352] arm64/mm: split __flush_tlb_range() to elide trailing DSB BugLink: https://bugs.launchpad.net/bugs/2059316 Split __flush_tlb_range() into __flush_tlb_range_nosync() + __flush_tlb_range(), in the same way as the existing flush_tlb_page() arrangement. This allows calling __flush_tlb_range_nosync() to elide the trailing DSB. Forthcoming "contpte" code will take advantage of this when clearing the young bit from a contiguous range of ptes. Ordering between dsb and mmu_notifier_arch_invalidate_secondary_tlbs() has changed, but now aligns with the ordering of __flush_tlb_page(). It has been discussed that __flush_tlb_page() may be wrong though. Regardless, both will be resolved separately if needed. Link: https://lkml.kernel.org/r/20240215103205.2607016-12-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Reviewed-by: David Hildenbrand Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit d9d8dc2bd3fb2689309f704fe85e6dde2b1bd73a) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/tlbflush.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h index bfeb54f3a971f..a75de2665d844 100644 --- a/arch/arm64/include/asm/tlbflush.h +++ b/arch/arm64/include/asm/tlbflush.h @@ -424,7 +424,7 @@ do { \ #define __flush_s2_tlb_range_op(op, start, pages, stride, tlb_level) \ __flush_tlb_range_op(op, start, pages, stride, 0, tlb_level, false, kvm_lpa2_is_enabled()); -static inline void __flush_tlb_range(struct vm_area_struct *vma, +static inline void __flush_tlb_range_nosync(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long stride, bool last_level, int tlb_level) @@ -458,10 +458,19 @@ static inline void __flush_tlb_range(struct vm_area_struct *vma, __flush_tlb_range_op(vae1is, start, pages, stride, asid, tlb_level, true, lpa2_is_enabled()); - dsb(ish); mmu_notifier_arch_invalidate_secondary_tlbs(vma->vm_mm, start, end); } +static inline void __flush_tlb_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long stride, bool last_level, + int tlb_level) +{ + __flush_tlb_range_nosync(vma, start, end, stride, + last_level, tlb_level); + dsb(ish); +} + static inline void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { From 8021db76fba846eb7283bdce1906383b21339b0c Mon Sep 17 00:00:00 2001 From: dann frazier Date: Wed, 27 Mar 2024 14:32:55 -0600 Subject: [PATCH
034/352] NVIDIA: [Config] arm64: ARM64_CONTPTE=y BugLink: https://bugs.launchpad.net/bugs/2059316 Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- debian.nvidia-adv/config/annotations | 3 +++ 1 file changed, 3 insertions(+) diff --git a/debian.nvidia-adv/config/annotations b/debian.nvidia-adv/config/annotations index 7d9fd0b373b69..729639479558d 100644 --- a/debian.nvidia-adv/config/annotations +++ b/debian.nvidia-adv/config/annotations @@ -9,6 +9,9 @@ include "../../debian.master/config/annotations" CONFIG_AAEON_IWMI_WDT policy<{'amd64': '-'}> CONFIG_AAEON_IWMI_WDT note<'{Disable all Ubuntu ODM drivers}'> +CONFIG_ARM64_CONTPTE policy<{'arm64': 'y'}> +CONFIG_ARM64_CONTPTE note<'LP: #2059316'> + CONFIG_ARM64_ERRATUM_1902691 policy<{'arm64': 'y'}> CONFIG_ARM64_ERRATUM_1902691 note<'{Required for Grace enablement}'> From 833a6db559d2b7e44a89959c12f6bc2dbc31ff05 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:31:59 +0000 Subject: [PATCH 035/352] arm64/mm: wire up PTE_CONT for user mappings BugLink: https://bugs.launchpad.net/bugs/2059316 With the ptep API sufficiently refactored, we can now introduce a new "contpte" API layer, which transparently manages the PTE_CONT bit for user mappings. In this initial implementation, only suitable batches of PTEs, set via set_ptes(), are mapped with the PTE_CONT bit. Any subsequent modification of individual PTEs will cause an "unfold" operation to repaint the contpte block as individual PTEs before performing the requested operation. While a modification of a single PTE could cause the block of PTEs to which it belongs to become eligible for "folding" into a contpte entry, "folding" is not performed in this initial implementation due to the costs of checking the requirements are met. Due to this, contpte mappings will degrade back to normal pte mappings over time if/when protections are changed. This will be solved in a future patch. Since a contpte block only has a single access and dirty bit, the semantic here changes slightly; when getting a pte (e.g. ptep_get()) that is part of a contpte mapping, the access and dirty information are pulled from the block (so all ptes in the block return the same access/dirty info). When changing the access/dirty info on a pte (e.g. ptep_set_access_flags()) that is part of a contpte mapping, this change will affect the whole contpte block. This works fine in practice since we guarantee that only a single folio is mapped by a contpte block, and the core-mm tracks access/dirty information per folio. In order for the public functions, which used to be pure inline, to continue to be callable by modules, export all the contpte_* symbols that are now called by those public inline functions. The feature is enabled/disabled with the ARM64_CONTPTE Kconfig parameter at build time. It defaults to enabled as long as its dependency, TRANSPARENT_HUGEPAGE, is also enabled. The core-mm depends upon TRANSPARENT_HUGEPAGE to be able to allocate large folios, so if it's not enabled, then there is no chance of meeting the physical contiguity requirement for contpte mappings. Link: https://lkml.kernel.org/r/20240215103205.2607016-13-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Ard Biesheuvel Tested-by: John Hubbard Acked-by: Mark Rutland Reviewed-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H.
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 4602e5757bcceb231c3a13c36c373ad4a750eddb) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/Kconfig | 9 + arch/arm64/include/asm/pgtable.h | 167 ++++++++++++++++++ arch/arm64/mm/Makefile | 1 + arch/arm64/mm/contpte.c | 285 +++++++++++++++++++++++++++++++ include/linux/efi.h | 5 + 5 files changed, 467 insertions(+) create mode 100644 arch/arm64/mm/contpte.c diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index dca14c61af975..401c4ac267649 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -2229,6 +2229,15 @@ config UNWIND_PATCH_PAC_INTO_SCS select UNWIND_TABLES select DYNAMIC_SCS +config ARM64_CONTPTE + bool "Contiguous PTE mappings for user memory" if EXPERT + depends on TRANSPARENT_HUGEPAGE + default y + help + When enabled, user mappings are configured using the PTE contiguous + bit, for any mappings that meet the size and alignment requirements. + This reduces TLB pressure and improves performance. + endmenu # "Kernel Features" menu "Boot options" diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7336d40a893a8..831099cfc96bd 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -133,6 +133,10 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys) */ #define pte_valid_not_user(pte) \ ((pte_val(pte) & (PTE_VALID | PTE_USER | PTE_UXN)) == (PTE_VALID | PTE_UXN)) +/* + * Returns true if the pte is valid and has the contiguous bit set. + */ +#define pte_valid_cont(pte) (pte_valid(pte) && pte_cont(pte)) /* * Could the pte be present in the TLB? We must check mm_tlb_flush_pending * so that we don't erroneously return false for pages that have been @@ -1128,6 +1132,167 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t new_pte); +#ifdef CONFIG_ARM64_CONTPTE + +/* + * The contpte APIs are used to transparently manage the contiguous bit in ptes + * where it is possible and makes sense to do so. The PTE_CONT bit is considered + * a private implementation detail of the public ptep API (see below). + */ +extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); +extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); +extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr); +extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep); +extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty); + +static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + if (unlikely(pte_valid_cont(pte))) + __contpte_try_unfold(mm, addr, ptep, pte); +} + +/* + * The below functions constitute the public API that arm64 presents to the + * core-mm to manipulate PTE entries within their page tables (or at least this + * is the subset of the API that arm64 needs to implement). 
These public + * versions will automatically and transparently apply the contiguous bit where + * it makes sense to do so. Therefore any users that are contig-aware (e.g. + * hugetlb, kernel mapper) should NOT use these APIs, but instead use the + * private versions, which are prefixed with double underscore. All of these + * APIs except for ptep_get_lockless() are expected to be called with the PTL + * held. Although the contiguous bit is considered private to the + * implementation, it is deliberately allowed to leak through the getters (e.g. + * ptep_get()), back to core code. This is required so that pte_leaf_size() can + * provide an accurate size for perf_get_pgtable_size(). But this leakage means + * its possible a pte will be passed to a setter with the contiguous bit set, so + * we explicitly clear the contiguous bit in those cases to prevent accidentally + * setting it in the pgtable. + */ + +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get(ptep, pte); +} + +#define ptep_get_lockless ptep_get_lockless +static inline pte_t ptep_get_lockless(pte_t *ptep) +{ + pte_t pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(pte))) + return pte; + + return contpte_ptep_get_lockless(ptep); +} + +static inline void set_pte(pte_t *ptep, pte_t pte) +{ + /* + * We don't have the mm or vaddr so cannot unfold contig entries (since + * it requires tlb maintenance). set_pte() is not used in core code, so + * this should never even be called. Regardless do our best to service + * any call and emit a warning if there is any attempt to set a pte on + * top of an existing contig range. + */ + pte_t orig_pte = __ptep_get(ptep); + + WARN_ON_ONCE(pte_valid_cont(orig_pte)); + __set_pte(ptep, pte_mknoncont(pte)); +} + +#define set_ptes set_ptes +static inline void set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + pte = pte_mknoncont(pte); + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __set_ptes(mm, addr, ptep, pte, 1); + } else { + contpte_set_ptes(mm, addr, ptep, pte, nr); + } +} + +static inline void pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __pte_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + return __ptep_get_and_clear(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_test_and_clear_young(vma, addr, ptep); + + return contpte_ptep_test_and_clear_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +static inline int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + pte_t orig_pte = __ptep_get(ptep); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_clear_flush_young(vma, addr, ptep); + + return contpte_ptep_clear_flush_young(vma, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +static inline void ptep_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pte_t *ptep) +{ + contpte_try_unfold(mm, addr, ptep, 
__ptep_get(ptep)); + __ptep_set_wrprotect(mm, addr, ptep); +} + +#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +static inline int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + pte_t orig_pte = __ptep_get(ptep); + + entry = pte_mknoncont(entry); + + if (likely(!pte_valid_cont(orig_pte))) + return __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + + return contpte_ptep_set_access_flags(vma, addr, ptep, entry, dirty); +} + +#else /* CONFIG_ARM64_CONTPTE */ + #define ptep_get __ptep_get #define set_pte __set_pte #define set_ptes __set_ptes @@ -1143,6 +1308,8 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags __ptep_set_access_flags +#endif /* CONFIG_ARM64_CONTPTE */ + #endif /* !__ASSEMBLY__ */ #endif /* __ASM_PGTABLE_H */ diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index dbd1bc95967d0..60454256945b8 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,6 +3,7 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ cache.o copypage.o flush.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o fixmap.o +obj-$(CONFIG_ARM64_CONTPTE) += contpte.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_PTDUMP_CORE) += ptdump.o obj-$(CONFIG_PTDUMP_DEBUGFS) += ptdump_debugfs.o diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c new file mode 100644 index 0000000000000..6d7f40667fa23 --- /dev/null +++ b/arch/arm64/mm/contpte.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2023 ARM Ltd. + */ + +#include +#include +#include +#include + +static inline bool mm_is_user(struct mm_struct *mm) +{ + /* + * Don't attempt to apply the contig bit to kernel mappings, because + * dynamically adding/removing the contig bit can cause page faults. + * These racing faults are ok for user space, since they get serialized + * on the PTL. But kernel mappings can't tolerate faults. + */ + if (unlikely(mm_is_efi(mm))) + return false; + return mm != &init_mm; +} + +static inline pte_t *contpte_align_down(pte_t *ptep) +{ + return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); +} + +static void contpte_convert(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0); + unsigned long start_addr; + pte_t *start_ptep; + int i; + + start_ptep = ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + pte = pfn_pte(ALIGN_DOWN(pte_pfn(pte), CONT_PTES), pte_pgprot(pte)); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) { + pte_t ptent = __ptep_get_and_clear(mm, addr, ptep); + + if (pte_dirty(ptent)) + pte = pte_mkdirty(pte); + + if (pte_young(ptent)) + pte = pte_mkyoung(pte); + } + + __flush_tlb_range(&vma, start_addr, addr, PAGE_SIZE, true, 3); + + __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); +} + +void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the ptes are contiguous in + * contpte_try_unfold(), so just check that the mm is user space. + */ + if (!mm_is_user(mm)) + return; + + pte = pte_mknoncont(pte); + contpte_convert(mm, addr, ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_unfold); + +pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. 
We are guaranteed to be holding the PTL, so any + * contiguous range cannot be unfolded or otherwise modified under our + * feet. + */ + + pte_t pte; + int i; + + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++) { + pte = __ptep_get(ptep); + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get); + +pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) +{ + /* + * Gather access/dirty bits, which may be populated in any of the ptes + * of the contig range. We may not be holding the PTL, so any contiguous + * range may be unfolded/modified/refolded under our feet. Therefore we + * ensure we read a _consistent_ contpte range by checking that all ptes + * in the range are valid and have CONT_PTE set, that all pfns are + * contiguous and that all pgprots are the same (ignoring access/dirty). + * If we find a pte that is not consistent, then we must be racing with + * an update so start again. If the target pte does not have CONT_PTE + * set then that is considered consistent on its own because it is not + * part of a contpte range. + */ + + pgprot_t orig_prot; + unsigned long pfn; + pte_t orig_pte; + pgprot_t prot; + pte_t *ptep; + pte_t pte; + int i; + +retry: + orig_pte = __ptep_get(orig_ptep); + + if (!pte_valid_cont(orig_pte)) + return orig_pte; + + orig_prot = pte_pgprot(pte_mkold(pte_mkclean(orig_pte))); + ptep = contpte_align_down(orig_ptep); + pfn = pte_pfn(orig_pte) - (orig_ptep - ptep); + + for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) { + pte = __ptep_get(ptep); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + + if (!pte_valid_cont(pte) || + pte_pfn(pte) != pfn || + pgprot_val(prot) != pgprot_val(orig_prot)) + goto retry; + + if (pte_dirty(pte)) + orig_pte = pte_mkdirty(orig_pte); + + if (pte_young(pte)) + orig_pte = pte_mkyoung(orig_pte); + } + + return orig_pte; +} +EXPORT_SYMBOL(contpte_ptep_get_lockless); + +void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte, unsigned int nr) +{ + unsigned long next; + unsigned long end; + unsigned long pfn; + pgprot_t prot; + + /* + * The set_ptes() spec guarantees that when nr > 1, the initial state of + * all ptes is not-present. Therefore we never need to unfold or + * otherwise invalidate a range before we set the new ptes. + * contpte_set_ptes() should never be called for nr < 2. + */ + VM_WARN_ON(nr == 1); + + if (!mm_is_user(mm)) + return __set_ptes(mm, addr, ptep, pte, nr); + + end = addr + (nr << PAGE_SHIFT); + pfn = pte_pfn(pte); + prot = pte_pgprot(pte); + + do { + next = pte_cont_addr_end(addr, end); + nr = (next - addr) >> PAGE_SHIFT; + pte = pfn_pte(pfn, prot); + + if (((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0) + pte = pte_mkcont(pte); + else + pte = pte_mknoncont(pte); + + __set_ptes(mm, addr, ptep, pte, nr); + + addr = next; + ptep += nr; + pfn += nr; + + } while (addr != end); +} +EXPORT_SYMBOL(contpte_set_ptes); + +int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + /* + * ptep_clear_flush_young() technically requires us to clear the access + * flag for a _single_ pte. However, the core-mm code actually tracks + * access/dirty per folio, not per page. And since we only create a + * contig range when the range is covered by a single folio, we can get + * away with clearing young for the whole contig range here, so we avoid + * having to unfold. 
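[To make the folding test in contpte_set_ptes() above concrete, a small numeric check, again assuming 4K pages so that ~CONT_PTE_MASK == 0xffff:

/* addr == 0x10000, next == 0x20000, pfn << PAGE_SHIFT == 0x40000: all
 * three are 64K-aligned, the OR leaves the low 16 bits clear, and the
 * span is written with PTE_CONT set. Nudge any one of them by a page
 * (say addr == 0x11000) and the test fails, so plain ptes are used. */
bool can_fold = ((addr | next | (pfn << PAGE_SHIFT)) & ~CONT_PTE_MASK) == 0;]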
+ */ + + int young = 0; + int i; + + ptep = contpte_align_down(ptep); + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + young |= __ptep_test_and_clear_young(vma, addr, ptep); + + return young; +} +EXPORT_SYMBOL(contpte_ptep_test_and_clear_young); + +int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + int young; + + young = contpte_ptep_test_and_clear_young(vma, addr, ptep); + + if (young) { + /* + * See comment in __ptep_clear_flush_young(); same rationale for + * eliding the trailing DSB applies here. + */ + addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + __flush_tlb_range_nosync(vma, addr, addr + CONT_PTE_SIZE, + PAGE_SIZE, true, 3); + } + + return young; +} +EXPORT_SYMBOL(contpte_ptep_clear_flush_young); + +int contpte_ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t entry, int dirty) +{ + unsigned long start_addr; + pte_t orig_pte; + int i; + + /* + * Gather the access/dirty bits for the contiguous range. If nothing has + * changed, its a noop. + */ + orig_pte = pte_mknoncont(ptep_get(ptep)); + if (pte_val(orig_pte) == pte_val(entry)) + return 0; + + /* + * We can fix up access/dirty bits without having to unfold the contig + * range. But if the write bit is changing, we must unfold. + */ + if (pte_write(orig_pte) == pte_write(entry)) { + /* + * For HW access management, we technically only need to update + * the flag on a single pte in the range. But for SW access + * management, we need to update all the ptes to prevent extra + * faults. Avoid per-page tlb flush in __ptep_set_access_flags() + * and instead flush the whole range at the end. + */ + ptep = contpte_align_down(ptep); + start_addr = addr = ALIGN_DOWN(addr, CONT_PTE_SIZE); + + for (i = 0; i < CONT_PTES; i++, ptep++, addr += PAGE_SIZE) + __ptep_set_access_flags(vma, addr, ptep, entry, 0); + + if (dirty) + __flush_tlb_range(vma, start_addr, addr, + PAGE_SIZE, true, 3); + } else { + __contpte_try_unfold(vma->vm_mm, addr, ptep, orig_pte); + __ptep_set_access_flags(vma, addr, ptep, entry, dirty); + } + + return 1; +} +EXPORT_SYMBOL(contpte_ptep_set_access_flags); diff --git a/include/linux/efi.h b/include/linux/efi.h index 02224080917f3..bd9bb4db314a2 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -696,6 +696,11 @@ extern struct efi { extern struct mm_struct efi_mm; +static inline bool mm_is_efi(struct mm_struct *mm) +{ + return IS_ENABLED(CONFIG_EFI) && mm == &efi_mm; +} + static inline int efi_guidcmp (efi_guid_t left, efi_guid_t right) { From 35fc7ca4b38f7b35cee260e80dd0daa9624099b5 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:00 +0000 Subject: [PATCH 036/352] arm64/mm: implement new wrprotect_ptes() batch API BugLink: https://bugs.launchpad.net/bugs/2059316 Optimize the contpte implementation to fix some of the fork performance regression introduced by the initial contpte commit. Subsequent patches will solve it entirely. During fork(), any private memory in the parent must be write-protected. Previously this was done 1 PTE at a time. But the core-mm supports batched wrprotect via the new wrprotect_ptes() API. So let's implement that API and for fully covered contpte mappings, we no longer need to unfold the contpte. This has 2 benefits: - reduced unfolding, reduces the number of tlbis that must be issued. 
- The memory remains contpte-mapped ("folded") in the parent, so it continues to benefit from the more efficient use of the TLB after the fork. The optimization to wrprotect a whole contpte block without unfolding is possible thanks to the tightening of the Arm ARM in respect to the definition and behaviour when 'Misprogramming the Contiguous bit'. See section D21194 at https://developer.arm.com/documentation/102105/ja-07/ Link: https://lkml.kernel.org/r/20240215103205.2607016-14-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 311a6cf29690bb8295327bad0e76e0ad48cadcc4) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 61 ++++++++++++++++++++++++++------ arch/arm64/mm/contpte.c | 38 ++++++++++++++++++++ 2 files changed, 89 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 831099cfc96bd..8643227c318bf 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -978,16 +978,12 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -/* - * __ptep_set_wrprotect - mark read-only while trasferring potential hardware - * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. - */ -static inline void __ptep_set_wrprotect(struct mm_struct *mm, - unsigned long address, pte_t *ptep) +static inline void ___ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep, + pte_t pte) { - pte_t old_pte, pte; + pte_t old_pte; - pte = __ptep_get(ptep); do { old_pte = pte; pte = pte_wrprotect(pte); @@ -996,6 +992,25 @@ static inline void __ptep_set_wrprotect(struct mm_struct *mm, } while (pte_val(pte) != pte_val(old_pte)); } +/* + * __ptep_set_wrprotect - mark read-only while trasferring potential hardware + * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. 
+ */ +static inline void __ptep_set_wrprotect(struct mm_struct *mm, + unsigned long address, pte_t *ptep) +{ + ___ptep_set_wrprotect(mm, address, ptep, __ptep_get(ptep)); +} + +static inline void __wrprotect_ptes(struct mm_struct *mm, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + unsigned int i; + + for (i = 0; i < nr; i++, address += PAGE_SIZE, ptep++) + __ptep_set_wrprotect(mm, address, ptep); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_SET_WRPROTECT static inline void pmdp_set_wrprotect(struct mm_struct *mm, @@ -1149,6 +1164,8 @@ extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); +extern void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr); extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); @@ -1268,12 +1285,35 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, return contpte_ptep_clear_flush_young(vma, addr, ptep); } +#define wrprotect_ptes wrprotect_ptes +static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + if (likely(nr == 1)) { + /* + * Optimization: wrprotect_ptes() can only be called for present + * ptes so we only need to check contig bit as condition for + * unfold, and we can remove the contig bit from the pte we read + * to avoid re-reading. This speeds up fork() which is sensitive + * for order-0 folios. Equivalent to contpte_try_unfold(). + */ + pte_t orig_pte = __ptep_get(ptep); + + if (unlikely(pte_cont(orig_pte))) { + __contpte_try_unfold(mm, addr, ptep, orig_pte); + orig_pte = pte_mknoncont(orig_pte); + } + ___ptep_set_wrprotect(mm, addr, ptep, orig_pte); + } else { + contpte_wrprotect_ptes(mm, addr, ptep, nr); + } +} + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); - __ptep_set_wrprotect(mm, addr, ptep); + wrprotect_ptes(mm, addr, ptep, 1); } #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS @@ -1305,6 +1345,7 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, #define ptep_clear_flush_young __ptep_clear_flush_young #define __HAVE_ARCH_PTEP_SET_WRPROTECT #define ptep_set_wrprotect __ptep_set_wrprotect +#define wrprotect_ptes __wrprotect_ptes #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS #define ptep_set_access_flags __ptep_set_access_flags diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 6d7f40667fa23..bedb585245356 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -26,6 +26,26 @@ static inline pte_t *contpte_align_down(pte_t *ptep) return PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES); } +static void contpte_try_unfold_partial(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * Unfold any partially covered contpte block at the beginning and end + * of the range. 
+ */ + + if (ptep != contpte_align_down(ptep) || nr < CONT_PTES) + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + + if (ptep + nr != contpte_align_down(ptep + nr)) { + unsigned long last_addr = addr + PAGE_SIZE * (nr - 1); + pte_t *last_ptep = ptep + nr - 1; + + contpte_try_unfold(mm, last_addr, last_ptep, + __ptep_get(last_ptep)); + } +} + static void contpte_convert(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -238,6 +258,24 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, } EXPORT_SYMBOL(contpte_ptep_clear_flush_young); +void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr) +{ + /* + * If wrprotecting an entire contig range, we can avoid unfolding. Just + * set wrprotect and wait for the later mmu_gather flush to invalidate + * the tlb. Until the flush, the page may or may not be wrprotected. + * After the flush, it is guaranteed wrprotected. If it's a partial + * range though, we must unfold, because we can't have a case where + * CONT_PTE is set but wrprotect applies to a subset of the PTEs; this + * would cause it to continue to be unpredictable after the flush. + */ + + contpte_try_unfold_partial(mm, addr, ptep, nr); + __wrprotect_ptes(mm, addr, ptep, nr); +} +EXPORT_SYMBOL(contpte_wrprotect_ptes); + int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty) From 8977bc8163e0313465460ef5a4e282b9d7382ba2 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:01 +0000 Subject: [PATCH 037/352] arm64/mm: implement new [get_and_]clear_full_ptes() batch APIs BugLink: https://bugs.launchpad.net/bugs/2059316 Optimize the contpte implementation to fix some of the exit/munmap/dontneed performance regression introduced by the initial contpte commit. Subsequent patches will solve it entirely. During exit(), munmap() or madvise(MADV_DONTNEED), mappings must be cleared. Previously this was done 1 PTE at a time. But the core-mm supports batched clear via the new [get_and_]clear_full_ptes() APIs. So let's implement those APIs and for fully covered contpte mappings, we no longer need to unfold the contpte. This significantly reduces unfolding operations, reducing the number of tlbis that must be issued. Link: https://lkml.kernel.org/r/20240215103205.2607016-15-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Tested-by: John Hubbard Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit 6b1e4efb6f5499ae8f9f5cdda7502285a0edbf51) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 67 ++++++++++++++++++++++++++++++++ arch/arm64/mm/contpte.c | 17 ++++++++ 2 files changed, 84 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8643227c318bf..a8f1a35e30867 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -965,6 +965,37 @@ static inline pte_t __ptep_get_and_clear(struct mm_struct *mm, return pte; } +static inline void __clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + for (;;) { + __ptep_get_and_clear(mm, addr, ptep); + if (--nr == 0) + break; + ptep++; + addr += PAGE_SIZE; + } +} + +static inline pte_t __get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte, tmp_pte; + + pte = __ptep_get_and_clear(mm, addr, ptep); + while (--nr) { + ptep++; + addr += PAGE_SIZE; + tmp_pte = __ptep_get_and_clear(mm, addr, ptep); + if (pte_dirty(tmp_pte)) + pte = pte_mkdirty(pte); + if (pte_young(tmp_pte)) + pte = pte_mkyoung(pte); + } + return pte; +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, @@ -1160,6 +1191,11 @@ extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); extern pte_t contpte_ptep_get_lockless(pte_t *orig_ptep); extern void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr); +extern void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full); +extern pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full); extern int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep); extern int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, @@ -1253,6 +1289,35 @@ static inline void pte_clear(struct mm_struct *mm, __pte_clear(mm, addr, ptep); } +#define clear_full_ptes clear_full_ptes +static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + __clear_full_ptes(mm, addr, ptep, nr, full); + } else { + contpte_clear_full_ptes(mm, addr, ptep, nr, full); + } +} + +#define get_and_clear_full_ptes get_and_clear_full_ptes +static inline pte_t get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + pte_t pte; + + if (likely(nr == 1)) { + contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); + pte = __get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } else { + pte = contpte_get_and_clear_full_ptes(mm, addr, ptep, nr, full); + } + + return pte; +} + #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -1337,6 +1402,8 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma, #define set_pte __set_pte #define set_ptes __set_ptes #define pte_clear 
__pte_clear +#define clear_full_ptes __clear_full_ptes +#define get_and_clear_full_ptes __get_and_clear_full_ptes #define __HAVE_ARCH_PTEP_GET_AND_CLEAR #define ptep_get_and_clear __ptep_get_and_clear #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index bedb585245356..50e0173dc5eee 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -212,6 +212,23 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, } EXPORT_SYMBOL(contpte_set_ptes); +void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + __clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_clear_full_ptes); + +pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + unsigned int nr, int full) +{ + contpte_try_unfold_partial(mm, addr, ptep, nr); + return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); +} +EXPORT_SYMBOL(contpte_get_and_clear_full_ptes); + int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { From 1505769e4fb702be74d7e00547e87c5f679d8f73 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:02 +0000 Subject: [PATCH 038/352] mm: add pte_batch_hint() to reduce scanning in folio_pte_batch() BugLink: https://bugs.launchpad.net/bugs/2059316 Some architectures (e.g. arm64) can tell from looking at a pte, if some follow-on ptes also map contiguous physical memory with the same pgprot. (for arm64, these are contpte mappings). Take advantage of this knowledge to optimize folio_pte_batch() so that it can skip these ptes when scanning to create a batch. By default, if an arch does not opt-in, folio_pte_batch() returns a compile-time 1, so the changes are optimized out and the behaviour is as before. arm64 will opt-in to providing this hint in the next patch, which will greatly reduce the cost of ptep_get() when scanning a range of contptes. Link: https://lkml.kernel.org/r/20240215103205.2607016-16-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Tested-by: John Hubbard Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Catalin Marinas Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Mark Rutland Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit c6ec76a2ebc5829e5826b218d2e1475ec11b333e) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- include/linux/pgtable.h | 21 +++++++++++++++++++++ mm/memory.c | 19 ++++++++++++------- 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 5e15e6d1f71ce..55b16b5d8f0b2 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -212,6 +212,27 @@ static inline int pmd_dirty(pmd_t pmd) #define arch_flush_lazy_mmu_mode() do {} while (0) #endif +#ifndef pte_batch_hint +/** + * pte_batch_hint - Number of pages that can be added to batch without scanning. + * @ptep: Page table pointer for the entry. + * @pte: Page table entry. + * + * Some architectures know that a set of contiguous ptes all map the same + * contiguous memory with the same permissions. 
In this case, it can provide a + * hint to aid pte batching without the core code needing to scan every pte. + * + * An architecture implementation may ignore the PTE accessed state. Further, + * the dirty state must apply atomically to all the PTEs described by the hint. + * + * May be overridden by the architecture, else pte_batch_hint is always 1. + */ +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + return 1; +} +#endif + #ifndef pte_advance_pfn static inline pte_t pte_advance_pfn(pte_t pte, unsigned long nr) { diff --git a/mm/memory.c b/mm/memory.c index 32498c300fdf0..146a7b15c676d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -988,16 +988,20 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, { unsigned long folio_end_pfn = folio_pfn(folio) + folio_nr_pages(folio); const pte_t *end_ptep = start_ptep + max_nr; - pte_t expected_pte = __pte_batch_clear_ignored(pte_next_pfn(pte), flags); - pte_t *ptep = start_ptep + 1; + pte_t expected_pte, *ptep; bool writable; + int nr; if (any_writable) *any_writable = false; VM_WARN_ON_FOLIO(!pte_present(pte), folio); - while (ptep != end_ptep) { + nr = pte_batch_hint(start_ptep, pte); + expected_pte = __pte_batch_clear_ignored(pte_advance_pfn(pte, nr), flags); + ptep = start_ptep + nr; + + while (ptep < end_ptep) { pte = ptep_get(ptep); if (any_writable) writable = !!pte_write(pte); @@ -1011,17 +1015,18 @@ static inline int folio_pte_batch(struct folio *folio, unsigned long addr, * corner cases the next PFN might fall into a different * folio. */ - if (pte_pfn(pte) == folio_end_pfn) + if (pte_pfn(pte) >= folio_end_pfn) break; if (any_writable) *any_writable |= writable; - expected_pte = pte_next_pfn(expected_pte); - ptep++; + nr = pte_batch_hint(ptep, pte); + expected_pte = pte_advance_pfn(expected_pte, nr); + ptep += nr; } - return ptep - start_ptep; + return min(ptep - start_ptep, max_nr); } /* From cf7951dc332872cbc628fbf3396da583de2960bf Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:03 +0000 Subject: [PATCH 039/352] arm64/mm: implement pte_batch_hint() BugLink: https://bugs.launchpad.net/bugs/2059316 When core code iterates over a range of ptes and calls ptep_get() for each of them, if the range happens to cover contpte mappings, the number of pte reads becomes amplified by a factor of the number of PTEs in a contpte block. This is because for each call to ptep_get(), the implementation must read all of the ptes in the contpte block to which it belongs to gather the access and dirty bits. This causes a hotspot for fork(), as well as operations that unmap memory such as munmap(), exit and madvise(MADV_DONTNEED). Fortunately we can fix this by implementing pte_batch_hint() which allows their iterators to skip getting the contpte tail ptes when gathering the batch of ptes to operate on. This results in the number of PTE reads returning to 1 per pte. Link: https://lkml.kernel.org/r/20240215103205.2607016-17-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Reviewed-by: David Hildenbrand Tested-by: John Hubbard Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit fb5451e5f72b31002760083a99fbb41771c4f1ad) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index a8f1a35e30867..d759a20d2929a 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1213,6 +1213,15 @@ static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, __contpte_try_unfold(mm, addr, ptep, pte); } +#define pte_batch_hint pte_batch_hint +static inline unsigned int pte_batch_hint(pte_t *ptep, pte_t pte) +{ + if (!pte_valid_cont(pte)) + return 1; + + return CONT_PTES - (((unsigned long)ptep >> 3) & (CONT_PTES - 1)); +} + /* * The below functions constitute the public API that arm64 presents to the * core-mm to manipulate PTE entries within their page tables (or at least this From f2227271925a2424a6d4ac456e231ff8c85f1acd Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:04 +0000 Subject: [PATCH 040/352] arm64/mm: __always_inline to improve fork() perf BugLink: https://bugs.launchpad.net/bugs/2059316 As set_ptes() and wrprotect_ptes() become a bit more complex, the compiler may choose not to inline them. But this is critical for fork() performance. So mark the functions, along with contpte_try_unfold() which is called by them, as __always_inline. This is worth ~1% on the fork() microbenchmark with order-0 folios (the common case). Link: https://lkml.kernel.org/r/20240215103205.2607016-18-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit b972fc6afba002319fe23bc698ce6431ee43868c) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index d759a20d2929a..8310875133ffc 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1206,8 +1206,8 @@ extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); -static inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pte) +static __always_inline void contpte_try_unfold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) { if (unlikely(pte_valid_cont(pte))) __contpte_try_unfold(mm, addr, ptep, pte); @@ -1278,7 +1278,7 @@ static inline void set_pte(pte_t *ptep, pte_t pte) } #define set_ptes set_ptes -static inline void set_ptes(struct mm_struct *mm, unsigned long addr, +static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) { pte = pte_mknoncont(pte); @@ -1360,8 +1360,8 @@ static inline int ptep_clear_flush_young(struct vm_area_struct *vma, } #define wrprotect_ptes wrprotect_ptes -static inline void wrprotect_ptes(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, unsigned int nr) +static __always_inline void wrprotect_ptes(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, unsigned int nr) { if (likely(nr == 1)) { /* From 0a4aaf87e30cfe78a9bd9e865d3aede7dc00d646 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 15 Feb 2024 10:32:05 +0000 Subject: [PATCH 041/352] arm64/mm: automatically fold contpte mappings BugLink: https://bugs.launchpad.net/bugs/2059316 There are situations where a change to a single PTE could cause the contpte block in which it resides to become foldable (i.e. could be repainted with the contiguous bit). Such situations arise, for example, when user space temporarily changes protections, via mprotect, for individual pages, such can be the case for certain garbage collectors. We would like to detect when such a PTE change occurs. However this can be expensive due to the amount of checking required. Therefore only perform the checks when an indiviual PTE is modified via mprotect (ptep_modify_prot_commit() -> set_pte_at() -> set_ptes(nr=1)) and only when we are setting the final PTE in a contpte-aligned block. Link: https://lkml.kernel.org/r/20240215103205.2607016-19-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Mark Rutland Acked-by: Catalin Marinas Cc: Alistair Popple Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Barry Song <21cnbao@gmail.com> Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: David Hildenbrand Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Cc: James Morse Cc: John Hubbard Cc: Kefeng Wang Cc: Marc Zyngier Cc: Matthew Wilcox (Oracle) Cc: Thomas Gleixner Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton (cherry picked from commit f0c2264958e18bc7bc35b567d51b99461e4de34f) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/pgtable.h | 26 +++++++++++++ arch/arm64/mm/contpte.c | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 8310875133ffc..401087e8a43dc 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -1185,6 +1185,8 @@ extern void ptep_modify_prot_commit(struct vm_area_struct *vma, * where it is possible and makes sense to do so. The PTE_CONT bit is considered * a private implementation detail of the public ptep API (see below). */ +extern void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); extern void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); extern pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte); @@ -1206,6 +1208,29 @@ extern int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t entry, int dirty); +static __always_inline void contpte_try_fold(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, pte_t pte) +{ + /* + * Only bother trying if both the virtual and physical addresses are + * aligned and correspond to the last entry in a contig range. The core + * code mostly modifies ranges from low to high, so this is the likely + * the last modification in the contig range, so a good time to fold. + * We can't fold special mappings, because there is no associated folio. + */ + + const unsigned long contmask = CONT_PTES - 1; + bool valign = ((addr >> PAGE_SHIFT) & contmask) == contmask; + + if (unlikely(valign)) { + bool palign = (pte_pfn(pte) & contmask) == contmask; + + if (unlikely(palign && + pte_valid(pte) && !pte_cont(pte) && !pte_special(pte))) + __contpte_try_fold(mm, addr, ptep, pte); + } +} + static __always_inline void contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { @@ -1286,6 +1311,7 @@ static __always_inline void set_ptes(struct mm_struct *mm, unsigned long addr, if (likely(nr == 1)) { contpte_try_unfold(mm, addr, ptep, __ptep_get(ptep)); __set_ptes(mm, addr, ptep, pte, 1); + contpte_try_fold(mm, addr, ptep, pte); } else { contpte_set_ptes(mm, addr, ptep, pte, nr); } diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 50e0173dc5eee..16788f07716d5 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -73,6 +73,70 @@ static void contpte_convert(struct mm_struct *mm, unsigned long addr, __set_ptes(mm, start_addr, start_ptep, pte, CONT_PTES); } +void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + /* + * We have already checked that the virtual and pysical addresses are + * correctly aligned for a contpte mapping in contpte_try_fold() so the + * remaining checks are to ensure that the contpte range is fully + * covered by a single folio, and ensure that all the ptes are valid + * with contiguous PFNs and matching prots. 
We ignore the state of the + * access and dirty bits for the purpose of deciding if its a contiguous + * range; the folding process will generate a single contpte entry which + * has a single access and dirty bit. Those 2 bits are the logical OR of + * their respective bits in the constituent pte entries. In order to + * ensure the contpte range is covered by a single folio, we must + * recover the folio from the pfn, but special mappings don't have a + * folio backing them. Fortunately contpte_try_fold() already checked + * that the pte is not special - we never try to fold special mappings. + * Note we can't use vm_normal_page() for this since we don't have the + * vma. + */ + + unsigned long folio_start, folio_end; + unsigned long cont_start, cont_end; + pte_t expected_pte, subpte; + struct folio *folio; + struct page *page; + unsigned long pfn; + pte_t *orig_ptep; + pgprot_t prot; + + int i; + + if (!mm_is_user(mm)) + return; + + page = pte_page(pte); + folio = page_folio(page); + folio_start = addr - (page - &folio->page) * PAGE_SIZE; + folio_end = folio_start + folio_nr_pages(folio) * PAGE_SIZE; + cont_start = ALIGN_DOWN(addr, CONT_PTE_SIZE); + cont_end = cont_start + CONT_PTE_SIZE; + + if (folio_start > cont_start || folio_end < cont_end) + return; + + pfn = ALIGN_DOWN(pte_pfn(pte), CONT_PTES); + prot = pte_pgprot(pte_mkold(pte_mkclean(pte))); + expected_pte = pfn_pte(pfn, prot); + orig_ptep = ptep; + ptep = contpte_align_down(ptep); + + for (i = 0; i < CONT_PTES; i++) { + subpte = pte_mkold(pte_mkclean(__ptep_get(ptep))); + if (!pte_same(subpte, expected_pte)) + return; + expected_pte = pte_advance_pfn(expected_pte, 1); + ptep++; + } + + pte = pte_mkcont(pte); + contpte_convert(mm, addr, orig_ptep, pte); +} +EXPORT_SYMBOL(__contpte_try_fold); + void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { From 4c5fe997c67de8f799e4fd5c1edd9f9b1ef60fe5 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 26 Feb 2024 12:03:20 +0000 Subject: [PATCH 042/352] arm64/mm: export contpte symbols only to GPL users BugLink: https://bugs.launchpad.net/bugs/2059316 Patch series "Address some contpte nits". These 2 patches address some nits raised by Catalin late in the review cycle for my contpte series [1]. [1] https://lore.kernel.org/linux-mm/20240215103205.2607016-1-ryan.roberts@arm.com/ This patch (of 2): The contpte symbols must be exported since some of the public inline ptep_* APIs are called from modules and these inlines now call the contpte functions. Originally they were exported as EXPORT_SYMBOL() for fear of breaking out-of-tree modules. But we subsequently concluded that EXPORT_SYMBOL_GPL() should be safe since these functions are deeply core mm routines, and any module operating at this level is not going to be able to survive on EXPORT_SYMBOL alone. 
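As a minimal sketch of the distinction being relied on here (module and symbol names invented for illustration, not taken from this series): a symbol exported with EXPORT_SYMBOL_GPL() resolves only for modules whose MODULE_LICENSE() is GPL-compatible, and a proprietary module referencing it is refused at load time with an "Unknown symbol" error.

    #include <linux/init.h>
    #include <linux/module.h>

    /* Invented example symbol; not part of the contpte series. */
    int demo_core_op(int x)
    {
            return x + 1;
    }
    /* Resolvable only from GPL-compatible modules: */
    EXPORT_SYMBOL_GPL(demo_core_op);

    static int __init demo_init(void)
    {
            return 0;
    }

    static void __exit demo_exit(void)
    {
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
    MODULE_DESCRIPTION("EXPORT_SYMBOL_GPL illustration");

A second module built with MODULE_LICENSE("Proprietary") that calls demo_core_op() would fail at modprobe time; that load-time boundary is exactly what "not going to be able to survive on EXPORT_SYMBOL alone" refers to.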
Link: https://lkml.kernel.org/r/20240226120321.1055731-1-ryan.roberts@arm.com Link: https://lore.kernel.org/linux-mm/f9fc2b31-11cb-4969-8961-9c89fea41b74@nvidia.com/ Link: https://lkml.kernel.org/r/20240226120321.1055731-2-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Acked-by: Catalin Marinas Cc: John Hubbard Cc: Mark Rutland Signed-off-by: Andrew Morton (cherry picked from commit 912609e96cd728766373d84903f12a6d836de518) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/mm/contpte.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index 16788f07716d5..be0a226c4ff9b 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -135,7 +135,7 @@ void __contpte_try_fold(struct mm_struct *mm, unsigned long addr, pte = pte_mkcont(pte); contpte_convert(mm, addr, orig_ptep, pte); } -EXPORT_SYMBOL(__contpte_try_fold); +EXPORT_SYMBOL_GPL(__contpte_try_fold); void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) @@ -150,7 +150,7 @@ void __contpte_try_unfold(struct mm_struct *mm, unsigned long addr, pte = pte_mknoncont(pte); contpte_convert(mm, addr, ptep, pte); } -EXPORT_SYMBOL(__contpte_try_unfold); +EXPORT_SYMBOL_GPL(__contpte_try_unfold); pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) { @@ -178,7 +178,7 @@ pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte) return orig_pte; } -EXPORT_SYMBOL(contpte_ptep_get); +EXPORT_SYMBOL_GPL(contpte_ptep_get); pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) { @@ -231,7 +231,7 @@ pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) return orig_pte; } -EXPORT_SYMBOL(contpte_ptep_get_lockless); +EXPORT_SYMBOL_GPL(contpte_ptep_get_lockless); void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned int nr) @@ -274,7 +274,7 @@ void contpte_set_ptes(struct mm_struct *mm, unsigned long addr, } while (addr != end); } -EXPORT_SYMBOL(contpte_set_ptes); +EXPORT_SYMBOL_GPL(contpte_set_ptes); void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned int nr, int full) @@ -282,7 +282,7 @@ void contpte_clear_full_ptes(struct mm_struct *mm, unsigned long addr, contpte_try_unfold_partial(mm, addr, ptep, nr); __clear_full_ptes(mm, addr, ptep, nr, full); } -EXPORT_SYMBOL(contpte_clear_full_ptes); +EXPORT_SYMBOL_GPL(contpte_clear_full_ptes); pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, @@ -291,7 +291,7 @@ pte_t contpte_get_and_clear_full_ptes(struct mm_struct *mm, contpte_try_unfold_partial(mm, addr, ptep, nr); return __get_and_clear_full_ptes(mm, addr, ptep, nr, full); } -EXPORT_SYMBOL(contpte_get_and_clear_full_ptes); +EXPORT_SYMBOL_GPL(contpte_get_and_clear_full_ptes); int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -316,7 +316,7 @@ int contpte_ptep_test_and_clear_young(struct vm_area_struct *vma, return young; } -EXPORT_SYMBOL(contpte_ptep_test_and_clear_young); +EXPORT_SYMBOL_GPL(contpte_ptep_test_and_clear_young); int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) @@ -337,7 +337,7 @@ int contpte_ptep_clear_flush_young(struct vm_area_struct *vma, return young; } -EXPORT_SYMBOL(contpte_ptep_clear_flush_young); +EXPORT_SYMBOL_GPL(contpte_ptep_clear_flush_young); void contpte_wrprotect_ptes(struct mm_struct 
*mm, unsigned long addr, pte_t *ptep, unsigned int nr) @@ -355,7 +355,7 @@ void contpte_wrprotect_ptes(struct mm_struct *mm, unsigned long addr, contpte_try_unfold_partial(mm, addr, ptep, nr); __wrprotect_ptes(mm, addr, ptep, nr); } -EXPORT_SYMBOL(contpte_wrprotect_ptes); +EXPORT_SYMBOL_GPL(contpte_wrprotect_ptes); int contpte_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, @@ -401,4 +401,4 @@ int contpte_ptep_set_access_flags(struct vm_area_struct *vma, return 1; } -EXPORT_SYMBOL(contpte_ptep_set_access_flags); +EXPORT_SYMBOL_GPL(contpte_ptep_set_access_flags); From 9552f71d16212f9e8c4439e91c8409efc5dbddb2 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 26 Feb 2024 12:03:21 +0000 Subject: [PATCH 043/352] arm64/mm: improve comment in contpte_ptep_get_lockless() BugLink: https://bugs.launchpad.net/bugs/2059316 Make clear the atmicity/consistency requirements of the API and how we achieve them. Link: https://lore.kernel.org/linux-mm/Zc-Tqqfksho3BHmU@arm.com/ Link: https://lkml.kernel.org/r/20240226120321.1055731-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: David Hildenbrand Reviewed-by: Catalin Marinas Cc: John Hubbard Cc: Mark Rutland Signed-off-by: Andrew Morton (cherry picked from commit 94c18d5f7e0d612ce3fb9cb4aa8cfb1308d57a0a) Signed-off-by: dann frazier Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/mm/contpte.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/arm64/mm/contpte.c b/arch/arm64/mm/contpte.c index be0a226c4ff9b..1b64b4c3f8bf8 100644 --- a/arch/arm64/mm/contpte.c +++ b/arch/arm64/mm/contpte.c @@ -183,16 +183,20 @@ EXPORT_SYMBOL_GPL(contpte_ptep_get); pte_t contpte_ptep_get_lockless(pte_t *orig_ptep) { /* - * Gather access/dirty bits, which may be populated in any of the ptes - * of the contig range. We may not be holding the PTL, so any contiguous - * range may be unfolded/modified/refolded under our feet. Therefore we - * ensure we read a _consistent_ contpte range by checking that all ptes - * in the range are valid and have CONT_PTE set, that all pfns are - * contiguous and that all pgprots are the same (ignoring access/dirty). - * If we find a pte that is not consistent, then we must be racing with - * an update so start again. If the target pte does not have CONT_PTE - * set then that is considered consistent on its own because it is not - * part of a contpte range. + * The ptep_get_lockless() API requires us to read and return *orig_ptep + * so that it is self-consistent, without the PTL held, so we may be + * racing with other threads modifying the pte. Usually a READ_ONCE() + * would suffice, but for the contpte case, we also need to gather the + * access and dirty bits from across all ptes in the contiguous block, + * and we can't read all of those neighbouring ptes atomically, so any + * contiguous range may be unfolded/modified/refolded under our feet. + * Therefore we ensure we read a _consistent_ contpte range by checking + * that all ptes in the range are valid and have CONT_PTE set, that all + * pfns are contiguous and that all pgprots are the same (ignoring + * access/dirty). If we find a pte that is not consistent, then we must + * be racing with an update so start again. If the target pte does not + * have CONT_PTE set then that is considered consistent on its own + * because it is not part of a contpte range. 
*/ pgprot_t orig_prot;
From 3717a07f2fb1618323d228b07a90bff5c4961ea0 Mon Sep 17 00:00:00 2001 From: Ian May Date: Wed, 24 Apr 2024 22:45:17 -0500 Subject: [PATCH 044/352] UBUNTU: [Packaging] blacklist coresight_etm4x BugLink: https://bugs.launchpad.net/bugs/2061930 BugLink: https://bugs.launchpad.net/bugs/2067106 There are systems in production that don't have firmware that supports coresight_etm4x. Instead of removing it completely, blacklist coresight_etm4x so systems with the correct firmware can use the module. Signed-off-by: Brad Figg
--- debian.nvidia-adv/modprobe.d/common.conf | 1 + 1 file changed, 1 insertion(+)
diff --git a/debian.nvidia-adv/modprobe.d/common.conf b/debian.nvidia-adv/modprobe.d/common.conf index e0fbbd6e060d4..619c9a23fe210 100644 --- a/debian.nvidia-adv/modprobe.d/common.conf +++ b/debian.nvidia-adv/modprobe.d/common.conf @@ -1,3 +1,4 @@ # LP:1434842 -- disable OSS drivers by default to allow pulseaudio to emulate blacklist snd-mixer-oss blacklist snd-pcm-oss +blacklist coresight_etm4x
From 4825d17822c47e482f469c9e211feb96461ff08c Mon Sep 17 00:00:00 2001 From: Sourab Gupta Date: Thu, 6 Jun 2024 01:33:08 +0000 Subject: [PATCH 045/352] NVIDIA: SAUCE: NFS: Export nvfs register and unregister functions as GPL BugLink: https://bugs.launchpad.net/bugs/2068544 On Linux kernel 6.6 and above, __symbol_get() on the registration functions from nvidia-fs was failing, as GPL modules are no longer allowed to call __symbol_get() on non-GPL exported symbols. This change fixes that issue for NFS. Signed-off-by: Sourab Gupta Acked-by: Brad Figg Acked-by: Jacob Martin Acked-by: Noah Wager Signed-off-by: Brad Figg
--- net/sunrpc/xprtrdma/nvfs_rpc_rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/sunrpc/xprtrdma/nvfs_rpc_rdma.c b/net/sunrpc/xprtrdma/nvfs_rpc_rdma.c index 8691ec73bca26..2fd7ecca82b15 100644 --- a/net/sunrpc/xprtrdma/nvfs_rpc_rdma.c +++ b/net/sunrpc/xprtrdma/nvfs_rpc_rdma.c @@ -36,7 +36,7 @@ int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops) } return -EOPNOTSUPP; } -EXPORT_SYMBOL(REGISTER_FUNC); +EXPORT_SYMBOL_GPL(REGISTER_FUNC); // protected via nvfs_module_mutex void UNREGISTER_FUNC(void) @@ -47,5 +47,5 @@ void UNREGISTER_FUNC(void) } while (nvfs_count_ops()); nvfs_ops = NULL; } -EXPORT_SYMBOL(UNREGISTER_FUNC); +EXPORT_SYMBOL_GPL(UNREGISTER_FUNC); #endif
From 0c52df3ee1f372a3501c06bdc0ff4f502df522cd Mon Sep 17 00:00:00 2001 From: Sourab Gupta Date: Thu, 6 Jun 2024 01:15:08 +0000 Subject: [PATCH 046/352] NVIDIA: SAUCE: NVMe/NVMeoF: Export nvfs register and unregister functions as GPL BugLink: https://bugs.launchpad.net/bugs/2068544 On Linux kernel 6.6 and above, __symbol_get() on the registration functions from nvidia-fs was failing, as GPL modules are no longer allowed to call __symbol_get() on non-GPL exported symbols. This change fixes that issue.
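The mechanism behind both of these patches is runtime lookup rather than a link-time reference. A hedged sketch of the consumer-side pattern follows; the symbol name is invented for illustration (the real registration functions are generated from the REGISTER_FUNC macro), and since v6.6 __symbol_get() resolves only EXPORT_SYMBOL_GPL symbols, which is why the exports above had to change.

    #include <linux/errno.h>
    #include <linux/module.h>

    struct nvfs_dma_rw_ops;

    typedef int (*nvfs_register_fn)(struct nvfs_dma_rw_ops *ops);

    static nvfs_register_fn nvfs_reg;

    /* "nvme_v1_register_nvfs_dma_ops" is a hypothetical symbol name. */
    static int demo_bind_nvfs(struct nvfs_dma_rw_ops *ops)
    {
            nvfs_reg = (nvfs_register_fn)
                    __symbol_get("nvme_v1_register_nvfs_dma_ops");
            if (!nvfs_reg)
                    return -ENOENT; /* absent, or not GPL-exported on >= 6.6 */
            return nvfs_reg(ops);   /* pair with __symbol_put() on teardown */
    }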
Signed-off-by: Sourab Gupta Acked-by: Brad Figg Acked-by: Jacob Martin Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/nvme/host/nvfs-dma.c | 4 ++-- drivers/nvme/host/nvfs-rdma.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/nvfs-dma.c b/drivers/nvme/host/nvfs-dma.c index f79a3f1fac391..4b0bbdc35caa1 100644 --- a/drivers/nvme/host/nvfs-dma.c +++ b/drivers/nvme/host/nvfs-dma.c @@ -38,7 +38,7 @@ int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops) } -EXPORT_SYMBOL(REGISTER_FUNC); +EXPORT_SYMBOL_GPL(REGISTER_FUNC); // protected via nvfs_module_mutex void UNREGISTER_FUNC(void) @@ -49,5 +49,5 @@ void UNREGISTER_FUNC(void) } while (nvfs_count_ops()); nvfs_ops = NULL; } -EXPORT_SYMBOL(UNREGISTER_FUNC); +EXPORT_SYMBOL_GPL(UNREGISTER_FUNC); #endif diff --git a/drivers/nvme/host/nvfs-rdma.c b/drivers/nvme/host/nvfs-rdma.c index 2bdebc8693fbb..178669e23383e 100644 --- a/drivers/nvme/host/nvfs-rdma.c +++ b/drivers/nvme/host/nvfs-rdma.c @@ -37,7 +37,7 @@ int REGISTER_FUNC(struct nvfs_dma_rw_ops *ops) return -EOPNOTSUPP; } -EXPORT_SYMBOL(REGISTER_FUNC); +EXPORT_SYMBOL_GPL(REGISTER_FUNC); // protected via nvfs_module_mutex void UNREGISTER_FUNC(void) @@ -48,5 +48,5 @@ void UNREGISTER_FUNC(void) } while (nvfs_count_ops()); nvfs_ops = NULL; } -EXPORT_SYMBOL(UNREGISTER_FUNC); +EXPORT_SYMBOL_GPL(UNREGISTER_FUNC); #endif From e8e0b8793f074c0a557e27b3911fce560509375b Mon Sep 17 00:00:00 2001 From: David Thompson Date: Tue, 5 Mar 2024 16:21:37 -0500 Subject: [PATCH 047/352] mlxbf_gige: add support to display pause frame counters BugLink: https://bugs.launchpad.net/bugs/2068067 This patch updates the mlxbf_gige driver to support the "get_pause_stats()" callback, which enables display of pause frame counters via "ethtool -I -a oob_net0". The pause frame counters are only enabled if the "counters_en" bit is asserted in the LLU general config register. The driver will only report stats, and thus overwrite the default stats state of ETHTOOL_STAT_NOT_SET, if "counters_en" is asserted. 
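Each pause counter is exposed by the LLU as a 32-bit high/low register pair that the driver assembles into a 64-bit value. The sketch below, with an invented helper name, shows a tear-safe variant of that read which retries if the low word wraps between the two MMIO accesses; the in-tree callback reads each half once, which is adequate for ethtool-rate sampling.

    #include <linux/io.h>
    #include <linux/types.h>

    /* Invented helper, not from the driver. */
    static u64 demo_read_split_counter(void __iomem *hi, void __iomem *lo)
    {
            u32 h1, h2, l;

            do {
                    h1 = readl(hi);
                    l = readl(lo);
                    h2 = readl(hi);     /* re-read; retry on rollover */
            } while (h1 != h2);

            return ((u64)h1 << 32) | l;
    }

With the callback wired up, "ethtool --include-statistics -a oob_net0" (the long form of the "-I -a" invocation above) reports tx_pause_frames and rx_pause_frames whenever the LLU counters are enabled.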
Reviewed-by: Asmaa Mnebhi Signed-off-by: David Thompson Link: https://lore.kernel.org/r/20240305212137.3525-1-davthompson@nvidia.com Signed-off-by: Jakub Kicinski (cherry picked from commit c2234161985212d28711c1030337515d3852db80) Signed-off-by: David Thompson Acked-by: Brad Figg Acked-by: Jacob Martin Acked-by: Noah Wager Signed-off-by: Brad Figg --- .../mellanox/mlxbf_gige/mlxbf_gige_ethtool.c | 36 +++++++++++++++++++ .../mellanox/mlxbf_gige/mlxbf_gige_regs.h | 30 ++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_ethtool.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_ethtool.c index 253d7ad9b8095..8b63968bbee98 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_ethtool.c @@ -124,6 +124,41 @@ static void mlxbf_gige_get_pauseparam(struct net_device *netdev, pause->tx_pause = 1; } +static bool mlxbf_gige_llu_counters_enabled(struct mlxbf_gige *priv) +{ + u32 data; + + if (priv->hw_version == MLXBF_GIGE_VERSION_BF2) { + data = readl(priv->llu_base + MLXBF_GIGE_BF2_LLU_GENERAL_CONFIG); + if (data & MLXBF_GIGE_BF2_LLU_COUNTERS_EN) + return true; + } else { + data = readl(priv->llu_base + MLXBF_GIGE_BF3_LLU_GENERAL_CONFIG); + if (data & MLXBF_GIGE_BF3_LLU_COUNTERS_EN) + return true; + } + + return false; +} + +static void mlxbf_gige_get_pause_stats(struct net_device *netdev, + struct ethtool_pause_stats *pause_stats) +{ + struct mlxbf_gige *priv = netdev_priv(netdev); + u64 data_lo, data_hi; + + /* Read LLU counters to provide stats only if counters are enabled */ + if (mlxbf_gige_llu_counters_enabled(priv)) { + data_lo = readl(priv->llu_base + MLXBF_GIGE_TX_PAUSE_CNT_LO); + data_hi = readl(priv->llu_base + MLXBF_GIGE_TX_PAUSE_CNT_HI); + pause_stats->tx_pause_frames = (data_hi << 32) | data_lo; + + data_lo = readl(priv->llu_base + MLXBF_GIGE_RX_PAUSE_CNT_LO); + data_hi = readl(priv->llu_base + MLXBF_GIGE_RX_PAUSE_CNT_HI); + pause_stats->rx_pause_frames = (data_hi << 32) | data_lo; + } +} + const struct ethtool_ops mlxbf_gige_ethtool_ops = { .get_link = ethtool_op_get_link, .get_ringparam = mlxbf_gige_get_ringparam, @@ -134,6 +169,7 @@ const struct ethtool_ops mlxbf_gige_ethtool_ops = { .get_ethtool_stats = mlxbf_gige_get_ethtool_stats, .nway_reset = phy_ethtool_nway_reset, .get_pauseparam = mlxbf_gige_get_pauseparam, + .get_pause_stats = mlxbf_gige_get_pause_stats, .get_link_ksettings = phy_ethtool_get_link_ksettings, .set_link_ksettings = phy_ethtool_set_link_ksettings, }; diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h index cd0973229c9bb..98a8681c21b9c 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_regs.h @@ -99,4 +99,34 @@ #define MLXBF_GIGE_100M_IPG_SIZE 119 #define MLXBF_GIGE_10M_IPG_SIZE 1199 +/* Offsets into OOB LLU block for pause frame counters */ +#define MLXBF_GIGE_BF2_TX_PAUSE_CNT_HI 0x33d8 +#define MLXBF_GIGE_BF2_TX_PAUSE_CNT_LO 0x33dc +#define MLXBF_GIGE_BF2_RX_PAUSE_CNT_HI 0x3210 +#define MLXBF_GIGE_BF2_RX_PAUSE_CNT_LO 0x3214 + +#define MLXBF_GIGE_BF3_TX_PAUSE_CNT_HI 0x3a88 +#define MLXBF_GIGE_BF3_TX_PAUSE_CNT_LO 0x3a8c +#define MLXBF_GIGE_BF3_RX_PAUSE_CNT_HI 0x38c0 +#define MLXBF_GIGE_BF3_RX_PAUSE_CNT_LO 0x38c4 + +#define MLXBF_GIGE_TX_PAUSE_CNT_HI ((priv->hw_version == MLXBF_GIGE_VERSION_BF2) ? 
\ + MLXBF_GIGE_BF2_TX_PAUSE_CNT_HI : \ + MLXBF_GIGE_BF3_TX_PAUSE_CNT_HI) +#define MLXBF_GIGE_TX_PAUSE_CNT_LO ((priv->hw_version == MLXBF_GIGE_VERSION_BF2) ? \ + MLXBF_GIGE_BF2_TX_PAUSE_CNT_LO : \ + MLXBF_GIGE_BF3_TX_PAUSE_CNT_LO) +#define MLXBF_GIGE_RX_PAUSE_CNT_HI ((priv->hw_version == MLXBF_GIGE_VERSION_BF2) ? \ + MLXBF_GIGE_BF2_RX_PAUSE_CNT_HI : \ + MLXBF_GIGE_BF3_RX_PAUSE_CNT_HI) +#define MLXBF_GIGE_RX_PAUSE_CNT_LO ((priv->hw_version == MLXBF_GIGE_VERSION_BF2) ? \ + MLXBF_GIGE_BF2_RX_PAUSE_CNT_LO : \ + MLXBF_GIGE_BF3_RX_PAUSE_CNT_LO) + +#define MLXBF_GIGE_BF2_LLU_GENERAL_CONFIG 0x2110 +#define MLXBF_GIGE_BF3_LLU_GENERAL_CONFIG 0x2030 + +#define MLXBF_GIGE_BF2_LLU_COUNTERS_EN BIT(0) +#define MLXBF_GIGE_BF3_LLU_COUNTERS_EN BIT(4) + #endif /* !defined(__MLXBF_GIGE_REGS_H__) */ From a3feb2470c8b21fc782defd418d9b031ee184756 Mon Sep 17 00:00:00 2001 From: Shravan Kumar Ramani Date: Tue, 13 Feb 2024 06:15:25 -0500 Subject: [PATCH 048/352] platform/mellanox: mlxbf-pmc: Replace uintN_t with kernel-style types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BugLink: https://bugs.launchpad.net/bugs/2069777 Use u8, u32 and u64 instead of respective uintN_t types. Remove unnecessary newlines for function argument lists. Signed-off-by: Shravan Kumar Ramani Link: https://lore.kernel.org/r/39be055af3506ce6f843d11e45d71620f2a96e26.1707808180.git.shravankr@nvidia.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen (cherry picked from commit fd23023e2aaa78320243801666690deb751143c2) Signed-off-by: David Thompson Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/platform/mellanox/mlxbf-pmc.c | 109 +++++++++++--------------- 1 file changed, 47 insertions(+), 62 deletions(-) diff --git a/drivers/platform/mellanox/mlxbf-pmc.c b/drivers/platform/mellanox/mlxbf-pmc.c index b1995ac268d77..86044d1b8fa58 100644 --- a/drivers/platform/mellanox/mlxbf-pmc.c +++ b/drivers/platform/mellanox/mlxbf-pmc.c @@ -149,17 +149,17 @@ struct mlxbf_pmc_block_info { */ struct mlxbf_pmc_context { struct platform_device *pdev; - uint32_t total_blocks; - uint32_t tile_count; - uint8_t llt_enable; - uint8_t mss_enable; - uint32_t group_num; + u32 total_blocks; + u32 tile_count; + u8 llt_enable; + u8 mss_enable; + u32 group_num; struct device *hwmon_dev; const char *block_name[MLXBF_PMC_MAX_BLOCKS]; struct mlxbf_pmc_block_info block[MLXBF_PMC_MAX_BLOCKS]; const struct attribute_group *groups[MLXBF_PMC_MAX_BLOCKS]; bool svc_sreg_support; - uint32_t sreg_tbl_perf; + u32 sreg_tbl_perf; unsigned int event_set; }; @@ -865,8 +865,7 @@ static struct mlxbf_pmc_context *pmc; static const char *mlxbf_pmc_svc_uuid_str = "89c036b4-e7d7-11e6-8797-001aca00bfc4"; /* Calls an SMC to access a performance register */ -static int mlxbf_pmc_secure_read(void __iomem *addr, uint32_t command, - uint64_t *result) +static int mlxbf_pmc_secure_read(void __iomem *addr, u32 command, u64 *result) { struct arm_smccc_res res; int status, err = 0; @@ -892,8 +891,7 @@ static int mlxbf_pmc_secure_read(void __iomem *addr, uint32_t command, } /* Read from a performance counter */ -static int mlxbf_pmc_read(void __iomem *addr, uint32_t command, - uint64_t *result) +static int mlxbf_pmc_read(void __iomem *addr, u32 command, u64 *result) { if (pmc->svc_sreg_support) return mlxbf_pmc_secure_read(addr, command, result); @@ -907,22 +905,21 @@ static int mlxbf_pmc_read(void __iomem *addr, uint32_t command, } /* Convenience function for 32-bit reads */ -static int 
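For reference, a sketch of the convention being applied (function names invented): u8/u32/u64 come from <linux/types.h> and are the preferred spellings inside the kernel, while the uintN_t aliases, although defined there, are flagged by checkpatch; uapi headers use __u32 and friends instead.

    #include <linux/io.h>
    #include <linux/types.h>

    /* Old spelling, discouraged in kernel code: */
    static inline uint32_t demo_readl_stdint(void __iomem *addr)
    {
            return readl(addr);
    }

    /* Kernel-native spelling, same underlying type: */
    static inline u32 demo_readl_kernel(void __iomem *addr)
    {
            return readl(addr);
    }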
mlxbf_pmc_readl(void __iomem *addr, uint32_t *result) +static int mlxbf_pmc_readl(void __iomem *addr, u32 *result) { - uint64_t read_out; + u64 read_out; int status; status = mlxbf_pmc_read(addr, MLXBF_PMC_READ_REG_32, &read_out); if (status) return status; - *result = (uint32_t)read_out; + *result = (u32)read_out; return 0; } /* Calls an SMC to access a performance register */ -static int mlxbf_pmc_secure_write(void __iomem *addr, uint32_t command, - uint64_t value) +static int mlxbf_pmc_secure_write(void __iomem *addr, u32 command, u64 value) { struct arm_smccc_res res; int status, err = 0; @@ -945,7 +942,7 @@ static int mlxbf_pmc_secure_write(void __iomem *addr, uint32_t command, } /* Write to a performance counter */ -static int mlxbf_pmc_write(void __iomem *addr, int command, uint64_t value) +static int mlxbf_pmc_write(void __iomem *addr, int command, u64 value) { if (pmc->svc_sreg_support) return mlxbf_pmc_secure_write(addr, command, value); @@ -959,7 +956,7 @@ static int mlxbf_pmc_write(void __iomem *addr, int command, uint64_t value) } /* Check if the register offset is within the mapped region for the block */ -static bool mlxbf_pmc_valid_range(int blk_num, uint32_t offset) +static bool mlxbf_pmc_valid_range(int blk_num, u32 offset) { if ((offset >= 0) && !(offset % MLXBF_PMC_REG_SIZE) && (offset + MLXBF_PMC_REG_SIZE <= pmc->block[blk_num].blk_size)) @@ -1082,7 +1079,7 @@ static char *mlxbf_pmc_get_event_name(const char *blk, int evt) /* Method to enable/disable/reset l3cache counters */ static int mlxbf_pmc_config_l3_counters(int blk_num, bool enable, bool reset) { - uint32_t perfcnt_cfg = 0; + u32 perfcnt_cfg = 0; if (enable) perfcnt_cfg |= MLXBF_PMC_L3C_PERF_CNT_CFG_EN; @@ -1095,12 +1092,9 @@ static int mlxbf_pmc_config_l3_counters(int blk_num, bool enable, bool reset) } /* Method to handle l3cache counter programming */ -static int mlxbf_pmc_program_l3_counter(int blk_num, uint32_t cnt_num, - uint32_t evt) +static int mlxbf_pmc_program_l3_counter(int blk_num, u32 cnt_num, u32 evt) { - uint32_t perfcnt_sel_1 = 0; - uint32_t perfcnt_sel = 0; - uint32_t *wordaddr; + u32 perfcnt_sel_1 = 0, perfcnt_sel = 0, *wordaddr; void __iomem *pmcaddr; int ret; @@ -1162,11 +1156,10 @@ static int mlxbf_pmc_program_l3_counter(int blk_num, uint32_t cnt_num, } /* Method to handle crspace counter programming */ -static int mlxbf_pmc_program_crspace_counter(int blk_num, uint32_t cnt_num, - uint32_t evt) +static int mlxbf_pmc_program_crspace_counter(int blk_num, u32 cnt_num, u32 evt) { - uint32_t word; void *addr; + u32 word; int ret; addr = pmc->block[blk_num].mmio_base + @@ -1187,7 +1180,7 @@ static int mlxbf_pmc_program_crspace_counter(int blk_num, uint32_t cnt_num, } /* Method to clear crspace counter value */ -static int mlxbf_pmc_clear_crspace_counter(int blk_num, uint32_t cnt_num) +static int mlxbf_pmc_clear_crspace_counter(int blk_num, u32 cnt_num) { void *addr; @@ -1199,10 +1192,9 @@ static int mlxbf_pmc_clear_crspace_counter(int blk_num, uint32_t cnt_num) } /* Method to program a counter to monitor an event */ -static int mlxbf_pmc_program_counter(int blk_num, uint32_t cnt_num, - uint32_t evt, bool is_l3) +static int mlxbf_pmc_program_counter(int blk_num, u32 cnt_num, u32 evt, bool is_l3) { - uint64_t perfctl, perfevt, perfmon_cfg; + u64 perfctl, perfevt, perfmon_cfg; if (cnt_num >= pmc->block[blk_num].counters) return -ENODEV; @@ -1263,12 +1255,11 @@ static int mlxbf_pmc_program_counter(int blk_num, uint32_t cnt_num, } /* Method to handle l3 counter reads */ -static int 
mlxbf_pmc_read_l3_counter(int blk_num, uint32_t cnt_num, - uint64_t *result) +static int mlxbf_pmc_read_l3_counter(int blk_num, u32 cnt_num, u64 *result) { - uint32_t perfcnt_low = 0, perfcnt_high = 0; - uint64_t value; + u32 perfcnt_low = 0, perfcnt_high = 0; int status; + u64 value; status = mlxbf_pmc_readl(pmc->block[blk_num].mmio_base + MLXBF_PMC_L3C_PERF_CNT_LOW + @@ -1295,11 +1286,10 @@ static int mlxbf_pmc_read_l3_counter(int blk_num, uint32_t cnt_num, } /* Method to handle crspace counter reads */ -static int mlxbf_pmc_read_crspace_counter(int blk_num, uint32_t cnt_num, - uint64_t *result) +static int mlxbf_pmc_read_crspace_counter(int blk_num, u32 cnt_num, u64 *result) { - uint32_t value; int status = 0; + u32 value; status = mlxbf_pmc_readl(pmc->block[blk_num].mmio_base + MLXBF_PMC_CRSPACE_PERFMON_VAL0(pmc->block[blk_num].counters) + @@ -1313,11 +1303,10 @@ static int mlxbf_pmc_read_crspace_counter(int blk_num, uint32_t cnt_num, } /* Method to read the counter value */ -static int mlxbf_pmc_read_counter(int blk_num, uint32_t cnt_num, bool is_l3, - uint64_t *result) +static int mlxbf_pmc_read_counter(int blk_num, u32 cnt_num, bool is_l3, u64 *result) { - uint32_t perfcfg_offset, perfval_offset; - uint64_t perfmon_cfg; + u32 perfcfg_offset, perfval_offset; + u64 perfmon_cfg; int status; if (cnt_num >= pmc->block[blk_num].counters) @@ -1351,13 +1340,11 @@ static int mlxbf_pmc_read_counter(int blk_num, uint32_t cnt_num, bool is_l3, } /* Method to read L3 block event */ -static int mlxbf_pmc_read_l3_event(int blk_num, uint32_t cnt_num, - uint64_t *result) +static int mlxbf_pmc_read_l3_event(int blk_num, u32 cnt_num, u64 *result) { - uint32_t perfcnt_sel = 0, perfcnt_sel_1 = 0; - uint32_t *wordaddr; + u32 perfcnt_sel = 0, perfcnt_sel_1 = 0, *wordaddr; void __iomem *pmcaddr; - uint64_t evt; + u64 evt; /* Select appropriate register information */ switch (cnt_num) { @@ -1405,10 +1392,9 @@ static int mlxbf_pmc_read_l3_event(int blk_num, uint32_t cnt_num, } /* Method to read crspace block event */ -static int mlxbf_pmc_read_crspace_event(int blk_num, uint32_t cnt_num, - uint64_t *result) +static int mlxbf_pmc_read_crspace_event(int blk_num, u32 cnt_num, u64 *result) { - uint32_t word, evt; + u32 word, evt; void *addr; int ret; @@ -1429,11 +1415,10 @@ static int mlxbf_pmc_read_crspace_event(int blk_num, uint32_t cnt_num, } /* Method to find the event currently being monitored by a counter */ -static int mlxbf_pmc_read_event(int blk_num, uint32_t cnt_num, bool is_l3, - uint64_t *result) +static int mlxbf_pmc_read_event(int blk_num, u32 cnt_num, bool is_l3, u64 *result) { - uint32_t perfcfg_offset, perfval_offset; - uint64_t perfmon_cfg, perfevt; + u32 perfcfg_offset, perfval_offset; + u64 perfmon_cfg, perfevt; if (cnt_num >= pmc->block[blk_num].counters) return -EINVAL; @@ -1469,9 +1454,9 @@ static int mlxbf_pmc_read_event(int blk_num, uint32_t cnt_num, bool is_l3, } /* Method to read a register */ -static int mlxbf_pmc_read_reg(int blk_num, uint32_t offset, uint64_t *result) +static int mlxbf_pmc_read_reg(int blk_num, u32 offset, u64 *result) { - uint32_t ecc_out; + u32 ecc_out; if (strstr(pmc->block_name[blk_num], "ecc")) { if (mlxbf_pmc_readl(pmc->block[blk_num].mmio_base + offset, @@ -1490,7 +1475,7 @@ static int mlxbf_pmc_read_reg(int blk_num, uint32_t offset, uint64_t *result) } /* Method to write to a register */ -static int mlxbf_pmc_write_reg(int blk_num, uint32_t offset, uint64_t data) +static int mlxbf_pmc_write_reg(int blk_num, u32 offset, u64 data) { if 
(strstr(pmc->block_name[blk_num], "ecc")) { return mlxbf_pmc_write(pmc->block[blk_num].mmio_base + offset, @@ -1512,7 +1497,7 @@ static ssize_t mlxbf_pmc_counter_show(struct device *dev, attr, struct mlxbf_pmc_attribute, dev_attr); int blk_num, cnt_num, offset; bool is_l3 = false; - uint64_t value; + u64 value; blk_num = attr_counter->nr; cnt_num = attr_counter->index; @@ -1546,7 +1531,7 @@ static ssize_t mlxbf_pmc_counter_store(struct device *dev, attr, struct mlxbf_pmc_attribute, dev_attr); int blk_num, cnt_num, offset, err, data; bool is_l3 = false; - uint64_t evt_num; + u64 evt_num; blk_num = attr_counter->nr; cnt_num = attr_counter->index; @@ -1597,7 +1582,7 @@ static ssize_t mlxbf_pmc_event_show(struct device *dev, attr, struct mlxbf_pmc_attribute, dev_attr); int blk_num, cnt_num, err; bool is_l3 = false; - uint64_t evt_num; + u64 evt_num; char *evt_name; blk_num = attr_event->nr; @@ -1686,7 +1671,7 @@ static ssize_t mlxbf_pmc_enable_show(struct device *dev, { struct mlxbf_pmc_attribute *attr_enable = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - uint32_t perfcnt_cfg, word; + u32 perfcnt_cfg, word; int blk_num, value; blk_num = attr_enable->nr; @@ -1718,7 +1703,7 @@ static ssize_t mlxbf_pmc_enable_store(struct device *dev, struct mlxbf_pmc_attribute *attr_enable = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); int err, en, blk_num; - uint32_t word; + u32 word; blk_num = attr_enable->nr; @@ -1914,7 +1899,7 @@ static bool mlxbf_pmc_guid_match(const guid_t *guid, /* Helper to map the Performance Counters from the varios blocks */ static int mlxbf_pmc_map_counters(struct device *dev) { - uint64_t info[MLXBF_PMC_INFO_SZ]; + u64 info[MLXBF_PMC_INFO_SZ]; int i, tile_num, ret; for (i = 0; i < pmc->total_blocks; ++i) { From e9124b23e6a85b01fdc18f228f183e1dde42b19b Mon Sep 17 00:00:00 2001 From: Shravan Kumar Ramani Date: Tue, 13 Feb 2024 06:15:26 -0500 Subject: [PATCH 049/352] platform/mellanox: mlxbf-pmc: Cleanup signed/unsigned mix-up MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BugLink: https://bugs.launchpad.net/bugs/2069777 Use unsigned integer types for register values and array indices. Use %u instead of %d accordingly. 
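A short sketch of why the signedness matters at the printing boundary (values invented): an index that is logically unsigned but printed with %d misrenders once the top bit is set, and gcc's -Wformat flags the mismatch.

    #include <linux/printk.h>

    static void demo_print_counter_index(void)
    {
            unsigned int cnt_num = 0x80000000;

            pr_info("counter %d\n", cnt_num); /* -Wformat; prints -2147483648 */
            pr_info("counter %u\n", cnt_num); /* correct: prints 2147483648 */
    }

It also removes the temptation to guard unsigned values with checks like "offset >= 0", which are always true; one such check is still visible in mlxbf_pmc_valid_range() earlier in this series.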
Signed-off-by: Shravan Kumar Ramani Link: https://lore.kernel.org/r/d8548c70339a29258a906b2b518e5c48f669795c.1707808180.git.shravankr@nvidia.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen (cherry picked from commit 1ae9ffd303c2028048be4cef6221a17442c0175d) Signed-off-by: David Thompson Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/platform/mellanox/mlxbf-pmc.c | 129 ++++++++++++++------------ 1 file changed, 68 insertions(+), 61 deletions(-) diff --git a/drivers/platform/mellanox/mlxbf-pmc.c b/drivers/platform/mellanox/mlxbf-pmc.c index 86044d1b8fa58..250405bb59a73 100644 --- a/drivers/platform/mellanox/mlxbf-pmc.c +++ b/drivers/platform/mellanox/mlxbf-pmc.c @@ -99,8 +99,8 @@ */ struct mlxbf_pmc_attribute { struct device_attribute dev_attr; - int index; - int nr; + unsigned int index; + unsigned int nr; }; /** @@ -121,7 +121,7 @@ struct mlxbf_pmc_block_info { void __iomem *mmio_base; size_t blk_size; size_t counters; - int type; + unsigned int type; struct mlxbf_pmc_attribute *attr_counter; struct mlxbf_pmc_attribute *attr_event; struct mlxbf_pmc_attribute attr_event_list; @@ -169,7 +169,7 @@ struct mlxbf_pmc_context { * @evt_name: Name of the event */ struct mlxbf_pmc_events { - int evt_num; + u32 evt_num; char *evt_name; }; @@ -956,7 +956,7 @@ static int mlxbf_pmc_write(void __iomem *addr, int command, u64 value) } /* Check if the register offset is within the mapped region for the block */ -static bool mlxbf_pmc_valid_range(int blk_num, u32 offset) +static bool mlxbf_pmc_valid_range(unsigned int blk_num, u32 offset) { if ((offset >= 0) && !(offset % MLXBF_PMC_REG_SIZE) && (offset + MLXBF_PMC_REG_SIZE <= pmc->block[blk_num].blk_size)) @@ -966,8 +966,7 @@ static bool mlxbf_pmc_valid_range(int blk_num, u32 offset) } /* Get the event list corresponding to a certain block */ -static const struct mlxbf_pmc_events *mlxbf_pmc_event_list(const char *blk, - int *size) +static const struct mlxbf_pmc_events *mlxbf_pmc_event_list(const char *blk, size_t *size) { const struct mlxbf_pmc_events *events; @@ -1044,7 +1043,8 @@ static const struct mlxbf_pmc_events *mlxbf_pmc_event_list(const char *blk, static int mlxbf_pmc_get_event_num(const char *blk, const char *evt) { const struct mlxbf_pmc_events *events; - int i, size; + unsigned int i; + size_t size; events = mlxbf_pmc_event_list(blk, &size); if (!events) @@ -1059,10 +1059,11 @@ static int mlxbf_pmc_get_event_num(const char *blk, const char *evt) } /* Get the event number given the name */ -static char *mlxbf_pmc_get_event_name(const char *blk, int evt) +static char *mlxbf_pmc_get_event_name(const char *blk, u32 evt) { const struct mlxbf_pmc_events *events; - int i, size; + unsigned int i; + size_t size; events = mlxbf_pmc_event_list(blk, &size); if (!events) @@ -1077,7 +1078,7 @@ static char *mlxbf_pmc_get_event_name(const char *blk, int evt) } /* Method to enable/disable/reset l3cache counters */ -static int mlxbf_pmc_config_l3_counters(int blk_num, bool enable, bool reset) +static int mlxbf_pmc_config_l3_counters(unsigned int blk_num, bool enable, bool reset) { u32 perfcnt_cfg = 0; @@ -1092,7 +1093,7 @@ static int mlxbf_pmc_config_l3_counters(int blk_num, bool enable, bool reset) } /* Method to handle l3cache counter programming */ -static int mlxbf_pmc_program_l3_counter(int blk_num, u32 cnt_num, u32 evt) +static int mlxbf_pmc_program_l3_counter(unsigned int blk_num, u32 cnt_num, u32 evt) { u32 perfcnt_sel_1 = 0, perfcnt_sel = 0, *wordaddr; void __iomem *pmcaddr; @@ -1156,7 
+1157,7 @@ static int mlxbf_pmc_program_l3_counter(int blk_num, u32 cnt_num, u32 evt) } /* Method to handle crspace counter programming */ -static int mlxbf_pmc_program_crspace_counter(int blk_num, u32 cnt_num, u32 evt) +static int mlxbf_pmc_program_crspace_counter(unsigned int blk_num, u32 cnt_num, u32 evt) { void *addr; u32 word; @@ -1180,7 +1181,7 @@ static int mlxbf_pmc_program_crspace_counter(int blk_num, u32 cnt_num, u32 evt) } /* Method to clear crspace counter value */ -static int mlxbf_pmc_clear_crspace_counter(int blk_num, u32 cnt_num) +static int mlxbf_pmc_clear_crspace_counter(unsigned int blk_num, u32 cnt_num) { void *addr; @@ -1192,7 +1193,7 @@ static int mlxbf_pmc_clear_crspace_counter(int blk_num, u32 cnt_num) } /* Method to program a counter to monitor an event */ -static int mlxbf_pmc_program_counter(int blk_num, u32 cnt_num, u32 evt, bool is_l3) +static int mlxbf_pmc_program_counter(unsigned int blk_num, u32 cnt_num, u32 evt, bool is_l3) { u64 perfctl, perfevt, perfmon_cfg; @@ -1255,7 +1256,7 @@ static int mlxbf_pmc_program_counter(int blk_num, u32 cnt_num, u32 evt, bool is_ } /* Method to handle l3 counter reads */ -static int mlxbf_pmc_read_l3_counter(int blk_num, u32 cnt_num, u64 *result) +static int mlxbf_pmc_read_l3_counter(unsigned int blk_num, u32 cnt_num, u64 *result) { u32 perfcnt_low = 0, perfcnt_high = 0; int status; @@ -1286,7 +1287,7 @@ static int mlxbf_pmc_read_l3_counter(int blk_num, u32 cnt_num, u64 *result) } /* Method to handle crspace counter reads */ -static int mlxbf_pmc_read_crspace_counter(int blk_num, u32 cnt_num, u64 *result) +static int mlxbf_pmc_read_crspace_counter(unsigned int blk_num, u32 cnt_num, u64 *result) { int status = 0; u32 value; @@ -1303,7 +1304,7 @@ static int mlxbf_pmc_read_crspace_counter(int blk_num, u32 cnt_num, u64 *result) } /* Method to read the counter value */ -static int mlxbf_pmc_read_counter(int blk_num, u32 cnt_num, bool is_l3, u64 *result) +static int mlxbf_pmc_read_counter(unsigned int blk_num, u32 cnt_num, bool is_l3, u64 *result) { u32 perfcfg_offset, perfval_offset; u64 perfmon_cfg; @@ -1340,7 +1341,7 @@ static int mlxbf_pmc_read_counter(int blk_num, u32 cnt_num, bool is_l3, u64 *res } /* Method to read L3 block event */ -static int mlxbf_pmc_read_l3_event(int blk_num, u32 cnt_num, u64 *result) +static int mlxbf_pmc_read_l3_event(unsigned int blk_num, u32 cnt_num, u64 *result) { u32 perfcnt_sel = 0, perfcnt_sel_1 = 0, *wordaddr; void __iomem *pmcaddr; @@ -1392,7 +1393,7 @@ static int mlxbf_pmc_read_l3_event(int blk_num, u32 cnt_num, u64 *result) } /* Method to read crspace block event */ -static int mlxbf_pmc_read_crspace_event(int blk_num, u32 cnt_num, u64 *result) +static int mlxbf_pmc_read_crspace_event(unsigned int blk_num, u32 cnt_num, u64 *result) { u32 word, evt; void *addr; @@ -1415,7 +1416,7 @@ static int mlxbf_pmc_read_crspace_event(int blk_num, u32 cnt_num, u64 *result) } /* Method to find the event currently being monitored by a counter */ -static int mlxbf_pmc_read_event(int blk_num, u32 cnt_num, bool is_l3, u64 *result) +static int mlxbf_pmc_read_event(unsigned int blk_num, u32 cnt_num, bool is_l3, u64 *result) { u32 perfcfg_offset, perfval_offset; u64 perfmon_cfg, perfevt; @@ -1454,7 +1455,7 @@ static int mlxbf_pmc_read_event(int blk_num, u32 cnt_num, bool is_l3, u64 *resul } /* Method to read a register */ -static int mlxbf_pmc_read_reg(int blk_num, u32 offset, u64 *result) +static int mlxbf_pmc_read_reg(unsigned int blk_num, u32 offset, u64 *result) { u32 ecc_out; @@ -1475,7 +1476,7 @@ static int 
mlxbf_pmc_read_reg(int blk_num, u32 offset, u64 *result) } /* Method to write to a register */ -static int mlxbf_pmc_write_reg(int blk_num, u32 offset, u64 data) +static int mlxbf_pmc_write_reg(unsigned int blk_num, u32 offset, u64 data) { if (strstr(pmc->block_name[blk_num], "ecc")) { return mlxbf_pmc_write(pmc->block[blk_num].mmio_base + offset, @@ -1495,7 +1496,7 @@ static ssize_t mlxbf_pmc_counter_show(struct device *dev, { struct mlxbf_pmc_attribute *attr_counter = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - int blk_num, cnt_num, offset; + unsigned int blk_num, cnt_num, offset; bool is_l3 = false; u64 value; @@ -1529,14 +1530,15 @@ static ssize_t mlxbf_pmc_counter_store(struct device *dev, { struct mlxbf_pmc_attribute *attr_counter = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - int blk_num, cnt_num, offset, err, data; + unsigned int blk_num, cnt_num, offset, data; bool is_l3 = false; u64 evt_num; + int err; blk_num = attr_counter->nr; cnt_num = attr_counter->index; - err = kstrtoint(buf, 0, &data); + err = kstrtouint(buf, 0, &data); if (err < 0) return err; @@ -1565,7 +1567,7 @@ static ssize_t mlxbf_pmc_counter_store(struct device *dev, if (err) return err; } else if (pmc->block[blk_num].type == MLXBF_PMC_TYPE_CRSPACE) { - if (sscanf(attr->attr.name, "counter%d", &cnt_num) != 1) + if (sscanf(attr->attr.name, "counter%u", &cnt_num) != 1) return -EINVAL; err = mlxbf_pmc_clear_crspace_counter(blk_num, cnt_num); } else @@ -1580,10 +1582,11 @@ static ssize_t mlxbf_pmc_event_show(struct device *dev, { struct mlxbf_pmc_attribute *attr_event = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - int blk_num, cnt_num, err; + unsigned int blk_num, cnt_num; bool is_l3 = false; - u64 evt_num; char *evt_name; + u64 evt_num; + int err; blk_num = attr_event->nr; cnt_num = attr_event->index; @@ -1609,8 +1612,9 @@ static ssize_t mlxbf_pmc_event_store(struct device *dev, { struct mlxbf_pmc_attribute *attr_event = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - int blk_num, cnt_num, evt_num, err; + unsigned int blk_num, cnt_num, evt_num; bool is_l3 = false; + int err; blk_num = attr_event->nr; cnt_num = attr_event->index; @@ -1621,7 +1625,7 @@ static ssize_t mlxbf_pmc_event_store(struct device *dev, if (evt_num < 0) return -EINVAL; } else { - err = kstrtoint(buf, 0, &evt_num); + err = kstrtouint(buf, 0, &evt_num); if (err < 0) return err; } @@ -1643,9 +1647,11 @@ static ssize_t mlxbf_pmc_event_list_show(struct device *dev, { struct mlxbf_pmc_attribute *attr_event_list = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - int blk_num, i, size, len = 0, ret = 0; const struct mlxbf_pmc_events *events; char e_info[MLXBF_PMC_EVENT_INFO_LEN]; + unsigned int blk_num, i, len = 0; + size_t size; + int ret = 0; blk_num = attr_event_list->nr; @@ -1671,8 +1677,8 @@ static ssize_t mlxbf_pmc_enable_show(struct device *dev, { struct mlxbf_pmc_attribute *attr_enable = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); + unsigned int blk_num, value; u32 perfcnt_cfg, word; - int blk_num, value; blk_num = attr_enable->nr; @@ -1692,7 +1698,7 @@ static ssize_t mlxbf_pmc_enable_show(struct device *dev, value = FIELD_GET(MLXBF_PMC_L3C_PERF_CNT_CFG_EN, perfcnt_cfg); } - return sysfs_emit(buf, "%d\n", value); + return sysfs_emit(buf, "%u\n", value); } /* Store function for "enable" sysfs files - only for l3cache & crspace */ @@ -1702,12 +1708,13 @@ static ssize_t mlxbf_pmc_enable_store(struct device *dev, { struct mlxbf_pmc_attribute *attr_enable = 
container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - int err, en, blk_num; + unsigned int en, blk_num; u32 word; + int err; blk_num = attr_enable->nr; - err = kstrtoint(buf, 0, &en); + err = kstrtouint(buf, 0, &en); if (err < 0) return err; @@ -1745,10 +1752,10 @@ static ssize_t mlxbf_pmc_enable_store(struct device *dev, } /* Populate attributes for blocks with counters to monitor performance */ -static int mlxbf_pmc_init_perftype_counter(struct device *dev, int blk_num) +static int mlxbf_pmc_init_perftype_counter(struct device *dev, unsigned int blk_num) { struct mlxbf_pmc_attribute *attr; - int i = 0, j = 0; + unsigned int i = 0, j = 0; /* "event_list" sysfs to list events supported by the block */ attr = &pmc->block[blk_num].attr_event_list; @@ -1797,8 +1804,7 @@ static int mlxbf_pmc_init_perftype_counter(struct device *dev, int blk_num) attr->dev_attr.store = mlxbf_pmc_counter_store; attr->index = j; attr->nr = blk_num; - attr->dev_attr.attr.name = devm_kasprintf(dev, GFP_KERNEL, - "counter%d", j); + attr->dev_attr.attr.name = devm_kasprintf(dev, GFP_KERNEL, "counter%u", j); if (!attr->dev_attr.attr.name) return -ENOMEM; pmc->block[blk_num].block_attr[++i] = &attr->dev_attr.attr; @@ -1810,8 +1816,7 @@ static int mlxbf_pmc_init_perftype_counter(struct device *dev, int blk_num) attr->dev_attr.store = mlxbf_pmc_event_store; attr->index = j; attr->nr = blk_num; - attr->dev_attr.attr.name = devm_kasprintf(dev, GFP_KERNEL, - "event%d", j); + attr->dev_attr.attr.name = devm_kasprintf(dev, GFP_KERNEL, "event%u", j); if (!attr->dev_attr.attr.name) return -ENOMEM; pmc->block[blk_num].block_attr[++i] = &attr->dev_attr.attr; @@ -1822,30 +1827,31 @@ static int mlxbf_pmc_init_perftype_counter(struct device *dev, int blk_num) } /* Populate attributes for blocks with registers to monitor performance */ -static int mlxbf_pmc_init_perftype_reg(struct device *dev, int blk_num) +static int mlxbf_pmc_init_perftype_reg(struct device *dev, unsigned int blk_num) { - struct mlxbf_pmc_attribute *attr; const struct mlxbf_pmc_events *events; - int i = 0, j = 0; + struct mlxbf_pmc_attribute *attr; + unsigned int i = 0; + size_t count = 0; - events = mlxbf_pmc_event_list(pmc->block_name[blk_num], &j); + events = mlxbf_pmc_event_list(pmc->block_name[blk_num], &count); if (!events) return -EINVAL; pmc->block[blk_num].attr_event = devm_kcalloc( - dev, j, sizeof(struct mlxbf_pmc_attribute), GFP_KERNEL); + dev, count, sizeof(struct mlxbf_pmc_attribute), GFP_KERNEL); if (!pmc->block[blk_num].attr_event) return -ENOMEM; - while (j > 0) { - --j; - attr = &pmc->block[blk_num].attr_event[j]; + while (count > 0) { + --count; + attr = &pmc->block[blk_num].attr_event[count]; attr->dev_attr.attr.mode = 0644; attr->dev_attr.show = mlxbf_pmc_counter_show; attr->dev_attr.store = mlxbf_pmc_counter_store; attr->nr = blk_num; attr->dev_attr.attr.name = devm_kasprintf(dev, GFP_KERNEL, - events[j].evt_name); + events[count].evt_name); if (!attr->dev_attr.attr.name) return -ENOMEM; pmc->block[blk_num].block_attr[i] = &attr->dev_attr.attr; @@ -1857,7 +1863,7 @@ static int mlxbf_pmc_init_perftype_reg(struct device *dev, int blk_num) } /* Helper to create the bfperf sysfs sub-directories and files */ -static int mlxbf_pmc_create_groups(struct device *dev, int blk_num) +static int mlxbf_pmc_create_groups(struct device *dev, unsigned int blk_num) { int err; @@ -1900,18 +1906,19 @@ static bool mlxbf_pmc_guid_match(const guid_t *guid, static int mlxbf_pmc_map_counters(struct device *dev) { u64 info[MLXBF_PMC_INFO_SZ]; - int i, tile_num, 
ret; + unsigned int tile_num, i; + int ret; for (i = 0; i < pmc->total_blocks; ++i) { /* Create sysfs for tiles only if block number < tile_count */ if (strstr(pmc->block_name[i], "tilenet")) { - if (sscanf(pmc->block_name[i], "tilenet%d", &tile_num) != 1) + if (sscanf(pmc->block_name[i], "tilenet%u", &tile_num) != 1) continue; if (tile_num >= pmc->tile_count) continue; } else if (strstr(pmc->block_name[i], "tile")) { - if (sscanf(pmc->block_name[i], "tile%d", &tile_num) != 1) + if (sscanf(pmc->block_name[i], "tile%u", &tile_num) != 1) continue; if (tile_num >= pmc->tile_count) @@ -1921,9 +1928,9 @@ static int mlxbf_pmc_map_counters(struct device *dev) /* Create sysfs only for enabled MSS blocks */ if (strstr(pmc->block_name[i], "mss") && pmc->event_set == MLXBF_PMC_EVENT_SET_BF3) { - int mss_num; + unsigned int mss_num; - if (sscanf(pmc->block_name[i], "mss%d", &mss_num) != 1) + if (sscanf(pmc->block_name[i], "mss%u", &mss_num) != 1) continue; if (!((pmc->mss_enable >> mss_num) & 0x1)) @@ -1932,17 +1939,17 @@ static int mlxbf_pmc_map_counters(struct device *dev) /* Create sysfs only for enabled LLT blocks */ if (strstr(pmc->block_name[i], "llt_miss")) { - int llt_num; + unsigned int llt_num; - if (sscanf(pmc->block_name[i], "llt_miss%d", &llt_num) != 1) + if (sscanf(pmc->block_name[i], "llt_miss%u", &llt_num) != 1) continue; if (!((pmc->llt_enable >> llt_num) & 0x1)) continue; } else if (strstr(pmc->block_name[i], "llt")) { - int llt_num; + unsigned int llt_num; - if (sscanf(pmc->block_name[i], "llt%d", &llt_num) != 1) + if (sscanf(pmc->block_name[i], "llt%u", &llt_num) != 1) continue; if (!((pmc->llt_enable >> llt_num) & 0x1)) From e64c51575af9d6ae69ffca967b78cd8da66e8373 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Thu, 22 Feb 2024 15:57:29 -0500 Subject: [PATCH 050/352] platform/mellanox: mlxbf-pmc: mlxbf_pmc_event_list(): make size ptr optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BugLink: https://bugs.launchpad.net/bugs/2069777 The mlxbf_pmc_event_list() function returns a pointer to an array of supported events and the array size. The array size is returned via a pointer passed as an argument, which is mandatory. However, we want to be able to use mlxbf_pmc_event_list() just to check if a block name is implemented/supported. For this usage passing the size argument is not necessary so let's make it optional. 
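The resulting interface is the ordinary optional out-parameter idiom in C; a minimal standalone sketch with hypothetical names:

#include <stddef.h>

/* Return the table; report its length only when the caller asks for it. */
static const int *example_event_list(size_t *psize)
{
	static const int events[] = { 10, 20, 30 };

	if (psize)
		*psize = sizeof(events) / sizeof(events[0]);
	return events;
}

/* An existence check can now simply pass NULL for the size. */
static int example_block_supported(void)
{
	return example_event_list(NULL) != NULL;
}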
Signed-off-by: Luiz Capitulino Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/182de8ec6b9c33152f2ba6b248c35b0311abf5e4.1708635408.git.luizcap@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen (cherry picked from commit 0d46439bda37e2e13f14d6a9e211c4f645c6336a) Signed-off-by: David Thompson Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/platform/mellanox/mlxbf-pmc.c | 40 +++++++++++++++------------ 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/platform/mellanox/mlxbf-pmc.c b/drivers/platform/mellanox/mlxbf-pmc.c index 250405bb59a73..b71636eb3db1a 100644 --- a/drivers/platform/mellanox/mlxbf-pmc.c +++ b/drivers/platform/mellanox/mlxbf-pmc.c @@ -966,32 +966,33 @@ static bool mlxbf_pmc_valid_range(unsigned int blk_num, u32 offset) } /* Get the event list corresponding to a certain block */ -static const struct mlxbf_pmc_events *mlxbf_pmc_event_list(const char *blk, size_t *size) +static const struct mlxbf_pmc_events *mlxbf_pmc_event_list(const char *blk, size_t *psize) { const struct mlxbf_pmc_events *events; + size_t size; if (strstr(blk, "tilenet")) { events = mlxbf_pmc_hnfnet_events; - *size = ARRAY_SIZE(mlxbf_pmc_hnfnet_events); + size = ARRAY_SIZE(mlxbf_pmc_hnfnet_events); } else if (strstr(blk, "tile")) { events = mlxbf_pmc_hnf_events; - *size = ARRAY_SIZE(mlxbf_pmc_hnf_events); + size = ARRAY_SIZE(mlxbf_pmc_hnf_events); } else if (strstr(blk, "triogen")) { events = mlxbf_pmc_smgen_events; - *size = ARRAY_SIZE(mlxbf_pmc_smgen_events); + size = ARRAY_SIZE(mlxbf_pmc_smgen_events); } else if (strstr(blk, "trio")) { switch (pmc->event_set) { case MLXBF_PMC_EVENT_SET_BF1: events = mlxbf_pmc_trio_events_1; - *size = ARRAY_SIZE(mlxbf_pmc_trio_events_1); + size = ARRAY_SIZE(mlxbf_pmc_trio_events_1); break; case MLXBF_PMC_EVENT_SET_BF2: events = mlxbf_pmc_trio_events_2; - *size = ARRAY_SIZE(mlxbf_pmc_trio_events_2); + size = ARRAY_SIZE(mlxbf_pmc_trio_events_2); break; default: events = NULL; - *size = 0; + size = 0; break; } } else if (strstr(blk, "mss")) { @@ -999,43 +1000,46 @@ static const struct mlxbf_pmc_events *mlxbf_pmc_event_list(const char *blk, size case MLXBF_PMC_EVENT_SET_BF1: case MLXBF_PMC_EVENT_SET_BF2: events = mlxbf_pmc_mss_events_1; - *size = ARRAY_SIZE(mlxbf_pmc_mss_events_1); + size = ARRAY_SIZE(mlxbf_pmc_mss_events_1); break; case MLXBF_PMC_EVENT_SET_BF3: events = mlxbf_pmc_mss_events_3; - *size = ARRAY_SIZE(mlxbf_pmc_mss_events_3); + size = ARRAY_SIZE(mlxbf_pmc_mss_events_3); break; default: events = NULL; - *size = 0; + size = 0; break; } } else if (strstr(blk, "ecc")) { events = mlxbf_pmc_ecc_events; - *size = ARRAY_SIZE(mlxbf_pmc_ecc_events); + size = ARRAY_SIZE(mlxbf_pmc_ecc_events); } else if (strstr(blk, "pcie")) { events = mlxbf_pmc_pcie_events; - *size = ARRAY_SIZE(mlxbf_pmc_pcie_events); + size = ARRAY_SIZE(mlxbf_pmc_pcie_events); } else if (strstr(blk, "l3cache")) { events = mlxbf_pmc_l3c_events; - *size = ARRAY_SIZE(mlxbf_pmc_l3c_events); + size = ARRAY_SIZE(mlxbf_pmc_l3c_events); } else if (strstr(blk, "gic")) { events = mlxbf_pmc_smgen_events; - *size = ARRAY_SIZE(mlxbf_pmc_smgen_events); + size = ARRAY_SIZE(mlxbf_pmc_smgen_events); } else if (strstr(blk, "smmu")) { events = mlxbf_pmc_smgen_events; - *size = ARRAY_SIZE(mlxbf_pmc_smgen_events); + size = ARRAY_SIZE(mlxbf_pmc_smgen_events); } else if (strstr(blk, "llt_miss")) { events = mlxbf_pmc_llt_miss_events; - *size = ARRAY_SIZE(mlxbf_pmc_llt_miss_events); + size = ARRAY_SIZE(mlxbf_pmc_llt_miss_events); 
} else if (strstr(blk, "llt")) { events = mlxbf_pmc_llt_events; - *size = ARRAY_SIZE(mlxbf_pmc_llt_events); + size = ARRAY_SIZE(mlxbf_pmc_llt_events); } else { events = NULL; - *size = 0; + size = 0; } + if (psize) + *psize = size; + return events; } From 642283f450dc4a1c986dc3911145450f168196ac Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Thu, 22 Feb 2024 15:57:30 -0500 Subject: [PATCH 051/352] platform/mellanox: mlxbf-pmc: Ignore unsupported performance blocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BugLink: https://bugs.launchpad.net/bugs/2069777 Currently, the driver has two behaviors to deal with new & unsupported performance blocks reported by the firmware: 1. For register and unknown block types, the driver will fail to load with the following error message: [ 4510.956369] mlxbf-pmc: probe of MLNXBFD2:00 failed with error -22 2. For counter and crspace blocks, the driver will load and sysfs files will be created but getting the contents of event_list or trying to setup the counter will fail Instead, let's ignore and log unsupported blocks. This means the driver will always load and unsupported blocks will never show up in sysfs. Signed-off-by: Luiz Capitulino Reviewed-by: Hans de Goede Link: https://lore.kernel.org/r/f8e2e6210b43e825b69824b420c801cd513d401d.1708635408.git.luizcap@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen (cherry picked from commit c0459eeb64e955b8fb9dd9f73b937f750387ef9a) Signed-off-by: David Thompson Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/platform/mellanox/mlxbf-pmc.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/platform/mellanox/mlxbf-pmc.c b/drivers/platform/mellanox/mlxbf-pmc.c index b71636eb3db1a..746567767e5b3 100644 --- a/drivers/platform/mellanox/mlxbf-pmc.c +++ b/drivers/platform/mellanox/mlxbf-pmc.c @@ -1043,6 +1043,11 @@ static const struct mlxbf_pmc_events *mlxbf_pmc_event_list(const char *blk, size return events; } +static bool mlxbf_pmc_event_supported(const char *blk) +{ + return !!mlxbf_pmc_event_list(blk, NULL); +} + /* Get the event number given the name */ static int mlxbf_pmc_get_event_num(const char *blk, const char *evt) { @@ -1761,6 +1766,9 @@ static int mlxbf_pmc_init_perftype_counter(struct device *dev, unsigned int blk_ struct mlxbf_pmc_attribute *attr; unsigned int i = 0, j = 0; + if (!mlxbf_pmc_event_supported(pmc->block_name[blk_num])) + return -ENOENT; + /* "event_list" sysfs to list events supported by the block */ attr = &pmc->block[blk_num].attr_event_list; attr->dev_attr.attr.mode = 0444; @@ -1840,7 +1848,7 @@ static int mlxbf_pmc_init_perftype_reg(struct device *dev, unsigned int blk_num) events = mlxbf_pmc_event_list(pmc->block_name[blk_num], &count); if (!events) - return -EINVAL; + return -ENOENT; pmc->block[blk_num].attr_event = devm_kcalloc( dev, count, sizeof(struct mlxbf_pmc_attribute), GFP_KERNEL); @@ -1878,7 +1886,7 @@ static int mlxbf_pmc_create_groups(struct device *dev, unsigned int blk_num) else if (pmc->block[blk_num].type == MLXBF_PMC_TYPE_REGISTER) err = mlxbf_pmc_init_perftype_reg(dev, blk_num); else - err = -EINVAL; + err = -ENOENT; if (err) return err; @@ -1983,6 +1991,10 @@ static int mlxbf_pmc_map_counters(struct device *dev) return -ENOMEM; ret = mlxbf_pmc_create_groups(dev, i); + if (ret == -ENOENT) { + dev_warn(dev, "ignoring unsupported block: '%s'\n", pmc->block_name[i]); + continue; + } if (ret) return ret; } From 
3b9ee75081aa3d7c370d5c17b2a5c9531a35154d Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 29 Feb 2024 16:11:36 +0300 Subject: [PATCH 052/352] platform/mellanox: mlxbf-pmc: fix signedness bugs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit BugLink: https://bugs.launchpad.net/bugs/2069777 These need to be signed for the error handling to work. The mlxbf_pmc_get_event_num() function returns int so int type is correct. Fixes: 1ae9ffd303c2 ("platform/mellanox: mlxbf-pmc: Cleanup signed/unsigned mix-up") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/a4af764e-990b-4ebd-b342-852844374032@moroto.mountain Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen (cherry picked from commit 7c8772fef2c25b951660ff31aa1d2174b45af043) Signed-off-by: David Thompson Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/platform/mellanox/mlxbf-pmc.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/platform/mellanox/mlxbf-pmc.c b/drivers/platform/mellanox/mlxbf-pmc.c index 746567767e5b3..4ed9c7fd2b62a 100644 --- a/drivers/platform/mellanox/mlxbf-pmc.c +++ b/drivers/platform/mellanox/mlxbf-pmc.c @@ -1505,8 +1505,9 @@ static ssize_t mlxbf_pmc_counter_show(struct device *dev, { struct mlxbf_pmc_attribute *attr_counter = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - unsigned int blk_num, cnt_num, offset; + unsigned int blk_num, cnt_num; bool is_l3 = false; + int offset; u64 value; blk_num = attr_counter->nr; @@ -1539,9 +1540,10 @@ static ssize_t mlxbf_pmc_counter_store(struct device *dev, { struct mlxbf_pmc_attribute *attr_counter = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - unsigned int blk_num, cnt_num, offset, data; + unsigned int blk_num, cnt_num, data; bool is_l3 = false; u64 evt_num; + int offset; int err; blk_num = attr_counter->nr; @@ -1621,8 +1623,9 @@ static ssize_t mlxbf_pmc_event_store(struct device *dev, { struct mlxbf_pmc_attribute *attr_event = container_of( attr, struct mlxbf_pmc_attribute, dev_attr); - unsigned int blk_num, cnt_num, evt_num; + unsigned int blk_num, cnt_num; bool is_l3 = false; + int evt_num; int err; blk_num = attr_event->nr; From fcd28c309a707e4ec030523d7056ae9f28c6e962 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 16 Jan 2024 20:02:58 +0530 Subject: [PATCH 053/352] PCI: Clear Secondary Status errors after enumeration BugLink: https://bugs.launchpad.net/bugs/2071654 We enumerate devices by attempting config reads to the Vendor ID of each possible device. On conventional PCI, if no device responds, the read terminates with a Master Abort (PCI r3.0, sec 6.1). On PCIe, the config read is terminated as an Unsupported Request (PCIe r6.0, sec 2.3.2, 7.5.1.3.7). In either case, if the read addressed a device below a bridge, it is logged by setting "Received Master Abort" in the bridge Secondary Status register. Clear any errors logged in the Secondary Status register after enumeration. 
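This works because the error bits in the Secondary Status register are RW1C (write one to clear): storing all ones clears exactly the bits that were set, while read-only bits ignore the write. A minimal sketch of the idea:

#include <linux/pci.h>

/* Sketch: clear any RW1C error bits a bridge logged during the bus scan. */
static void example_clear_sec_status(struct pci_dev *bridge)
{
	pci_write_config_word(bridge, PCI_SEC_STATUS, 0xffff);
}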
Link: https://lore.kernel.org/r/20240116143258.483235-1-vidyas@nvidia.com Signed-off-by: Vidya Sagar [bhelgaas: simplify commit log] Signed-off-by: Bjorn Helgaas (cherry picked from commit 7bf9d2af7e89f65a79225e26d261b52ce4ee3e95) Signed-off-by: Jamie Nguyen Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/pci/probe.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c index 1434bf495db3c..f6fd97f0795a3 100644 --- a/drivers/pci/probe.c +++ b/drivers/pci/probe.c @@ -1482,6 +1482,9 @@ static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev, } out: + /* Clear errors in the Secondary Status Register */ + pci_write_config_word(dev, PCI_SEC_STATUS, 0xffff); + pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl); pm_runtime_put(&dev->dev); From 7764adf1295ee12f0f66933862ea03d63c20e4ae Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:14 -0300 Subject: [PATCH 054/352] x86: Stop using weak symbols for __iowrite32_copy() BugLink: https://bugs.launchpad.net/bugs/2071655 Start switching iomap_copy routines over to use #define and arch provided inline/macro functions instead of weak symbols. Inline functions allow more compiler optimization and this is often a driver hot path. x86 has the only weak implementation for __iowrite32_copy(), so replace it with a static inline containing the same single instruction inline assembly. The compiler will generate the "mov edx,ecx" in a more optimal way. Remove iomap_copy_64.S Link: https://lore.kernel.org/r/1-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Acked-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe (cherry picked from commit 20516d6e51dd9994afda8d556507cfbe7853384b) Signed-off-by: Jamie Nguyen Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/x86/include/asm/io.h | 17 +++++++++++++++++ arch/x86/lib/Makefile | 1 - arch/x86/lib/iomap_copy_64.S | 15 --------------- include/linux/io.h | 5 ++++- lib/iomap_copy.c | 6 +++--- 5 files changed, 24 insertions(+), 20 deletions(-) delete mode 100644 arch/x86/lib/iomap_copy_64.S diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 294cd2a408181..4b99ed326b174 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -209,6 +209,23 @@ void memset_io(volatile void __iomem *, int, size_t); #define memcpy_toio memcpy_toio #define memset_io memset_io +#ifdef CONFIG_X86_64 +/* + * Commit 0f07496144c2 ("[PATCH] Add faster __iowrite32_copy routine for + * x86_64") says that circa 2006 rep movsl is noticeably faster than a copy + * loop. + */ +static inline void __iowrite32_copy(void __iomem *to, const void *from, + size_t count) +{ + asm volatile("rep ; movsl" + : "=&c"(count), "=&D"(to), "=&S"(from) + : "0"(count), "1"(to), "2"(from) + : "memory"); +} +#define __iowrite32_copy __iowrite32_copy +#endif + /* * ISA space is 'always mapped' on a typical x86 system, no need to * explicitly ioremap() it. 
The fact that the ISA IO space is mapped diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f0dae4fb6d071..b26fcbdaa620b 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -53,7 +53,6 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y) lib-y += atomic64_386_32.o endif else - obj-y += iomap_copy_64.o ifneq ($(CONFIG_GENERIC_CSUM),y) lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o endif diff --git a/arch/x86/lib/iomap_copy_64.S b/arch/x86/lib/iomap_copy_64.S deleted file mode 100644 index 6ff2f56cb0f71..0000000000000 --- a/arch/x86/lib/iomap_copy_64.S +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright 2006 PathScale, Inc. All Rights Reserved. - */ - -#include - -/* - * override generic version in lib/iomap_copy.c - */ -SYM_FUNC_START(__iowrite32_copy) - movl %edx,%ecx - rep movsl - RET -SYM_FUNC_END(__iowrite32_copy) diff --git a/include/linux/io.h b/include/linux/io.h index 7304f2a69960a..68cd551b6af11 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -16,7 +16,10 @@ struct device; struct resource; -__visible void __iowrite32_copy(void __iomem *to, const void *from, size_t count); +#ifndef __iowrite32_copy +void __iowrite32_copy(void __iomem *to, const void *from, size_t count); +#endif + void __ioread32_copy(void *to, const void __iomem *from, size_t count); void __iowrite64_copy(void __iomem *to, const void *from, size_t count); diff --git a/lib/iomap_copy.c b/lib/iomap_copy.c index 5de7c04e05ef5..8ddcbb53507df 100644 --- a/lib/iomap_copy.c +++ b/lib/iomap_copy.c @@ -16,9 +16,8 @@ * time. Order of access is not guaranteed, nor is a memory barrier * performed afterwards. */ -void __attribute__((weak)) __iowrite32_copy(void __iomem *to, - const void *from, - size_t count) +#ifndef __iowrite32_copy +void __iowrite32_copy(void __iomem *to, const void *from, size_t count) { u32 __iomem *dst = to; const u32 *src = from; @@ -28,6 +27,7 @@ void __attribute__((weak)) __iowrite32_copy(void __iomem *to, __raw_writel(*src++, dst++); } EXPORT_SYMBOL_GPL(__iowrite32_copy); +#endif /** * __ioread32_copy - copy data from MMIO space, in 32-bit units From 67efc371d1377efab042561defc7cbc5a71a0402 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:15 -0300 Subject: [PATCH 055/352] s390: Implement __iowrite32_copy() BugLink: https://bugs.launchpad.net/bugs/2071655 It is trivial to implement an inline to do this, so provide it in the s390 headers. Like the 64 bit version it should just invoke zpci_memcpy_toio() with the correct size. 
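The mechanism shared by these patches is the #define-guard override idiom: the arch header supplies a static inline and defines a macro with the same name, and the generic fallback in lib/iomap_copy.c is compiled only when that macro is absent. A sketch condensed from the diffs, not literal code:

/* arch/<arch>/include/asm/io.h */
static inline void __iowrite32_copy(void __iomem *to, const void *from,
				    size_t count)
{
	zpci_memcpy_toio(to, from, count * 4);	/* count is in 32-bit units */
}
#define __iowrite32_copy __iowrite32_copy

/* lib/iomap_copy.c: the generic loop drops out when an arch overrides it */
#ifndef __iowrite32_copy
void __iowrite32_copy(void __iomem *to, const void *from, size_t count)
{
	/* ... portable one-u32-at-a-time copy ... */
}
#endif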
Link: https://lore.kernel.org/r/2-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Acked-by: Niklas Schnelle Signed-off-by: Jason Gunthorpe (cherry picked from commit 6ae798cbef4ba1f180aa1a590e33a2d89f7cc34f) Signed-off-by: Jamie Nguyen Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/s390/include/asm/io.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 4453ad7c11ace..00704fc8a54b3 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -73,6 +73,14 @@ static inline void ioport_unmap(void __iomem *p) #define __raw_writel zpci_write_u32 #define __raw_writeq zpci_write_u64 +/* combine single writes by using store-block insn */ +static inline void __iowrite32_copy(void __iomem *to, const void *from, + size_t count) +{ + zpci_memcpy_toio(to, from, count * 4); +} +#define __iowrite32_copy __iowrite32_copy + #endif /* CONFIG_PCI */ #include From 0138b27d3e81cc27215c99ccdb2f1846f8db915c Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:16 -0300 Subject: [PATCH 056/352] s390: Stop using weak symbols for __iowrite64_copy() BugLink: https://bugs.launchpad.net/bugs/2071655 Complete switching the __iowriteXX_copy() routines over to use #define and arch provided inline/macro functions instead of weak symbols. S390 has an implementation that simply calls another memcpy function. Inline this so the callers don't have to do two jumps. Link: https://lore.kernel.org/r/3-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Acked-by: Niklas Schnelle Acked-by: Arnd Bergmann Signed-off-by: Jason Gunthorpe (cherry picked from commit e7bc47b16622d1016b3b77bbdb20fb9e213045f2) Signed-off-by: Jamie Nguyen Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/s390/include/asm/io.h | 7 +++++++ arch/s390/pci/pci.c | 6 ------ include/linux/io.h | 3 +++ lib/iomap_copy.c | 7 +++---- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/s390/include/asm/io.h b/arch/s390/include/asm/io.h index 00704fc8a54b3..0fbc992d7a5ea 100644 --- a/arch/s390/include/asm/io.h +++ b/arch/s390/include/asm/io.h @@ -81,6 +81,13 @@ static inline void __iowrite32_copy(void __iomem *to, const void *from, } #define __iowrite32_copy __iowrite32_copy +static inline void __iowrite64_copy(void __iomem *to, const void *from, + size_t count) +{ + zpci_memcpy_toio(to, from, count * 8); +} +#define __iowrite64_copy __iowrite64_copy + #endif /* CONFIG_PCI */ #include diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 52a44e353796c..fb81337a73eaa 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -249,12 +249,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res, return 0; } -/* combine single writes by using store-block insn */ -void __iowrite64_copy(void __iomem *to, const void *from, size_t count) -{ - zpci_memcpy_toio(to, from, count * 8); -} - void __iomem *ioremap_prot(phys_addr_t phys_addr, size_t size, unsigned long prot) { diff --git a/include/linux/io.h b/include/linux/io.h index 68cd551b6af11..fe12be9de6f7a 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -21,7 +21,10 @@ void __iowrite32_copy(void __iomem *to, const void *from, size_t count); #endif void __ioread32_copy(void *to, const void __iomem *from, size_t count); + +#ifndef __iowrite64_copy void __iowrite64_copy(void __iomem *to, const void *from, size_t count); +#endif #ifdef CONFIG_MMU int 
ioremap_page_range(unsigned long addr, unsigned long end, diff --git a/lib/iomap_copy.c b/lib/iomap_copy.c index 8ddcbb53507df..2fd5712fb7c02 100644 --- a/lib/iomap_copy.c +++ b/lib/iomap_copy.c @@ -60,9 +60,8 @@ EXPORT_SYMBOL_GPL(__ioread32_copy); * time. Order of access is not guaranteed, nor is a memory barrier * performed afterwards. */ -void __attribute__((weak)) __iowrite64_copy(void __iomem *to, - const void *from, - size_t count) +#ifndef __iowrite64_copy +void __iowrite64_copy(void __iomem *to, const void *from, size_t count) { #ifdef CONFIG_64BIT u64 __iomem *dst = to; @@ -75,5 +74,5 @@ void __attribute__((weak)) __iowrite64_copy(void __iomem *to, __iowrite32_copy(to, from, count * 2); #endif } - EXPORT_SYMBOL_GPL(__iowrite64_copy); +#endif From 0228eb48f2cefb29130c1906d2f17d99a81480b4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:17 -0300 Subject: [PATCH 057/352] arm64/io: Provide a WC friendly __iowriteXX_copy() BugLink: https://bugs.launchpad.net/bugs/2071655 The kernel provides driver support for using write combining IO memory through the __iowriteXX_copy() API which is commonly used as an optional optimization to generate 16/32/64 byte MemWr TLPs in a PCIe environment. iomap_copy.c provides a generic implementation as a simple 4/8 byte at a time copy loop that has worked well with past ARM64 CPUs, giving a high frequency of large TLPs being successfully formed. However modern ARM64 CPUs are quite sensitive to how the write combining CPU HW is operated and a compiler generated loop with intermixed load/store is not sufficient to frequently generate a large TLP. The CPUs would like to see the entire TLP generated by consecutive store instructions from registers. Compilers like gcc tend to intermix loads and stores and have poor code generation, in part, due to the ARM64 situation that writeq() does not codegen anything other than "[xN]". However even with that resolved compilers like clang still do not have good code generation. This means on modern ARM64 CPUs the rate at which __iowriteXX_copy() successfully generates large TLPs is very small (less than 1 in 10,000 tries), to the point that the use of WC is pointless. Implement __iowrite32/64_copy() specifically for ARM64 and use inline assembly to build consecutive blocks of STR instructions. Provide direct support for 64/32/16 large TLP generation in this manner. Optimize for common constant lengths so that the compiler can directly inline the store blocks. This brings the frequency of large TLP generation up to a high level that is comparable with older CPU generations. As the __iowriteXX_copy() family of APIs is intended for use with WC, incorporate the DGH hint directly into the function.
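A hedged usage sketch (hypothetical driver code, not from this series) of the constant-length fast path the patch adds; with a compile-time count of 1, 2, 4 or 8, the copy inlines to a block of consecutive STRs followed by DGH, which is what lets the CPU emit one large MemWr TLP:

#include <linux/io.h>
#include <linux/types.h>

/* Hypothetical: push a 64-byte descriptor through a write-combining BAR. */
static void example_push_wqe(void __iomem *wc_base, const u64 wqe[8])
{
	/* 8 constant u64 stores -> consecutive STRs plus the DGH hint */
	__iowrite64_copy(wc_base, wqe, 8);
}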
Link: https://lore.kernel.org/r/4-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Cc: Arnd Bergmann Cc: Catalin Marinas Cc: Will Deacon Cc: Mark Rutland Cc: linux-arch@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Reviewed-by: Catalin Marinas Signed-off-by: Jason Gunthorpe (cherry picked from commit ead79118dae6f9f982532002e82c2fb291ae0480) Signed-off-by: Jamie Nguyen Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- arch/arm64/include/asm/io.h | 132 ++++++++++++++++++++++++++++++++++++ arch/arm64/kernel/io.c | 42 ++++++++++++ 2 files changed, 174 insertions(+) diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 3b694511b98f8..bd77fe2112776 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -135,6 +135,138 @@ extern void __memset_io(volatile void __iomem *, int, size_t); #define memcpy_fromio(a,c,l) __memcpy_fromio((a),(c),(l)) #define memcpy_toio(c,a,l) __memcpy_toio((c),(a),(l)) +/* + * The ARM64 iowrite implementation is intended to support drivers that want to + * use write combining. For instance PCI drivers using write combining with a 64 + * byte __iowrite64_copy() expect to get a 64 byte MemWr TLP on the PCIe bus. + * + * Newer ARM core have sensitive write combining buffers, it is important that + * the stores be contiguous blocks of store instructions. Normal memcpy + * approaches have a very low chance to generate write combining. + * + * Since this is the only API on ARM64 that should be used with write combining + * it also integrates the DGH hint which is supposed to lower the latency to + * emit the large TLP from the CPU. + */ + +static inline void __const_memcpy_toio_aligned32(volatile u32 __iomem *to, + const u32 *from, size_t count) +{ + switch (count) { + case 8: + asm volatile("str %w0, [%8, #4 * 0]\n" + "str %w1, [%8, #4 * 1]\n" + "str %w2, [%8, #4 * 2]\n" + "str %w3, [%8, #4 * 3]\n" + "str %w4, [%8, #4 * 4]\n" + "str %w5, [%8, #4 * 5]\n" + "str %w6, [%8, #4 * 6]\n" + "str %w7, [%8, #4 * 7]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), + "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]), + "rZ"(from[6]), "rZ"(from[7]), "r"(to)); + break; + case 4: + asm volatile("str %w0, [%4, #4 * 0]\n" + "str %w1, [%4, #4 * 1]\n" + "str %w2, [%4, #4 * 2]\n" + "str %w3, [%4, #4 * 3]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), + "rZ"(from[3]), "r"(to)); + break; + case 2: + asm volatile("str %w0, [%2, #4 * 0]\n" + "str %w1, [%2, #4 * 1]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "r"(to)); + break; + case 1: + __raw_writel(*from, to); + break; + default: + BUILD_BUG(); + } +} + +void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count); + +static inline void __const_iowrite32_copy(void __iomem *to, const void *from, + size_t count) +{ + if (count == 8 || count == 4 || count == 2 || count == 1) { + __const_memcpy_toio_aligned32(to, from, count); + dgh(); + } else { + __iowrite32_copy_full(to, from, count); + } +} + +#define __iowrite32_copy(to, from, count) \ + (__builtin_constant_p(count) ? 
\ + __const_iowrite32_copy(to, from, count) : \ + __iowrite32_copy_full(to, from, count)) + +static inline void __const_memcpy_toio_aligned64(volatile u64 __iomem *to, + const u64 *from, size_t count) +{ + switch (count) { + case 8: + asm volatile("str %x0, [%8, #8 * 0]\n" + "str %x1, [%8, #8 * 1]\n" + "str %x2, [%8, #8 * 2]\n" + "str %x3, [%8, #8 * 3]\n" + "str %x4, [%8, #8 * 4]\n" + "str %x5, [%8, #8 * 5]\n" + "str %x6, [%8, #8 * 6]\n" + "str %x7, [%8, #8 * 7]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), + "rZ"(from[3]), "rZ"(from[4]), "rZ"(from[5]), + "rZ"(from[6]), "rZ"(from[7]), "r"(to)); + break; + case 4: + asm volatile("str %x0, [%4, #8 * 0]\n" + "str %x1, [%4, #8 * 1]\n" + "str %x2, [%4, #8 * 2]\n" + "str %x3, [%4, #8 * 3]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "rZ"(from[2]), + "rZ"(from[3]), "r"(to)); + break; + case 2: + asm volatile("str %x0, [%2, #8 * 0]\n" + "str %x1, [%2, #8 * 1]\n" + : + : "rZ"(from[0]), "rZ"(from[1]), "r"(to)); + break; + case 1: + __raw_writeq(*from, to); + break; + default: + BUILD_BUG(); + } +} + +void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count); + +static inline void __const_iowrite64_copy(void __iomem *to, const void *from, + size_t count) +{ + if (count == 8 || count == 4 || count == 2 || count == 1) { + __const_memcpy_toio_aligned64(to, from, count); + dgh(); + } else { + __iowrite64_copy_full(to, from, count); + } +} + +#define __iowrite64_copy(to, from, count) \ + (__builtin_constant_p(count) ? \ + __const_iowrite64_copy(to, from, count) : \ + __iowrite64_copy_full(to, from, count)) + /* * I/O memory mapping functions. */ diff --git a/arch/arm64/kernel/io.c b/arch/arm64/kernel/io.c index aa7a4ec6a3ae6..ef48089fbfe1a 100644 --- a/arch/arm64/kernel/io.c +++ b/arch/arm64/kernel/io.c @@ -37,6 +37,48 @@ void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count) } EXPORT_SYMBOL(__memcpy_fromio); +/* + * This generates a memcpy that works on a from/to address which is aligned to + * bits. Count is in terms of the number of bits sized quantities to copy. It + * optimizes to use the STR groupings when possible so that it is WC friendly. + */ +#define memcpy_toio_aligned(to, from, count, bits) \ + ({ \ + volatile u##bits __iomem *_to = to; \ + const u##bits *_from = from; \ + size_t _count = count; \ + const u##bits *_end_from = _from + ALIGN_DOWN(_count, 8); \ + \ + for (; _from < _end_from; _from += 8, _to += 8) \ + __const_memcpy_toio_aligned##bits(_to, _from, 8); \ + if ((_count % 8) >= 4) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 4); \ + _from += 4; \ + _to += 4; \ + } \ + if ((_count % 4) >= 2) { \ + __const_memcpy_toio_aligned##bits(_to, _from, 2); \ + _from += 2; \ + _to += 2; \ + } \ + if (_count % 2) \ + __const_memcpy_toio_aligned##bits(_to, _from, 1); \ + }) + +void __iowrite64_copy_full(void __iomem *to, const void *from, size_t count) +{ + memcpy_toio_aligned(to, from, count, 64); + dgh(); +} +EXPORT_SYMBOL(__iowrite64_copy_full); + +void __iowrite32_copy_full(void __iomem *to, const void *from, size_t count) +{ + memcpy_toio_aligned(to, from, count, 32); + dgh(); +} +EXPORT_SYMBOL(__iowrite32_copy_full); + /* * Copy data from "real" memory space to IO memory space. 
*/ From 048deb6ff052857d792a13065970f48b00847df4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 13:46:18 -0300 Subject: [PATCH 058/352] net: hns3: Remove io_stop_wc() calls after __iowrite64_copy() BugLink: https://bugs.launchpad.net/bugs/2071655 Now that the ARM64 arch implementation does the DGH as part of __iowrite64_copy() there is no reason to open code this in drivers. Link: https://lore.kernel.org/r/5-v3-1893cd8b9369+1925-mlx5_arm_wc_jgg@nvidia.com Reviewed-by: Jijie Shao Signed-off-by: Jason Gunthorpe (cherry picked from commit 2b7a5e1fe02231acc5d50339b2f10833565ef559) Signed-off-by: Jamie Nguyen Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 31a5d1597fc7b..a1e4a20987189 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2068,8 +2068,6 @@ static void hns3_tx_push_bd(struct hns3_enet_ring *ring, int num) __iowrite64_copy(ring->tqp->mem_base, desc, (sizeof(struct hns3_desc) * HNS3_MAX_PUSH_BD_NUM) / HNS3_BYTES_PER_64BIT); - - io_stop_wc(); } static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring) @@ -2088,8 +2086,6 @@ static void hns3_tx_mem_doorbell(struct hns3_enet_ring *ring) u64_stats_update_begin(&ring->syncp); ring->stats.tx_mem_doorbell += ring->pending_buf; u64_stats_update_end(&ring->syncp); - - io_stop_wc(); } static void hns3_tx_doorbell(struct hns3_enet_ring *ring, int num, From 15d2b4fe348ea51b2cd51dd8788b79d80bd6911e Mon Sep 17 00:00:00 2001 From: Vidya Sagar Date: Tue, 25 Jun 2024 21:01:50 +0530 Subject: [PATCH 059/352] PCI: Extend ACS configurability BugLink: https://bugs.launchpad.net/bugs/2073811 PCIe ACS settings control the level of isolation and the possible P2P paths between devices. With greater isolation the kernel will create smaller iommu_groups and with less isolation there is more HW that can achieve P2P transfers. From a virtualization perspective all devices in the same iommu_group must be assigned to the same VM as they lack security isolation. There is no way for the kernel to automatically know the correct ACS settings for any given system and workload. Existing command line options (e.g., disable_acs_redir) allow only for large scale change, disabling all isolation, but this is not sufficient for more complex cases. Add a kernel command-line option 'config_acs' to directly control all the ACS bits for specific devices, which allows the operator to setup the right level of isolation to achieve the desired P2P configuration. The definition is future proof; when new ACS bits are added to the spec the open syntax can be extended. ACS needs to be setup early in the kernel boot as the ACS settings affect how iommu_groups are formed. iommu_group formation is a one time event during initial device discovery, so changing ACS bits after kernel boot can result in an inaccurate view of the iommu_groups compared to the current isolation configuration. ACS applies to PCIe Downstream Ports and multi-function devices. The default ACS settings are strict and deny any direct traffic between two functions. This results in the smallest iommu_group the HW can support. Frequently these values result in slow or non-working P2PDMA. 
ACS offers a range of security choices controlling how traffic is allowed to go directly between two devices. Some popular choices: - Full prevention - Translated requests can be direct, with various options - Asymmetric direct traffic, A can reach B but not the reverse - All traffic can be direct Along with some other less common ones for special topologies. The intention is that this option would be used with expert knowledge of the HW capability and workload to achieve the desired configuration. Link: https://lore.kernel.org/r/20240625153150.159310-1-vidyas@nvidia.com Signed-off-by: Vidya Sagar [bhelgaas: add example, tidy printk formats] Signed-off-by: Bjorn Helgaas (cherry picked from commit 47c8846a49baa8c0b7a6a3e7e7eacd6e8d119d25) Signed-off-by: Jamie Nguyen Acked-by: Brad Figg Acked-by: Jacob Martin Acked-by: Noah Wager Signed-off-by: Brad Figg --- .../admin-guide/kernel-parameters.txt | 32 ++++ drivers/pci/pci.c | 148 +++++++++++------- 2 files changed, 122 insertions(+), 58 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index e58f3bbb7643c..1b46fd461eee5 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4572,6 +4572,38 @@ bridges without forcing it upstream. Note: this removes isolation between devices and may put more devices in an IOMMU group. + config_acs= + Format: + @[; ...] + Specify one or more PCI devices (in the format + specified above) optionally prepended with flags + and separated by semicolons. The respective + capabilities will be enabled, disabled or + unchanged based on what is specified in + flags. + + ACS Flags is defined as follows: + bit-0 : ACS Source Validation + bit-1 : ACS Translation Blocking + bit-2 : ACS P2P Request Redirect + bit-3 : ACS P2P Completion Redirect + bit-4 : ACS Upstream Forwarding + bit-5 : ACS P2P Egress Control + bit-6 : ACS Direct Translated P2P + Each bit can be marked as: + '0' – force disabled + '1' – force enabled + 'x' – unchanged + For example, + pci=config_acs=10x + would configure all devices that support + ACS to enable P2P Request Redirect, disable + Translation Blocking, and leave Source + Validation unchanged from whatever power-up + or firmware set it to. + + Note: this may remove isolation between devices + and may put more devices in an IOMMU group. force_floating [S390] Force usage of floating interrupts. nomio [S390] Do not use MIO instructions. norid [S390] ignore the RID field and force use of diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 55eabb8dfd4a6..b2ccb8e122f2d 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -947,30 +947,67 @@ void pci_request_acs(void) } static const char *disable_acs_redir_param; +static const char *config_acs_param; -/** - * pci_disable_acs_redir - disable ACS redirect capabilities - * @dev: the PCI device - * - * For only devices specified in the disable_acs_redir parameter. 
- */ -static void pci_disable_acs_redir(struct pci_dev *dev) +struct pci_acs { + u16 cap; + u16 ctrl; + u16 fw_ctrl; +}; + +static void __pci_config_acs(struct pci_dev *dev, struct pci_acs *caps, + const char *p, u16 mask, u16 flags) { + char *delimit; int ret = 0; - const char *p; - int pos; - u16 ctrl; - if (!disable_acs_redir_param) + if (!p) return; - p = disable_acs_redir_param; while (*p) { + if (!mask) { + /* Check for ACS flags */ + delimit = strstr(p, "@"); + if (delimit) { + int end; + u32 shift = 0; + + end = delimit - p - 1; + + while (end > -1) { + if (*(p + end) == '0') { + mask |= 1 << shift; + shift++; + end--; + } else if (*(p + end) == '1') { + mask |= 1 << shift; + flags |= 1 << shift; + shift++; + end--; + } else if ((*(p + end) == 'x') || (*(p + end) == 'X')) { + shift++; + end--; + } else { + pci_err(dev, "Invalid ACS flags... Ignoring\n"); + return; + } + } + p = delimit + 1; + } else { + pci_err(dev, "ACS Flags missing\n"); + return; + } + } + + if (mask & ~(PCI_ACS_SV | PCI_ACS_TB | PCI_ACS_RR | PCI_ACS_CR | + PCI_ACS_UF | PCI_ACS_EC | PCI_ACS_DT)) { + pci_err(dev, "Invalid ACS flags specified\n"); + return; + } + ret = pci_dev_str_match(dev, p, &p); if (ret < 0) { - pr_info_once("PCI: Can't parse disable_acs_redir parameter: %s\n", - disable_acs_redir_param); - + pr_info_once("PCI: Can't parse ACS command line parameter\n"); break; } else if (ret == 1) { /* Found a match */ @@ -990,56 +1027,38 @@ static void pci_disable_acs_redir(struct pci_dev *dev) if (!pci_dev_specific_disable_acs_redir(dev)) return; - pos = dev->acs_cap; - if (!pos) { - pci_warn(dev, "cannot disable ACS redirect for this hardware as it does not have ACS capabilities\n"); - return; - } - - pci_read_config_word(dev, pos + PCI_ACS_CTRL, &ctrl); + pci_dbg(dev, "ACS mask = %#06x\n", mask); + pci_dbg(dev, "ACS flags = %#06x\n", flags); - /* P2P Request & Completion Redirect */ - ctrl &= ~(PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC); + /* If mask is 0 then we copy the bit from the firmware setting. 
*/ + caps->ctrl = (caps->ctrl & ~mask) | (caps->fw_ctrl & mask); + caps->ctrl |= flags; - pci_write_config_word(dev, pos + PCI_ACS_CTRL, ctrl); - - pci_info(dev, "disabled ACS redirect\n"); + pci_info(dev, "Configured ACS to %#06x\n", caps->ctrl); } /** * pci_std_enable_acs - enable ACS on devices using standard ACS capabilities * @dev: the PCI device + * @caps: default ACS controls */ -static void pci_std_enable_acs(struct pci_dev *dev) +static void pci_std_enable_acs(struct pci_dev *dev, struct pci_acs *caps) { - int pos; - u16 cap; - u16 ctrl; - - pos = dev->acs_cap; - if (!pos) - return; - - pci_read_config_word(dev, pos + PCI_ACS_CAP, &cap); - pci_read_config_word(dev, pos + PCI_ACS_CTRL, &ctrl); - /* Source Validation */ - ctrl |= (cap & PCI_ACS_SV); + caps->ctrl |= (caps->cap & PCI_ACS_SV); /* P2P Request Redirect */ - ctrl |= (cap & PCI_ACS_RR); + caps->ctrl |= (caps->cap & PCI_ACS_RR); /* P2P Completion Redirect */ - ctrl |= (cap & PCI_ACS_CR); + caps->ctrl |= (caps->cap & PCI_ACS_CR); /* Upstream Forwarding */ - ctrl |= (cap & PCI_ACS_UF); + caps->ctrl |= (caps->cap & PCI_ACS_UF); /* Enable Translation Blocking for external devices and noats */ if (pci_ats_disabled() || dev->external_facing || dev->untrusted) - ctrl |= (cap & PCI_ACS_TB); - - pci_write_config_word(dev, pos + PCI_ACS_CTRL, ctrl); + caps->ctrl |= (caps->cap & PCI_ACS_TB); } /** @@ -1048,23 +1067,33 @@ static void pci_std_enable_acs(struct pci_dev *dev) */ static void pci_enable_acs(struct pci_dev *dev) { - if (!pci_acs_enable) - goto disable_acs_redir; + struct pci_acs caps; + int pos; + + pos = dev->acs_cap; + if (!pos) + return; - if (!pci_dev_specific_enable_acs(dev)) - goto disable_acs_redir; + pci_read_config_word(dev, pos + PCI_ACS_CAP, &caps.cap); + pci_read_config_word(dev, pos + PCI_ACS_CTRL, &caps.ctrl); + caps.fw_ctrl = caps.ctrl; - pci_std_enable_acs(dev); + /* If an iommu is present we start with kernel default caps */ + if (pci_acs_enable) { + if (pci_dev_specific_enable_acs(dev)) + pci_std_enable_acs(dev, &caps); + } -disable_acs_redir: /* - * Note: pci_disable_acs_redir() must be called even if ACS was not - * enabled by the kernel because it may have been enabled by - * platform firmware. So if we are told to disable it, we should - * always disable it after setting the kernel's default - * preferences. + * Always apply caps from the command line, even if there is no iommu. + * Trust that the admin has a reason to change the ACS settings. 
*/ - pci_disable_acs_redir(dev); + __pci_config_acs(dev, &caps, disable_acs_redir_param, + PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC, + ~(PCI_ACS_RR | PCI_ACS_CR | PCI_ACS_EC)); + __pci_config_acs(dev, &caps, config_acs_param, 0, 0); + + pci_write_config_word(dev, pos + PCI_ACS_CTRL, caps.ctrl); } /** @@ -7123,6 +7152,8 @@ static int __init pci_setup(char *str) pci_add_flags(PCI_SCAN_ALL_PCIE_DEVS); } else if (!strncmp(str, "disable_acs_redir=", 18)) { disable_acs_redir_param = str + 18; + } else if (!strncmp(str, "config_acs=", 11)) { + config_acs_param = str + 11; } else { pr_err("PCI: Unknown option `%s'\n", str); } @@ -7147,6 +7178,7 @@ static int __init pci_realloc_setup_params(void) resource_alignment_param = kstrdup(resource_alignment_param, GFP_KERNEL); disable_acs_redir_param = kstrdup(disable_acs_redir_param, GFP_KERNEL); + config_acs_param = kstrdup(config_acs_param, GFP_KERNEL); return 0; } From 78edb37752c66b1f818b215265c0a028bc4b2a3f Mon Sep 17 00:00:00 2001 From: Ram Tummala Date: Tue, 9 Jul 2024 18:45:39 -0700 Subject: [PATCH 060/352] mm: fix old/young bit handling in the faulting path BugLink: https://bugs.launchpad.net/bugs/2075396 Commit 3bd786f76de2 ("mm: convert do_set_pte() to set_pte_range()") replaced do_set_pte() with set_pte_range() and that introduced a regression in the following faulting path of non-anonymous vmas which caused the PTE for the faulting address to be marked as old instead of young. handle_pte_fault() do_pte_missing() do_fault() do_read_fault() || do_cow_fault() || do_shared_fault() finish_fault() set_pte_range() The polarity of prefault calculation is incorrect. This leads to prefault being incorrectly set for the faulting address. The following check will incorrectly mark the PTE old rather than young. On some architectures this will cause a double fault to mark it young when the access is retried. if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); On a subsequent fault on the same address, the faulting path will see a non NULL vmf->pte and instead of reaching the do_pte_missing() path, PTE will then be correctly marked young in handle_pte_fault() itself. Due to this bug, performance degradation in the fault handling path will be observed due to unnecessary double faulting. 
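To make the polarity concrete, a standalone sketch (with a userspace stand-in for the kernel's in_range() helper) showing that the old expression flagged the faulting page itself as a prefault:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Stand-in for the kernel helper: is addr within [start, start + size)? */
static bool in_range(uintptr_t addr, uintptr_t start, size_t size)
{
	return addr >= start && addr - start < size;
}

int main(void)
{
	uintptr_t fault = 0x201000, start = 0x200000; /* fault inside range */
	size_t nr = 4;

	/* Old code: 1, so the faulting address was treated as a prefault
	 * and its PTE was marked old. */
	printf("buggy prefault = %d\n", in_range(fault, start, nr * PAGE_SIZE));
	/* Fixed code: 0, so the PTE is marked young on the first fault. */
	printf("fixed prefault = %d\n", !in_range(fault, start, nr * PAGE_SIZE));
	return 0;
}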
Link: https://lkml.kernel.org/r/20240710014539.746200-1-rtummala@nvidia.com Fixes: 3bd786f76de2 ("mm: convert do_set_pte() to set_pte_range()") Signed-off-by: Ram Tummala Reviewed-by: Yin Fengwei Cc: Alistair Popple Cc: Matthew Wilcox (Oracle) Cc: Yin Fengwei Cc: Signed-off-by: Andrew Morton (backported from commit 4cd7ba16a0afb36550eed7690e73d3e7a743fa96) [context changes] Signed-off-by: Jamie Nguyen Tested-by: Carol Soto Acked-by: Brad Figg Acked-by: Noah Wager Acked-by: Jacob Martin Signed-off-by: Brad Figg --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 146a7b15c676d..b1b888a54aed7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4649,7 +4649,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio, struct vm_area_struct *vma = vmf->vma; bool uffd_wp = vmf_orig_pte_uffd_wp(vmf); bool write = vmf->flags & FAULT_FLAG_WRITE; - bool prefault = in_range(vmf->address, addr, nr * PAGE_SIZE); + bool prefault = !in_range(vmf->address, addr, nr * PAGE_SIZE); pte_t entry; flush_icache_pages(vma, page, nr); From 70753955ff08dbb09b88352c6cbf70f796202690 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 5 Aug 2024 11:44:46 -0700 Subject: [PATCH 061/352] NVIDIA: SAUCE: iommu/arm-smmu-v3: Allow default substream bypass with a pasid support BugLink: https://bugs.launchpad.net/bugs/2031320 When an iommu_domain is set to IOMMU_DOMAIN_IDENTITY, the driver would skip the allocation of a CD table and set the CONFIG field of the STE to STRTAB_STE_0_CFG_BYPASS. This works well for devices that only have one substream, i.e. PASID disabled. However, there could be a use case, for a pasid capable device, that allows bypassing the translation at the default substream while still enabling the pasid feature, which means the driver should not skip the allocation of a CD table nor simply bypass the CONFIG field. Instead, the S1DSS field should be set to STRTAB_STE_1_S1DSS_BYPASS and the SHCFG field should be set to STRTAB_STE_1_SHCFG_INCOMING. Add s1dss in struct cd_table, to allow a configuration in the finalise() to support this use case. Also, according to "13.5 Summary of attribute/permission configuration fields" in the reference manual, the SHCFG field value is irrelevant. So, set the SHCFG field of the STE always to STRTAB_STE_1_SHCFG_INCOMING for simplification. 
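Conceptually the change turns a binary decision into three cases; a simplified sketch (hypothetical names, not the driver code) of the configuration choice:

#include <stdbool.h>

/* Sketch of how the STE/CD configuration is selected after this patch. */
enum example_ste_cfg {
	EXAMPLE_STE_BYPASS,		/* no CD table, STE CONFIG = bypass */
	EXAMPLE_CD_S1DSS_BYPASS,	/* CD table, default substream bypassed */
	EXAMPLE_CD_S1DSS_SSID0,		/* CD table, default substream translated */
};

static enum example_ste_cfg
example_choose_cfg(bool identity_domain, unsigned int ssid_bits)
{
	if (identity_domain && !ssid_bits)
		return EXAMPLE_STE_BYPASS;	/* non-PASID master */
	if (identity_domain)
		return EXAMPLE_CD_S1DSS_BYPASS;	/* PASID on: S1DSS=BYPASS, SHCFG=INCOMING */
	return EXAMPLE_CD_S1DSS_SSID0;
}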
Signed-off-by: Nicolin Chen Reviewed-by: Pritesh Raithatha Signed-off-by: Jamie Nguyen Acked-by: Brad Figg Acked-by: Jacob Martin Acked-by: Noah Wager Signed-off-by: Jacob Martin --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 20 ++++++++++++++++---- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 5071a8495a78c..645f6e0974268 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1138,6 +1138,12 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->stall_enabled = master->stall_enabled; cd_table->s1cdmax = master->ssid_bits; + + if (master->domain->domain.type == IOMMU_DOMAIN_IDENTITY) + cd_table->s1dss = STRTAB_STE_1_S1DSS_BYPASS; + else + cd_table->s1dss = STRTAB_STE_1_S1DSS_SSID0; + max_contexts = 1 << cd_table->s1cdmax; if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) || @@ -1341,7 +1347,8 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, BUG_ON(ste_live); dst->data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | + FIELD_PREP(STRTAB_STE_1_S1DSS, cd_table->s1dss) | + FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | @@ -2138,7 +2145,8 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, return 0; } -static int arm_smmu_domain_finalise(struct iommu_domain *domain) +static int arm_smmu_domain_finalise(struct iommu_domain *domain, + struct arm_smmu_master *master) { int ret; unsigned long ias, oas; @@ -2150,7 +2158,11 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { + /* + * A master with a pasid capability might need a CD table, so only set + * ARM_SMMU_DOMAIN_BYPASS if IOMMU_DOMAIN_IDENTITY and non-pasid master + */ + if (domain->type == IOMMU_DOMAIN_IDENTITY && !master->ssid_bits) { smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS; return 0; } @@ -2402,7 +2414,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) if (!smmu_domain->smmu) { smmu_domain->smmu = smmu; - ret = arm_smmu_domain_finalise(domain); + ret = arm_smmu_domain_finalise(domain, master); if (ret) smmu_domain->smmu = NULL; } else if (smmu_domain->smmu != smmu) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 65fb388d51734..5abadec3d3bf6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -601,6 +601,7 @@ struct arm_smmu_ctx_desc_cfg { struct arm_smmu_l1_ctx_desc *l1_desc; unsigned int num_l1_ents; u8 s1fmt; + u8 s1dss; /* log2 of the maximum number of CDs supported by this table */ u8 s1cdmax; /* Whether CD entries in this table have the stall bit set. 
*/ From 64562eda50b7e6d19c0f2f53215f1d7e32acee60 Mon Sep 17 00:00:00 2001 From: KobaK Date: Thu, 18 Jul 2024 14:10:44 +0000 Subject: [PATCH 062/352] NVIDIA: SAUCE: acpi/prmt: find block with specific type BugLink: https://bugs.launchpad.net/bugs/2081874 PRMT needs to find the correct type of block to translate the PA-VA mapping for EFI runtime services. The issue arises because the PRMT is finding a block of type EFI_CONVENTIONAL_MEMORY, which is not appropriate for runtime services as described in Section 2.2.2 (Runtime Services) of the UEFI Specification [1]. Since the PRM handler is a type of runtime service, this causes an exception when the PRM handler is called.

[Firmware Bug]: Unable to handle paging request in EFI runtime service
WARNING: CPU: 22 PID: 4330 at drivers/firmware/efi/runtime-wrappers.c:341 __efi_queue_work+0x11c/0x170
Call trace:
 __efi_queue_work+0x11c/0x170
 efi_call_acpi_prm_handler+0x68/0xd0
 acpi_platformrt_space_handler+0x198/0x258
 acpi_ev_address_space_dispatch+0x144/0x388
 acpi_ex_access_region+0x9c/0x118
 acpi_ex_write_serial_bus+0xc4/0x218
 acpi_ex_write_data_to_field+0x168/0x218
 acpi_ex_store_object_to_node+0x1a8/0x258
 acpi_ex_store+0xec/0x330
 acpi_ex_opcode_1A_1T_1R+0x15c/0x618
 acpi_ds_exec_end_op+0x274/0x548
 acpi_ps_parse_loop+0x10c/0x6b8
 acpi_ps_parse_aml+0x140/0x3b0
 acpi_ps_execute_method+0x12c/0x2a0
 acpi_ns_evaluate+0x210/0x310
 acpi_evaluate_object+0x178/0x358
 acpi_proc_write+0x1a8/0x8a0 [acpi_call]
 proc_reg_write+0xcc/0x150
 vfs_write+0xd8/0x380
 ksys_write+0x70/0x120
 __arm64_sys_write+0x24/0x48
 invoke_syscall.constprop.0+0x80/0xf8
 do_el0_svc+0x50/0x110
 el0_svc+0x48/0x1d0
 el0t_64_sync_handler+0x15c/0x178
 el0t_64_sync+0x1a8/0x1b0

Fix this by finding a block with the specific type: PRMT now looks up a block of type EFI_RUNTIME_SERVICES_CODE for the PRM handler and blocks of type EFI_RUNTIME_SERVICES_DATA for the PRM context buffers. If no suitable block is found, a warning message is printed but the procedure continues with the next PRM handler. However, if such a PRM handler is actually called without a proper mapping, it results in a failure during error handling. Using the correct memory types for runtime services ensures that the PRM handler and the context are properly mapped in the virtual address space during runtime, preventing the paging request error. [1] https://uefi.org/sites/default/files/resources/UEFI_Spec_2_10_Aug29.pdf Fixes: cefc7ca46235 ("ACPI: PRM: implement OperationRegion handler for the PlatformRtMechanism subtype") [backported from https://patchwork.kernel.org/project/linux-acpi/patch/20240911155536.3900579-1-kobak@nvidia.com/] Signed-off-by: KobaK Reviewed-by: Matthew R.
Ochs Tested-by: Andrew Adriance aadriance@nvidia.com Acked-by: Brad Figg Acked-by: Jacob Martin Acked-by: Noah Wager Signed-off-by: Brad Figg --- drivers/acpi/prmt.c | 49 +++++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/drivers/acpi/prmt.c b/drivers/acpi/prmt.c index c78453c74ef5a..cd4a7f5491d61 100644 --- a/drivers/acpi/prmt.c +++ b/drivers/acpi/prmt.c @@ -72,15 +72,17 @@ struct prm_module_info { struct prm_handler_info handlers[] __counted_by(handler_count); }; -static u64 efi_pa_va_lookup(u64 pa) +static u64 efi_pa_va_lookup(u64 pa, u32 type) { efi_memory_desc_t *md; u64 pa_offset = pa & ~PAGE_MASK; u64 page = pa & PAGE_MASK; for_each_efi_memory_desc(md) { - if (md->phys_addr < pa && pa < md->phys_addr + PAGE_SIZE * md->num_pages) + if ((md->type == type) && + (md->phys_addr < pa && pa < md->phys_addr + PAGE_SIZE * md->num_pages)) { return pa_offset + md->virt_addr + page - md->phys_addr; + } } return 0; @@ -148,9 +150,18 @@ acpi_parse_prmt(union acpi_subtable_headers *header, const unsigned long end) th = &tm->handlers[cur_handler]; guid_copy(&th->guid, (guid_t *)handler_info->handler_guid); - th->handler_addr = (void *)efi_pa_va_lookup(handler_info->handler_address); - th->static_data_buffer_addr = efi_pa_va_lookup(handler_info->static_data_buffer_address); - th->acpi_param_buffer_addr = efi_pa_va_lookup(handler_info->acpi_param_buffer_address); + th->handler_addr = + (void *)efi_pa_va_lookup(handler_info->handler_address, EFI_RUNTIME_SERVICES_CODE); + th->static_data_buffer_addr = + efi_pa_va_lookup(handler_info->static_data_buffer_address, EFI_RUNTIME_SERVICES_DATA); + th->acpi_param_buffer_addr = + efi_pa_va_lookup(handler_info->acpi_param_buffer_address, EFI_RUNTIME_SERVICES_DATA); + + if (!th->handler_addr || !th->static_data_buffer_addr || !th->acpi_param_buffer_addr) + pr_warn( + "Idx: %llu, Parts of handler(GUID: %pUL) are missed, handler_addr %p, data_addr %p, param_addr %p", + cur_handler, &th->guid, th->handler_addr, + (void *)th->static_data_buffer_addr, (void *)th->acpi_param_buffer_addr); } while (++cur_handler < tm->handler_count && (handler_info = get_next_handler(handler_info))); return 0; @@ -250,8 +261,16 @@ static acpi_status acpi_platformrt_space_handler(u32 function, handler = find_prm_handler(&buffer->handler_guid); module = find_prm_module(&buffer->handler_guid); - if (!handler || !module) - goto invalid_guid; + if (!handler || !module) { + buffer->prm_status = PRM_HANDLER_GUID_NOT_FOUND; + return AE_OK; + } + + if (!handler->handler_addr || !handler->static_data_buffer_addr || + !handler->acpi_param_buffer_addr) { + buffer->prm_status = PRM_HANDLER_ERROR; + return AE_OK; + } ACPI_COPY_NAMESEG(context.signature, "PRMC"); context.revision = 0x0; @@ -274,8 +293,10 @@ static acpi_status acpi_platformrt_space_handler(u32 function, case PRM_CMD_START_TRANSACTION: module = find_prm_module(&buffer->handler_guid); - if (!module) - goto invalid_guid; + if (!module) { + buffer->prm_status = PRM_HANDLER_GUID_NOT_FOUND; + return AE_OK; + } if (module->updatable) module->updatable = false; @@ -286,8 +307,10 @@ static acpi_status acpi_platformrt_space_handler(u32 function, case PRM_CMD_END_TRANSACTION: module = find_prm_module(&buffer->handler_guid); - if (!module) - goto invalid_guid; + if (!module) { + buffer->prm_status = PRM_HANDLER_GUID_NOT_FOUND; + return AE_OK; + } if (module->updatable) buffer->prm_status = UPDATE_UNLOCK_WITHOUT_LOCK; @@ -302,10 +325,6 @@ static acpi_status acpi_platformrt_space_handler(u32 
function, } return AE_OK; - -invalid_guid: - buffer->prm_status = PRM_HANDLER_GUID_NOT_FOUND; - return AE_OK; } void __init init_prmt(void) From 344e5ab896dfb1fab2aa3c1dcf6bab6b0d2edf8d Mon Sep 17 00:00:00 2001 From: Brad Figg Date: Tue, 1 Oct 2024 14:58:07 -0700 Subject: [PATCH 063/352] UBUNTU: Start new release Ignore: yes Signed-off-by: Brad Figg --- debian.nvidia-adv/changelog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/debian.nvidia-adv/changelog b/debian.nvidia-adv/changelog index 5633195e75035..d688d76c86d35 100644 --- a/debian.nvidia-adv/changelog +++ b/debian.nvidia-adv/changelog @@ -1,3 +1,11 @@ +linux-nvidia-adv (6.8.0-1001.2) UNRELEASED; urgency=medium + + CHANGELOG: Do not edit directly. Autogenerated at release. + CHANGELOG: Use the printchanges target to see the current changes. + CHANGELOG: Use the insertchanges target to create the final log. + + -- Brad Figg Tue, 01 Oct 2024 14:57:25 -0700 + linux-nvidia-adv (6.8.0-1001.1) noble; urgency=medium [ Ubuntu: 6.8.0-45.45 ] From 91dcc0bbedb9cb8dad2d63e26b3f322d0a3f8343 Mon Sep 17 00:00:00 2001 From: Brad Figg Date: Tue, 1 Oct 2024 15:05:47 -0700 Subject: [PATCH 064/352] NVIDIA: linux-nvidia-adv-6.8.0-1002.2 Signed-off-by: Brad Figg --- debian.nvidia-adv/changelog | 94 +++++++++++++++++++++++++++++++++++-- 1 file changed, 90 insertions(+), 4 deletions(-) diff --git a/debian.nvidia-adv/changelog b/debian.nvidia-adv/changelog index d688d76c86d35..672012c7d9fde 100644 --- a/debian.nvidia-adv/changelog +++ b/debian.nvidia-adv/changelog @@ -1,8 +1,94 @@ -linux-nvidia-adv (6.8.0-1001.2) UNRELEASED; urgency=medium - CHANGELOG: Do not edit directly. Autogenerated at release. CHANGELOG: Use the printchanges target to see the current changes. CHANGELOG: Use the insertchanges target to create the final log.
+ * NVIDIA: SAUCE: acpi/prmt: find block with specific type (LP: #2081874) + - NVIDIA: SAUCE: acpi/prmt: find block with specific type + + * Pull-request to address ARM SMMU issue (LP: #2031320) + - NVIDIA: SAUCE: iommu/arm-smmu-v3: Allow default substream bypass with a + pasid support + + * Pull request: mm: fix old/young bit handling in the faulting path of + set_pte_range() (LP: #2075396) + - mm: fix old/young bit handling in the faulting path + + * Pull-request: Add a kernel command-line option 'config_acs' to directly + control all the ACS bits for specific devices (LP: #2073811) + - PCI: Extend ACS configurability + + * PR for: "IB/mlx5: Use __iowrite64_copy() for write combining stores" + (LP: #2071655) + - x86: Stop using weak symbols for __iowrite32_copy() + - s390: Implement __iowrite32_copy() + - s390: Stop using weak symbols for __iowrite64_copy() + - arm64/io: Provide a WC friendly __iowriteXX_copy() + - net: hns3: Remove io_stop_wc() calls after __iowrite64_copy() + + * PR for: "PCI: Clear Secondary Status errors after enumeration" + (LP: #2071654) + - PCI: Clear Secondary Status errors after enumeration + + * mlxbf_pmc: bring in latest 6.8 upstream commits (LP: #2069777) + - platform/mellanox: mlxbf-pmc: Replace uintN_t with kernel-style types + - platform/mellanox: mlxbf-pmc: Cleanup signed/unsigned mix-up + - platform/mellanox: mlxbf-pmc: mlxbf_pmc_event_list(): make size ptr optional + - platform/mellanox: mlxbf-pmc: Ignore unsupported performance blocks + - platform/mellanox: mlxbf-pmc: fix signedness bugs + + * mlxbf_gige: bring in latest 6.x upstream commits (LP: #2068067) + - mlxbf_gige: add support to display pause frame counters + + * Export kernel symbols required for NVIDIA GDS (LP: #2068544) + - NVIDIA: SAUCE: NFS: Export nvfs register and unregister functions as GPL + - NVIDIA: SAUCE: NVMe/NVMeoF: Export nvfs register and unregister functions as + GPL + + * linux-nvidia-6.5_6.5.0-1014.14 breaks with earlier BIOS release, and + modeset/resolutions are wrong (LP: #2061930) // Blacklist coresight_etm4x + (LP: #2067106) + - [Packaging] blacklist coresight_etm4x + + * backport arm64 THP improvements from 6.9 (LP: #2059316) + - arm64/mm: make set_ptes() robust when OAs cross 48-bit boundary + - arm/pgtable: define PFN_PTE_SHIFT + - nios2/pgtable: define PFN_PTE_SHIFT + - powerpc/pgtable: define PFN_PTE_SHIFT + - riscv/pgtable: define PFN_PTE_SHIFT + - s390/pgtable: define PFN_PTE_SHIFT + - sparc/pgtable: define PFN_PTE_SHIFT + - mm/pgtable: make pte_next_pfn() independent of set_ptes() + - arm/mm: use pte_next_pfn() in set_ptes() + - powerpc/mm: use pte_next_pfn() in set_ptes() + - mm/memory: factor out copying the actual PTE in copy_present_pte() + - mm/memory: pass PTE to copy_present_pte() + - mm/memory: optimize fork() with PTE-mapped THP + - mm/memory: ignore dirty/accessed/soft-dirty bits in folio_pte_batch() + - mm/memory: ignore writable bit in folio_pte_batch() + - mm: clarify the spec for set_ptes() + - mm: thp: batch-collapse PMD with set_ptes() + - mm: introduce pte_advance_pfn() and use for pte_next_pfn() + - arm64/mm: convert pte_next_pfn() to pte_advance_pfn() + - x86/mm: convert pte_next_pfn() to pte_advance_pfn() + - mm: tidy up pte_next_pfn() definition + - arm64/mm: convert READ_ONCE(*ptep) to ptep_get(ptep) + - arm64/mm: convert set_pte_at() to set_ptes(..., 1) + - arm64/mm: convert ptep_clear() to ptep_get_and_clear() + - arm64/mm: new ptep layer to manage contig bit + - arm64/mm: split __flush_tlb_range() to elide trailing DSB + - NVIDIA: [Config] arm64: ARM64_CONTPTE=y + - arm64/mm: wire up PTE_CONT for user mappings + - arm64/mm: implement new wrprotect_ptes() batch API + - arm64/mm: implement new [get_and_]clear_full_ptes() batch APIs + - mm: add pte_batch_hint() to reduce scanning in folio_pte_batch() + - arm64/mm: implement pte_batch_hint() + - arm64/mm: __always_inline to improve fork() perf + - arm64/mm: automatically fold contpte mappings + - arm64/mm: export contpte symbols only to GPL users + - arm64/mm: improve comment in contpte_ptep_get_lockless() + + * Enable GDS in the 6.8 based linux-nvidia kernel (LP: #2059814) + - NVIDIA: SAUCE: Patch NFS driver to support GDS with 6.8 Kernel + - NVIDIA: SAUCE: NVMe/NVMeOF: Patch NVMe/NVMeOF driver to support GDS on + Linux 6.8 Kernel -- Brad Figg Tue, 01 Oct 2024 14:57:25 -0700 From c05d9d0b2ed29d4c1cf4a15f910ce1e623adb686 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Tue, 1 Oct 2024 19:33:35 -0700 Subject: [PATCH 065/352] Revert "NVIDIA: SAUCE: iommu/arm-smmu-v3: Allow default substream bypass with a pasid support" This reverts commit 70753955ff08dbb09b88352c6cbf70f796202690. Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 20 ++++----------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 4 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 645f6e0974268..5071a8495a78c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1138,12 +1138,6 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->stall_enabled = master->stall_enabled; cd_table->s1cdmax = master->ssid_bits; - - if (master->domain->domain.type == IOMMU_DOMAIN_IDENTITY) - cd_table->s1dss = STRTAB_STE_1_S1DSS_BYPASS; - else - cd_table->s1dss = STRTAB_STE_1_S1DSS_SSID0; - max_contexts = 1 << cd_table->s1cdmax; if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) || @@ -1347,8 +1341,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, BUG_ON(ste_live); dst->data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_S1DSS, cd_table->s1dss) | - FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING) | + FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | @@ -2145,8 +2138,7 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, return 0; } -static int arm_smmu_domain_finalise(struct iommu_domain *domain, - struct arm_smmu_master *master) +static int arm_smmu_domain_finalise(struct iommu_domain *domain) { int ret; unsigned long ias, oas; @@ -2158,11 +2150,7 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain, struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - /* - * A master with a pasid capability might need a CD table, so only set - * ARM_SMMU_DOMAIN_BYPASS if IOMMU_DOMAIN_IDENTITY and non-pasid master - */ - if (domain->type == IOMMU_DOMAIN_IDENTITY && !master->ssid_bits) { + if (domain->type == IOMMU_DOMAIN_IDENTITY) { smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS; return 0; } @@ -2414,7 +2402,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) if (!smmu_domain->smmu) { smmu_domain->smmu
= smmu; - ret = arm_smmu_domain_finalise(domain, master); + ret = arm_smmu_domain_finalise(domain); if (ret) smmu_domain->smmu = NULL; } else if (smmu_domain->smmu != smmu) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 5abadec3d3bf6..65fb388d51734 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -601,7 +601,6 @@ struct arm_smmu_ctx_desc_cfg { struct arm_smmu_l1_ctx_desc *l1_desc; unsigned int num_l1_ents; u8 s1fmt; - u8 s1dss; /* log2 of the maximum number of CDs supported by this table */ u8 s1cdmax; /* Whether CD entries in this table have the stall bit set. */ From 4e8ed90091e9f2e13453c889988e62bb5d203ca3 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 4 Jan 2024 15:15:43 -0500 Subject: [PATCH 066/352] vfio: replace CONFIG_HAVE_KVM with IS_ENABLED(CONFIG_KVM) It is more accurate to check if KVM is enabled, instead of having the architecture say so. Architectures always "have" KVM, so for example checking CONFIG_HAVE_KVM in vfio code is pointless, but if KVM is disabled in a specific build, there is no need for support code. Alternatively, the #ifdefs could simply be deleted. However, this would add dead code. For example, when KVM is disabled, there is no need to include code in VFIO that uses symbol_get, as that symbol_get would always fail. Cc: Alex Williamson Cc: x86@kernel.org Cc: kbingham@kernel.org Signed-off-by: Paolo Bonzini (cherry picked from commit 09e33b045526085fc216bfbb216d87c8da7deb4f) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/vfio/vfio.h | 2 +- drivers/vfio/vfio_main.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio.h b/drivers/vfio/vfio.h index bde84ad344e50..50128da18bcaf 100644 --- a/drivers/vfio/vfio.h +++ b/drivers/vfio/vfio.h @@ -434,7 +434,7 @@ static inline void vfio_virqfd_exit(void) } #endif -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm); void vfio_device_put_kvm(struct vfio_device *device); #else diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 1cc93aac99a29..e97d796a54fba 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -16,7 +16,7 @@ #include #include #include -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) #include #endif #include @@ -385,7 +385,7 @@ void vfio_unregister_group_dev(struct vfio_device *device) } EXPORT_SYMBOL_GPL(vfio_unregister_group_dev); -#ifdef CONFIG_HAVE_KVM +#if IS_ENABLED(CONFIG_KVM) void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm) { void (*pfn)(struct kvm *kvm); From 20f7ada877b3cf26e7bffe02257233de2a6dd249 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 5 Feb 2024 15:32:39 +0000 Subject: [PATCH 067/352] iommu/iova: Tidy up iova_cache_get() failure Failure handling in iova_cache_get() is a little messy, and we'd like to add some more to it, so let's tidy up a bit first. By leaving the hotplug handler until last we can take advantage of kmem_cache_destroy() being NULL-safe to have a single cleanup label. We can also improve the error reporting, noting that kmem_cache_create() already screams if it fails, so that one is redundant.
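The idiom being adopted, performing the failure-prone registration last and funnelling every error path through one label that relies on kmem_cache_destroy() accepting NULL, can be modelled in a few lines. In this userspace sketch all names are placeholders: malloc()/free() stand in for the slab API (free(NULL) is a no-op, just as kmem_cache_destroy(NULL) is), and hotplug_register() is rigged to fail so the cleanup path runs:

    #include <stdio.h>
    #include <stdlib.h>

    static void *cache; /* stand-in for the kmem_cache pointer */

    static void *cache_create(void) { return malloc(64); }
    static void cache_destroy(void *c) { free(c); }  /* NULL-safe */
    static int hotplug_register(void) { return -1; } /* simulated failure */

    static int cache_get(void)
    {
        int err = -12; /* -ENOMEM */

        cache = cache_create();
        if (!cache)
            goto out_err;

        /* Register last: on failure there is nothing of its own to
         * unwind, so a single label can clean up everything. */
        err = hotplug_register();
        if (err) {
            fprintf(stderr, "couldn't register handler: %d\n", err);
            goto out_err;
        }
        return 0;

    out_err:
        cache_destroy(cache); /* safe even if cache_create() failed */
        cache = NULL;
        return err;
    }

    int main(void)
    {
        return cache_get() ? 1 : 0; /* exits 1 here: registration "fails" */
    }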
Signed-off-by: Robin Murphy Acked-by: David Rientjes Reviewed-by: Pasha Tatashin Reviewed-by: John Garry Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/ae4a3bda2d6a9b738221553c838d30473bd624e7.1707144953.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel (cherry picked from commit e7b3533c81386464dfdcb01193075f8a9557083a) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iova.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index d30e453d0fb4b..cf95001d85c0b 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -254,26 +254,20 @@ static void free_iova_mem(struct iova *iova) int iova_cache_get(void) { + int err = -ENOMEM; + mutex_lock(&iova_cache_mutex); if (!iova_cache_users) { - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", NULL, - iova_cpuhp_dead); - if (ret) { - mutex_unlock(&iova_cache_mutex); - pr_err("Couldn't register cpuhp handler\n"); - return ret; - } + iova_cache = kmem_cache_create("iommu_iova", sizeof(struct iova), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!iova_cache) + goto out_err; - iova_cache = kmem_cache_create( - "iommu_iova", sizeof(struct iova), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!iova_cache) { - cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); - mutex_unlock(&iova_cache_mutex); - pr_err("Couldn't create iova cache\n"); - return -ENOMEM; + err = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", + NULL, iova_cpuhp_dead); + if (err) { + pr_err("IOVA: Couldn't register cpuhp handler: %pe\n", ERR_PTR(err)); + goto out_err; } } @@ -281,6 +275,11 @@ int iova_cache_get(void) mutex_unlock(&iova_cache_mutex); return 0; + +out_err: + kmem_cache_destroy(iova_cache); + mutex_unlock(&iova_cache_mutex); + return err; } EXPORT_SYMBOL_GPL(iova_cache_get); From c611faa2ec82bedaf8edbc4e6a43b05009d6f088 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 5 Feb 2024 15:32:40 +0000 Subject: [PATCH 068/352] iommu/iova: Reorganise some code The iova_cache_{get,put}() calls really represent top-level lifecycle management for the whole IOVA library, so it's long been rather confusing to have them buried right in the middle of the allocator implementation details. Move them to a more expected position at the end of the file, where it will then also be easier to expand them. With this, we can also move the rcache hotplug handler (plus another stray function) into the rcache portion of the file. Signed-off-by: Robin Murphy Acked-by: David Rientjes Reviewed-by: Pasha Tatashin Reviewed-by: John Garry Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/d4753562f4faa0e6b3aeebcbf88fdb60cc22d715.1707144953.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel (cherry picked from commit 7f845d8b2eed0986a03a777d4956b52a57007974) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iova.c | 128 +++++++++++++++++++++---------------------- 1 file changed, 64 insertions(+), 64 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index cf95001d85c0b..b5de865ee50b2 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -24,24 +24,8 @@ static bool iova_rcache_insert(struct iova_domain *iovad, static unsigned long iova_rcache_get(struct iova_domain *iovad, unsigned long size, unsigned long limit_pfn); -static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); static void free_iova_rcaches(struct iova_domain *iovad); - -unsigned long iova_rcache_range(void) -{ - return PAGE_SIZE << (IOVA_RANGE_CACHE_MAX_SIZE - 1); -} - -static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node) -{ - struct iova_domain *iovad; - - iovad = hlist_entry_safe(node, struct iova_domain, cpuhp_dead); - - free_cpu_cached_iovas(cpu, iovad); - return 0; -} - +static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); static void free_global_cached_iovas(struct iova_domain *iovad); static struct iova *to_iova(struct rb_node *node) @@ -252,53 +236,6 @@ static void free_iova_mem(struct iova *iova) kmem_cache_free(iova_cache, iova); } -int iova_cache_get(void) -{ - int err = -ENOMEM; - - mutex_lock(&iova_cache_mutex); - if (!iova_cache_users) { - iova_cache = kmem_cache_create("iommu_iova", sizeof(struct iova), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!iova_cache) - goto out_err; - - err = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", - NULL, iova_cpuhp_dead); - if (err) { - pr_err("IOVA: Couldn't register cpuhp handler: %pe\n", ERR_PTR(err)); - goto out_err; - } - } - - iova_cache_users++; - mutex_unlock(&iova_cache_mutex); - - return 0; - -out_err: - kmem_cache_destroy(iova_cache); - mutex_unlock(&iova_cache_mutex); - return err; -} -EXPORT_SYMBOL_GPL(iova_cache_get); - -void iova_cache_put(void) -{ - mutex_lock(&iova_cache_mutex); - if (WARN_ON(!iova_cache_users)) { - mutex_unlock(&iova_cache_mutex); - return; - } - iova_cache_users--; - if (!iova_cache_users) { - cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); - kmem_cache_destroy(iova_cache); - } - mutex_unlock(&iova_cache_mutex); -} -EXPORT_SYMBOL_GPL(iova_cache_put); - /** * alloc_iova - allocates an iova * @iovad: - iova domain in question @@ -653,6 +590,11 @@ struct iova_rcache { struct delayed_work work; }; +unsigned long iova_rcache_range(void) +{ + return PAGE_SIZE << (IOVA_RANGE_CACHE_MAX_SIZE - 1); +} + static struct iova_magazine *iova_magazine_alloc(gfp_t flags) { struct iova_magazine *mag; @@ -989,5 +931,63 @@ static void free_global_cached_iovas(struct iova_domain *iovad) spin_unlock_irqrestore(&rcache->lock, flags); } } + +static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node) +{ + struct iova_domain *iovad; + + iovad = hlist_entry_safe(node, struct iova_domain, cpuhp_dead); + + free_cpu_cached_iovas(cpu, iovad); + return 0; +} + +int iova_cache_get(void) +{ + int err = -ENOMEM; + + mutex_lock(&iova_cache_mutex); + if (!iova_cache_users) { + iova_cache = kmem_cache_create("iommu_iova", sizeof(struct iova), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!iova_cache) + goto out_err; + + err = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", + NULL, iova_cpuhp_dead); + if (err) { + pr_err("IOVA: Couldn't register cpuhp handler: %pe\n", ERR_PTR(err)); + goto out_err; + } + } + + iova_cache_users++; + mutex_unlock(&iova_cache_mutex); + + return 0; + +out_err: + kmem_cache_destroy(iova_cache); + 
mutex_unlock(&iova_cache_mutex); + return err; +} +EXPORT_SYMBOL_GPL(iova_cache_get); + +void iova_cache_put(void) +{ + mutex_lock(&iova_cache_mutex); + if (WARN_ON(!iova_cache_users)) { + mutex_unlock(&iova_cache_mutex); + return; + } + iova_cache_users--; + if (!iova_cache_users) { + cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); + kmem_cache_destroy(iova_cache); + } + mutex_unlock(&iova_cache_mutex); +} +EXPORT_SYMBOL_GPL(iova_cache_put); + MODULE_AUTHOR("Anil S Keshavamurthy "); MODULE_LICENSE("GPL"); From 30283d728f55b96d3cddd8f2f075f95598b7f6b1 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Mon, 5 Feb 2024 15:32:41 +0000 Subject: [PATCH 069/352] iommu/iova: use named kmem_cache for iova magazines The magazine buffers can take gigabytes of kmem memory, dominating all other allocations. For observability purpose create named slab cache so the iova magazine memory overhead can be clearly observed. With this change: > slabtop -o | head Active / Total Objects (% used) : 869731 / 952904 (91.3%) Active / Total Slabs (% used) : 103411 / 103974 (99.5%) Active / Total Caches (% used) : 135 / 211 (64.0%) Active / Total Size (% used) : 395389.68K / 411430.20K (96.1%) Minimum / Average / Maximum Object : 0.02K / 0.43K / 8.00K OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME 244412 244239 99% 1.00K 61103 4 244412K iommu_iova_magazine 91636 88343 96% 0.03K 739 124 2956K kmalloc-32 75744 74844 98% 0.12K 2367 32 9468K kernfs_node_cache On this machine it is now clear that magazine use 242M of kmem memory. Acked-by: David Rientjes Signed-off-by: Pasha Tatashin [ rm: adjust to rework of iova_cache_{get,put} ] Signed-off-by: Robin Murphy Reviewed-by: John Garry Reviewed-by: Jerry Snitselaar Link: https://lore.kernel.org/r/dc5c51aaba50906a92b9ba1a5137ed462484a7be.1707144953.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel (cherry picked from commit 84e6f56be9c68b59aca51544f7ee33706542d7bc) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iova.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index b5de865ee50b2..d59d0ea2fd219 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -590,6 +590,8 @@ struct iova_rcache { struct delayed_work work; }; +static struct kmem_cache *iova_magazine_cache; + unsigned long iova_rcache_range(void) { return PAGE_SIZE << (IOVA_RANGE_CACHE_MAX_SIZE - 1); @@ -599,7 +601,7 @@ static struct iova_magazine *iova_magazine_alloc(gfp_t flags) { struct iova_magazine *mag; - mag = kmalloc(sizeof(*mag), flags); + mag = kmem_cache_alloc(iova_magazine_cache, flags); if (mag) mag->size = 0; @@ -608,7 +610,7 @@ static struct iova_magazine *iova_magazine_alloc(gfp_t flags) static void iova_magazine_free(struct iova_magazine *mag) { - kfree(mag); + kmem_cache_free(iova_magazine_cache, mag); } static void @@ -953,6 +955,12 @@ int iova_cache_get(void) if (!iova_cache) goto out_err; + iova_magazine_cache = kmem_cache_create("iommu_iova_magazine", + sizeof(struct iova_magazine), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!iova_magazine_cache) + goto out_err; + err = cpuhp_setup_state_multi(CPUHP_IOMMU_IOVA_DEAD, "iommu/iova:dead", NULL, iova_cpuhp_dead); if (err) { @@ -968,6 +976,7 @@ int iova_cache_get(void) out_err: kmem_cache_destroy(iova_cache); + kmem_cache_destroy(iova_magazine_cache); mutex_unlock(&iova_cache_mutex); return err; } @@ -984,6 +993,7 @@ void iova_cache_put(void) if (!iova_cache_users) { cpuhp_remove_multi_state(CPUHP_IOMMU_IOVA_DEAD); kmem_cache_destroy(iova_cache); + kmem_cache_destroy(iova_magazine_cache); } mutex_unlock(&iova_cache_mutex); } From b6f431b320070f30b6064eeb2726027dffb00290 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 5 Feb 2024 16:43:27 +0000 Subject: [PATCH 070/352] iommu/ipmmu-vmsa: Minor cleanups Remove the of_match_ptr() which was supposed to have gone long ago, but managed to get lost in a fix-squashing mishap. On a similar theme, we may as well also modernise the PM ops to get rid of the clunky #ifdefs, and modernise the resource mapping to keep the checkpatch brigade happy. Link: https://lore.kernel.org/linux-iommu/Yxni3d6CdI3FZ5D+@8bytes.org/ Signed-off-by: Robin Murphy Reviewed-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/791877b0d310dc2ab7dc616d2786ab24252b9b8e.1707151207.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel (cherry picked from commit f2d6677ad5772d4972b11dc3a4c3e555973b27d9) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/ipmmu-vmsa.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index ace1fc4bd34b0..90d3f03242db8 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -1005,7 +1005,6 @@ static const struct of_device_id ipmmu_of_ids[] = { static int ipmmu_probe(struct platform_device *pdev) { struct ipmmu_vmsa_device *mmu; - struct resource *res; int irq; int ret; @@ -1025,8 +1024,7 @@ static int ipmmu_probe(struct platform_device *pdev) return ret; /* Map I/O memory and request IRQ.
*/ - res = platform_get_resource(pdev, IORESOURCE_MEM, 0); - mmu->base = devm_ioremap_resource(&pdev->dev, res); + mmu->base = devm_platform_ioremap_resource(pdev, 0); if (IS_ERR(mmu->base)) return PTR_ERR(mmu->base); @@ -1123,7 +1121,6 @@ static void ipmmu_remove(struct platform_device *pdev) ipmmu_device_reset(mmu); } -#ifdef CONFIG_PM_SLEEP static int ipmmu_resume_noirq(struct device *dev) { struct ipmmu_vmsa_device *mmu = dev_get_drvdata(dev); @@ -1153,18 +1150,14 @@ static int ipmmu_resume_noirq(struct device *dev) } static const struct dev_pm_ops ipmmu_pm = { - SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, ipmmu_resume_noirq) + NOIRQ_SYSTEM_SLEEP_PM_OPS(NULL, ipmmu_resume_noirq) }; -#define DEV_PM_OPS &ipmmu_pm -#else -#define DEV_PM_OPS NULL -#endif /* CONFIG_PM_SLEEP */ static struct platform_driver ipmmu_driver = { .driver = { .name = "ipmmu-vmsa", - .of_match_table = of_match_ptr(ipmmu_of_ids), - .pm = DEV_PM_OPS, + .of_match_table = ipmmu_of_ids, + .pm = pm_sleep_ptr(&ipmmu_pm), }, .probe = ipmmu_probe, .remove_new = ipmmu_remove, From f6637d1d8ca0c4084a23fb1e2fd13eb13161e97a Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Mon, 5 Feb 2024 11:56:07 +0000 Subject: [PATCH 071/352] iommu: Introduce iommu_group_mutex_assert() Add a function to check the iommu group mutex lock, so that device drivers can rely on the group mutex lock instead of adding another driver-level lock before modifying driver-specific device data structures. Suggested-by: Jason Gunthorpe Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240205115615.6053-10-vasant.hegde@amd.com Signed-off-by: Joerg Roedel (cherry picked from commit bf8aff2945ba4091f503df673b9df33002546e6a) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommu.c | 19 +++++++++++++++++++ include/linux/iommu.h | 8 ++++++++ 2 files changed, 27 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index e606d250d1d55..a3dd0a0171836 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1259,6 +1259,25 @@ void iommu_group_remove_device(struct device *dev) } EXPORT_SYMBOL_GPL(iommu_group_remove_device); +#if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) +/** + * iommu_group_mutex_assert - Check device group mutex lock + * @dev: the device that has group param set + * + * This function is called by an iommu driver to check whether it holds + * group mutex lock for the given device or not. + * + * Note that this function must be called after device group param is set.
+ */ +void iommu_group_mutex_assert(struct device *dev) +{ + struct iommu_group *group = dev->iommu_group; + + lockdep_assert_held(&group->mutex); +} +EXPORT_SYMBOL_GPL(iommu_group_mutex_assert); +#endif + static struct device *iommu_group_first_dev(struct iommu_group *group) { lockdep_assert_held(&group->mutex); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b58f15b914abc..8fefb06085166 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1344,6 +1344,14 @@ static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) static inline void iommu_free_global_pasid(ioasid_t pasid) {} #endif /* CONFIG_IOMMU_API */ +#if IS_ENABLED(CONFIG_LOCKDEP) && IS_ENABLED(CONFIG_IOMMU_API) +void iommu_group_mutex_assert(struct device *dev); +#else +static inline void iommu_group_mutex_assert(struct device *dev) +{ +} +#endif + /** * iommu_map_sgtable - Map the given buffer to the IOMMU domain * @domain: The IOMMU domain to perform the mapping From b70ecfbe511ea496964a04758658c9107c3331ea Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:12 +0800 Subject: [PATCH 072/352] iommu: Move iommu fault data to linux/iommu.h The iommu fault data is currently defined in uapi/linux/iommu.h, but is only used inside the iommu subsystem. Move it to linux/iommu.h, where it will be more accessible to kernel drivers. With this done, uapi/linux/iommu.h becomes empty and can be removed from the tree. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-2-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 00a9bc6070434814d39118a0de70c1645f64bf60) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- MAINTAINERS | 1 - include/linux/iommu.h | 152 +++++++++++++++++++++++++++++++++- include/uapi/linux/iommu.h | 161 ------------------------------------- 3 files changed, 151 insertions(+), 163 deletions(-) delete mode 100644 include/uapi/linux/iommu.h diff --git a/MAINTAINERS b/MAINTAINERS index 29e9003d123a8..5c71554a5392e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11264,7 +11264,6 @@ F: drivers/iommu/ F: include/linux/iommu.h F: include/linux/iova.h F: include/linux/of_iommu.h -F: include/uapi/linux/iommu.h IOMMUFD M: Jason Gunthorpe diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 8fefb06085166..5e23abcdd402d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -14,7 +14,6 @@ #include #include #include -#include #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) @@ -44,6 +43,157 @@ struct iommu_sva; struct iommu_fault_event; struct iommu_dma_cookie; +#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ +#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ +#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */ +#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */ + +/* Generic fault types, can be expanded IRQ remapping fault */ +enum iommu_fault_type { + IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ + IOMMU_FAULT_PAGE_REQ, /* page request fault */ +}; + +enum iommu_fault_reason { + IOMMU_FAULT_REASON_UNKNOWN = 0, + + /* Could not access the PASID table (fetch caused external abort) */ + IOMMU_FAULT_REASON_PASID_FETCH, + + /* PASID entry is invalid or has configuration errors */ + IOMMU_FAULT_REASON_BAD_PASID_ENTRY, + + /* + * PASID is out of range (e.g. 
exceeds the maximum PASID + * supported by the IOMMU) or disabled. + */ + IOMMU_FAULT_REASON_PASID_INVALID, + + /* + * An external abort occurred fetching (or updating) a translation + * table descriptor + */ + IOMMU_FAULT_REASON_WALK_EABT, + + /* + * Could not access the page table entry (Bad address), + * actual translation fault + */ + IOMMU_FAULT_REASON_PTE_FETCH, + + /* Protection flag check failed */ + IOMMU_FAULT_REASON_PERMISSION, + + /* access flag check failed */ + IOMMU_FAULT_REASON_ACCESS, + + /* Output address of a translation stage caused Address Size fault */ + IOMMU_FAULT_REASON_OOR_ADDRESS, +}; + +/** + * struct iommu_fault_unrecoverable - Unrecoverable fault data + * @reason: reason of the fault, from &enum iommu_fault_reason + * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) + * @pasid: Process Address Space ID + * @perm: requested permission access using by the incoming transaction + * (IOMMU_FAULT_PERM_* values) + * @addr: offending page address + * @fetch_addr: address that caused a fetch abort, if any + */ +struct iommu_fault_unrecoverable { + __u32 reason; +#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) +#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) +#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) + __u32 flags; + __u32 pasid; + __u32 perm; + __u64 addr; + __u64 fetch_addr; +}; + +/** + * struct iommu_fault_page_request - Page Request data + * @flags: encodes whether the corresponding fields are valid and whether this + * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). + * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response + * must have the same PASID value as the page request. When it is clear, + * the page response should not have a PASID. + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) + * @addr: page address + * @private_data: device-specific private information + */ +struct iommu_fault_page_request { +#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) +#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) +#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) +#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u64 private_data[2]; +}; + +/** + * struct iommu_fault - Generic fault data + * @type: fault type from &enum iommu_fault_type + * @padding: reserved for future use (should be zero) + * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV + * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ + * @padding2: sets the fault size to allow for future extensions + */ +struct iommu_fault { + __u32 type; + __u32 padding; + union { + struct iommu_fault_unrecoverable event; + struct iommu_fault_page_request prm; + __u8 padding2[56]; + }; +}; + +/** + * enum iommu_page_response_code - Return status of fault handlers + * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is "Success" in PCI PRI. + * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from + * this device if possible. This is "Response Failure" in PCI PRI. + * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. This is "Invalid Request" in PCI PRI. 
+ */ +enum iommu_page_response_code { + IOMMU_PAGE_RESP_SUCCESS = 0, + IOMMU_PAGE_RESP_INVALID, + IOMMU_PAGE_RESP_FAILURE, +}; + +/** + * struct iommu_page_response - Generic page response information + * @argsz: User filled size of this data + * @version: API version of this structure + * @flags: encodes whether the corresponding fields are valid + * (IOMMU_FAULT_PAGE_RESPONSE_* values) + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @code: response code from &enum iommu_page_response_code + */ +struct iommu_page_response { + __u32 argsz; +#define IOMMU_PAGE_RESP_VERSION_1 1 + __u32 version; +#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) + __u32 flags; + __u32 pasid; + __u32 grpid; + __u32 code; +}; + + /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 #define IOMMU_FAULT_WRITE 0x1 diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h deleted file mode 100644 index 65d8b0234f690..0000000000000 --- a/include/uapi/linux/iommu.h +++ /dev/null @@ -1,161 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * IOMMU user API definitions - */ - -#ifndef _UAPI_IOMMU_H -#define _UAPI_IOMMU_H - -#include - -#define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ -#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ -#define IOMMU_FAULT_PERM_EXEC (1 << 2) /* exec */ -#define IOMMU_FAULT_PERM_PRIV (1 << 3) /* privileged */ - -/* Generic fault types, can be expanded IRQ remapping fault */ -enum iommu_fault_type { - IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ - IOMMU_FAULT_PAGE_REQ, /* page request fault */ -}; - -enum iommu_fault_reason { - IOMMU_FAULT_REASON_UNKNOWN = 0, - - /* Could not access the PASID table (fetch caused external abort) */ - IOMMU_FAULT_REASON_PASID_FETCH, - - /* PASID entry is invalid or has configuration errors */ - IOMMU_FAULT_REASON_BAD_PASID_ENTRY, - - /* - * PASID is out of range (e.g. exceeds the maximum PASID - * supported by the IOMMU) or disabled. - */ - IOMMU_FAULT_REASON_PASID_INVALID, - - /* - * An external abort occurred fetching (or updating) a translation - * table descriptor - */ - IOMMU_FAULT_REASON_WALK_EABT, - - /* - * Could not access the page table entry (Bad address), - * actual translation fault - */ - IOMMU_FAULT_REASON_PTE_FETCH, - - /* Protection flag check failed */ - IOMMU_FAULT_REASON_PERMISSION, - - /* access flag check failed */ - IOMMU_FAULT_REASON_ACCESS, - - /* Output address of a translation stage caused Address Size fault */ - IOMMU_FAULT_REASON_OOR_ADDRESS, -}; - -/** - * struct iommu_fault_unrecoverable - Unrecoverable fault data - * @reason: reason of the fault, from &enum iommu_fault_reason - * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) - * @pasid: Process Address Space ID - * @perm: requested permission access using by the incoming transaction - * (IOMMU_FAULT_PERM_* values) - * @addr: offending page address - * @fetch_addr: address that caused a fetch abort, if any - */ -struct iommu_fault_unrecoverable { - __u32 reason; -#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) -#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) -#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) - __u32 flags; - __u32 pasid; - __u32 perm; - __u64 addr; - __u64 fetch_addr; -}; - -/** - * struct iommu_fault_page_request - Page Request data - * @flags: encodes whether the corresponding fields are valid and whether this - * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values). 
- * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response - * must have the same PASID value as the page request. When it is clear, - * the page response should not have a PASID. - * @pasid: Process Address Space ID - * @grpid: Page Request Group Index - * @perm: requested page permissions (IOMMU_FAULT_PERM_* values) - * @addr: page address - * @private_data: device-specific private information - */ -struct iommu_fault_page_request { -#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID (1 << 0) -#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) -#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) -#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 perm; - __u64 addr; - __u64 private_data[2]; -}; - -/** - * struct iommu_fault - Generic fault data - * @type: fault type from &enum iommu_fault_type - * @padding: reserved for future use (should be zero) - * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV - * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ - * @padding2: sets the fault size to allow for future extensions - */ -struct iommu_fault { - __u32 type; - __u32 padding; - union { - struct iommu_fault_unrecoverable event; - struct iommu_fault_page_request prm; - __u8 padding2[56]; - }; -}; - -/** - * enum iommu_page_response_code - Return status of fault handlers - * @IOMMU_PAGE_RESP_SUCCESS: Fault has been handled and the page tables - * populated, retry the access. This is "Success" in PCI PRI. - * @IOMMU_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from - * this device if possible. This is "Response Failure" in PCI PRI. - * @IOMMU_PAGE_RESP_INVALID: Could not handle this fault, don't retry the - * access. This is "Invalid Request" in PCI PRI. - */ -enum iommu_page_response_code { - IOMMU_PAGE_RESP_SUCCESS = 0, - IOMMU_PAGE_RESP_INVALID, - IOMMU_PAGE_RESP_FAILURE, -}; - -/** - * struct iommu_page_response - Generic page response information - * @argsz: User filled size of this data - * @version: API version of this structure - * @flags: encodes whether the corresponding fields are valid - * (IOMMU_FAULT_PAGE_RESPONSE_* values) - * @pasid: Process Address Space ID - * @grpid: Page Request Group Index - * @code: response code from &enum iommu_page_response_code - */ -struct iommu_page_response { - __u32 argsz; -#define IOMMU_PAGE_RESP_VERSION_1 1 - __u32 version; -#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 code; -}; - -#endif /* _UAPI_IOMMU_H */ From f6637d1d8ca0c4084a23fb1e2fd13eb13161e97a Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:13 +0800 Subject: [PATCH 073/352] iommu/arm-smmu-v3: Remove unrecoverable faults reporting No device driver registers a fault handler to handle the reported unrecoverable faults. Remove it to avoid dead code. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-3-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 66014df73b302d326b995178141a150b9a3a52b7) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 46 ++++++--------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 5071a8495a78c..f56cb94ab46a9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1461,7 +1461,6 @@ arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) { int ret; - u32 reason; u32 perm = 0; struct arm_smmu_master *master; bool ssid_valid = evt[0] & EVTQ_0_SSV; @@ -1471,16 +1470,9 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) switch (FIELD_GET(EVTQ_0_ID, evt[0])) { case EVT_ID_TRANSLATION_FAULT: - reason = IOMMU_FAULT_REASON_PTE_FETCH; - break; case EVT_ID_ADDR_SIZE_FAULT: - reason = IOMMU_FAULT_REASON_OOR_ADDRESS; - break; case EVT_ID_ACCESS_FAULT: - reason = IOMMU_FAULT_REASON_ACCESS; - break; case EVT_ID_PERMISSION_FAULT: - reason = IOMMU_FAULT_REASON_PERMISSION; break; default: return -EOPNOTSUPP; @@ -1490,6 +1482,9 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) if (evt[1] & EVTQ_1_S2) return -EFAULT; + if (!(evt[1] & EVTQ_1_STALL)) + return -EOPNOTSUPP; + if (evt[1] & EVTQ_1_RnW) perm |= IOMMU_FAULT_PERM_READ; else @@ -1501,32 +1496,17 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) if (evt[1] & EVTQ_1_PnU) perm |= IOMMU_FAULT_PERM_PRIV; - if (evt[1] & EVTQ_1_STALL) { - flt->type = IOMMU_FAULT_PAGE_REQ; - flt->prm = (struct iommu_fault_page_request) { - .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, - .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]), - .perm = perm, - .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), - }; - - if (ssid_valid) { - flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); - } - } else { - flt->type = IOMMU_FAULT_DMA_UNRECOV; - flt->event = (struct iommu_fault_unrecoverable) { - .reason = reason, - .flags = IOMMU_FAULT_UNRECOV_ADDR_VALID, - .perm = perm, - .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), - }; + flt->type = IOMMU_FAULT_PAGE_REQ; + flt->prm = (struct iommu_fault_page_request) { + .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, + .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]), + .perm = perm, + .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), + }; - if (ssid_valid) { - flt->event.flags |= IOMMU_FAULT_UNRECOV_PASID_VALID; - flt->event.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); - } + if (ssid_valid) { + flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); } mutex_lock(&smmu->streams_mutex); From 09f41920e2858611717f5a6aed2ba99c810691f7 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:14 +0800 Subject: [PATCH 074/352] iommu: Remove unrecoverable fault data The unrecoverable fault data is not used anywhere. Remove it to avoid dead code. Suggested-by: Kevin Tian Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-4-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 0edeab66eba88947dabe8634a3efd136cc771750) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- include/linux/iommu.h | 72 ++----------------------------------------- 1 file changed, 2 insertions(+), 70 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5e23abcdd402d..a0a43c40737e1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -50,67 +50,7 @@ struct iommu_dma_cookie; /* Generic fault types, can be expanded IRQ remapping fault */ enum iommu_fault_type { - IOMMU_FAULT_DMA_UNRECOV = 1, /* unrecoverable fault */ - IOMMU_FAULT_PAGE_REQ, /* page request fault */ -}; - -enum iommu_fault_reason { - IOMMU_FAULT_REASON_UNKNOWN = 0, - - /* Could not access the PASID table (fetch caused external abort) */ - IOMMU_FAULT_REASON_PASID_FETCH, - - /* PASID entry is invalid or has configuration errors */ - IOMMU_FAULT_REASON_BAD_PASID_ENTRY, - - /* - * PASID is out of range (e.g. exceeds the maximum PASID - * supported by the IOMMU) or disabled. - */ - IOMMU_FAULT_REASON_PASID_INVALID, - - /* - * An external abort occurred fetching (or updating) a translation - * table descriptor - */ - IOMMU_FAULT_REASON_WALK_EABT, - - /* - * Could not access the page table entry (Bad address), - * actual translation fault - */ - IOMMU_FAULT_REASON_PTE_FETCH, - - /* Protection flag check failed */ - IOMMU_FAULT_REASON_PERMISSION, - - /* access flag check failed */ - IOMMU_FAULT_REASON_ACCESS, - - /* Output address of a translation stage caused Address Size fault */ - IOMMU_FAULT_REASON_OOR_ADDRESS, -}; - -/** - * struct iommu_fault_unrecoverable - Unrecoverable fault data - * @reason: reason of the fault, from &enum iommu_fault_reason - * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values) - * @pasid: Process Address Space ID - * @perm: requested permission access using by the incoming transaction - * (IOMMU_FAULT_PERM_* values) - * @addr: offending page address - * @fetch_addr: address that caused a fetch abort, if any - */ -struct iommu_fault_unrecoverable { - __u32 reason; -#define IOMMU_FAULT_UNRECOV_PASID_VALID (1 << 0) -#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1) -#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID (1 << 2) - __u32 flags; - __u32 pasid; - __u32 perm; - __u64 addr; - __u64 fetch_addr; + IOMMU_FAULT_PAGE_REQ = 1, /* page request fault */ }; /** @@ -142,19 +82,11 @@ struct iommu_fault_page_request { /** * struct iommu_fault - Generic fault data * @type: fault type from &enum iommu_fault_type - * @padding: reserved for future use (should be zero) - * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ - * @padding2: sets the fault size to allow for future extensions */ struct iommu_fault { __u32 type; - __u32 padding; - union { - struct iommu_fault_unrecoverable event; - struct iommu_fault_page_request prm; - __u8 padding2[56]; - }; + struct iommu_fault_page_request prm; }; /** From 3ff2ede625675a5958510ced0a5f8950d08d60c8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:15 +0800 Subject: [PATCH 075/352] iommu: Cleanup iopf data structure definitions struct iommu_fault_page_request and struct iommu_page_response are not part of uAPI anymore. Convert them to data structures for kAPI. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-5-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 8b32a3bea2629049c484f595af7aad797e24453e) Signed-off-by: Matthew R. 
Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/io-pgfault.c | 1 - drivers/iommu/iommu.c | 4 ---- include/linux/iommu.h | 27 +++++++++++---------------- 3 files changed, 11 insertions(+), 21 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index e5b8b9110c132..24b5545352ae7 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -56,7 +56,6 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, enum iommu_page_response_code status) { struct iommu_page_response resp = { - .version = IOMMU_PAGE_RESP_VERSION_1, .pasid = iopf->fault.prm.pasid, .grpid = iopf->fault.prm.grpid, .code = status, diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index a3dd0a0171836..c673d7aa0b7a6 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1524,10 +1524,6 @@ int iommu_page_response(struct device *dev, if (!param || !param->fault_param) return -EINVAL; - if (msg->version != IOMMU_PAGE_RESP_VERSION_1 || - msg->flags & ~IOMMU_PAGE_RESP_PASID_VALID) - return -EINVAL; - /* Only send response if there is a fault report pending */ mutex_lock(¶m->fault_param->lock); if (list_empty(¶m->fault_param->faults)) { diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a0a43c40737e1..969b3ee86c661 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -71,12 +71,12 @@ struct iommu_fault_page_request { #define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1) #define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2) #define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID (1 << 3) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 perm; - __u64 addr; - __u64 private_data[2]; + u32 flags; + u32 pasid; + u32 grpid; + u32 perm; + u64 addr; + u64 private_data[2]; }; /** @@ -85,7 +85,7 @@ struct iommu_fault_page_request { * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ */ struct iommu_fault { - __u32 type; + u32 type; struct iommu_fault_page_request prm; }; @@ -106,8 +106,6 @@ enum iommu_page_response_code { /** * struct iommu_page_response - Generic page response information - * @argsz: User filled size of this data - * @version: API version of this structure * @flags: encodes whether the corresponding fields are valid * (IOMMU_FAULT_PAGE_RESPONSE_* values) * @pasid: Process Address Space ID @@ -115,14 +113,11 @@ enum iommu_page_response_code { * @code: response code from &enum iommu_page_response_code */ struct iommu_page_response { - __u32 argsz; -#define IOMMU_PAGE_RESP_VERSION_1 1 - __u32 version; #define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) - __u32 flags; - __u32 pasid; - __u32 grpid; - __u32 code; + u32 flags; + u32 pasid; + u32 grpid; + u32 code; }; From b25e5dddaf536458768102d65b0a2222af852fa2 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:16 +0800 Subject: [PATCH 076/352] iommu: Merge iopf_device_param into iommu_fault_param The struct dev_iommu contains two pointers, fault_param and iopf_param. The fault_param pointer points to a data structure that is used to store pending faults that are awaiting responses. The iopf_param pointer points to a data structure that is used to store partial faults that are part of a Page Request Group. The fault_param and iopf_param pointers are essentially duplicate. This causes memory waste. Merge the iopf_device_param pointer into the iommu_fault_param pointer to consolidate the code and save memory. 
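The consolidation is easiest to see as a before-and-after of the per-device structures. The sketch below is a simplified model with placeholder field types, not the kernel's definitions; it only shows how the two allocations collapse into one:

    struct list_head { struct list_head *next, *prev; };
    struct mutex { int owner; }; /* placeholder, not the kernel type */

    /* Before: two separately allocated per-device objects. */
    struct iopf_device_param {     /* was dev->iommu->iopf_param */
        void *dev;
        void *queue;
        struct list_head queue_list; /* membership in queue->devices */
        struct list_head partial;    /* faults of an incomplete group */
    };
    struct iommu_fault_param_old { /* was dev->iommu->fault_param */
        struct mutex lock;
        struct list_head faults;     /* faults awaiting a response */
    };

    /* After: one object, allocated when IOPF is enabled on the device
     * and freed when it is disabled. */
    struct iommu_fault_param {
        struct mutex lock;
        struct list_head faults;
        struct list_head partial;
        void *dev;
        void *queue;
        struct list_head queue_list;
    };

    int main(void) { return 0; }

With this layout a single kzalloc() in iopf_queue_add_device() covers both roles, and iopf_queue_remove_device() tears the whole thing down.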
The consolidated pointer would be allocated on demand when the device driver enables the iopf on device, and would be freed after iopf is disabled. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-6-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 15fc60cdd2d236a73b32c99d21fc0f7b7ce6cbbb) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/io-pgfault.c | 110 ++++++++++++++++++------------------- drivers/iommu/iommu.c | 34 ++---------- include/linux/iommu.h | 18 ++++-- 3 files changed, 72 insertions(+), 90 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 24b5545352ae7..f948303b2a91a 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -25,21 +25,6 @@ struct iopf_queue { struct mutex lock; }; -/** - * struct iopf_device_param - IO Page Fault data attached to a device - * @dev: the device that owns this param - * @queue: IOPF queue - * @queue_list: index into queue->devices - * @partial: faults that are part of a Page Request Group for which the last - * request hasn't been submitted yet. - */ -struct iopf_device_param { - struct device *dev; - struct iopf_queue *queue; - struct list_head queue_list; - struct list_head partial; -}; - struct iopf_fault { struct iommu_fault fault; struct list_head list; @@ -144,7 +129,7 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) int ret; struct iopf_group *group; struct iopf_fault *iopf, *next; - struct iopf_device_param *iopf_param; + struct iommu_fault_param *iopf_param; struct device *dev = cookie; struct dev_iommu *param = dev->iommu; @@ -159,7 +144,7 @@ int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) * As long as we're holding param->lock, the queue can't be unlinked * from the device and therefore cannot disappear. 
*/ - iopf_param = param->iopf_param; + iopf_param = param->fault_param; if (!iopf_param) return -ENODEV; @@ -229,14 +214,14 @@ EXPORT_SYMBOL_GPL(iommu_queue_iopf); int iopf_queue_flush_dev(struct device *dev) { int ret = 0; - struct iopf_device_param *iopf_param; + struct iommu_fault_param *iopf_param; struct dev_iommu *param = dev->iommu; if (!param) return -ENODEV; mutex_lock(&param->lock); - iopf_param = param->iopf_param; + iopf_param = param->fault_param; if (iopf_param) flush_workqueue(iopf_param->queue->wq); else @@ -260,7 +245,7 @@ EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); int iopf_queue_discard_partial(struct iopf_queue *queue) { struct iopf_fault *iopf, *next; - struct iopf_device_param *iopf_param; + struct iommu_fault_param *iopf_param; if (!queue) return -EINVAL; @@ -287,34 +272,36 @@ EXPORT_SYMBOL_GPL(iopf_queue_discard_partial); */ int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) { - int ret = -EBUSY; - struct iopf_device_param *iopf_param; + int ret = 0; struct dev_iommu *param = dev->iommu; - - if (!param) - return -ENODEV; - - iopf_param = kzalloc(sizeof(*iopf_param), GFP_KERNEL); - if (!iopf_param) - return -ENOMEM; - - INIT_LIST_HEAD(&iopf_param->partial); - iopf_param->queue = queue; - iopf_param->dev = dev; + struct iommu_fault_param *fault_param; mutex_lock(&queue->lock); mutex_lock(&param->lock); - if (!param->iopf_param) { - list_add(&iopf_param->queue_list, &queue->devices); - param->iopf_param = iopf_param; - ret = 0; + if (param->fault_param) { + ret = -EBUSY; + goto done_unlock; } + + fault_param = kzalloc(sizeof(*fault_param), GFP_KERNEL); + if (!fault_param) { + ret = -ENOMEM; + goto done_unlock; + } + + mutex_init(&fault_param->lock); + INIT_LIST_HEAD(&fault_param->faults); + INIT_LIST_HEAD(&fault_param->partial); + fault_param->dev = dev; + list_add(&fault_param->queue_list, &queue->devices); + fault_param->queue = queue; + + param->fault_param = fault_param; + +done_unlock: mutex_unlock(&param->lock); mutex_unlock(&queue->lock); - if (ret) - kfree(iopf_param); - return ret; } EXPORT_SYMBOL_GPL(iopf_queue_add_device); @@ -330,34 +317,41 @@ EXPORT_SYMBOL_GPL(iopf_queue_add_device); */ int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) { - int ret = -EINVAL; + int ret = 0; struct iopf_fault *iopf, *next; - struct iopf_device_param *iopf_param; struct dev_iommu *param = dev->iommu; - - if (!param || !queue) - return -EINVAL; + struct iommu_fault_param *fault_param = param->fault_param; mutex_lock(&queue->lock); mutex_lock(&param->lock); - iopf_param = param->iopf_param; - if (iopf_param && iopf_param->queue == queue) { - list_del(&iopf_param->queue_list); - param->iopf_param = NULL; - ret = 0; + if (!fault_param) { + ret = -ENODEV; + goto unlock; } - mutex_unlock(&param->lock); - mutex_unlock(&queue->lock); - if (ret) - return ret; + + if (fault_param->queue != queue) { + ret = -EINVAL; + goto unlock; + } + + if (!list_empty(&fault_param->faults)) { + ret = -EBUSY; + goto unlock; + } + + list_del(&fault_param->queue_list); /* Just in case some faults are still stuck */ - list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) + list_for_each_entry_safe(iopf, next, &fault_param->partial, list) kfree(iopf); - kfree(iopf_param); + param->fault_param = NULL; + kfree(fault_param); +unlock: + mutex_unlock(&param->lock); + mutex_unlock(&queue->lock); - return 0; + return ret; } EXPORT_SYMBOL_GPL(iopf_queue_remove_device); @@ -403,7 +397,7 @@ EXPORT_SYMBOL_GPL(iopf_queue_alloc); */ void iopf_queue_free(struct iopf_queue *queue) { - 
struct iopf_device_param *iopf_param, *next; + struct iommu_fault_param *iopf_param, *next; if (!queue) return; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index c673d7aa0b7a6..2701b9a6b0e97 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1385,27 +1385,18 @@ int iommu_register_device_fault_handler(struct device *dev, struct dev_iommu *param = dev->iommu; int ret = 0; - if (!param) + if (!param || !param->fault_param) return -EINVAL; mutex_lock(&param->lock); /* Only allow one fault handler registered for each device */ - if (param->fault_param) { + if (param->fault_param->handler) { ret = -EBUSY; goto done_unlock; } - get_device(dev); - param->fault_param = kzalloc(sizeof(*param->fault_param), GFP_KERNEL); - if (!param->fault_param) { - put_device(dev); - ret = -ENOMEM; - goto done_unlock; - } param->fault_param->handler = handler; param->fault_param->data = data; - mutex_init(&param->fault_param->lock); - INIT_LIST_HEAD(&param->fault_param->faults); done_unlock: mutex_unlock(&param->lock); @@ -1426,29 +1417,16 @@ EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler); int iommu_unregister_device_fault_handler(struct device *dev) { struct dev_iommu *param = dev->iommu; - int ret = 0; - if (!param) + if (!param || !param->fault_param) return -EINVAL; mutex_lock(&param->lock); - - if (!param->fault_param) - goto unlock; - - /* we cannot unregister handler if there are pending faults */ - if (!list_empty(&param->fault_param->faults)) { - ret = -EBUSY; - goto unlock; - } - - kfree(param->fault_param); - param->fault_param = NULL; - put_device(dev); -unlock: + param->fault_param->handler = NULL; + param->fault_param->data = NULL; mutex_unlock(&param->lock); - return ret; + return 0; } EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 969b3ee86c661..a18647280d9ad 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -42,6 +42,7 @@ struct notifier_block; struct iommu_sva; struct iommu_fault_event; struct iommu_dma_cookie; +struct iopf_queue; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -673,21 +674,31 @@ struct iommu_fault_event { * struct iommu_fault_param - per-device IOMMU fault data * @handler: Callback function to handle IOMMU faults at device level * @data: handler private data - * @faults: holds the pending faults which needs response * @lock: protect pending faults list + * @dev: the device that owns this param + * @queue: IOPF queue + * @queue_list: index into queue->devices + * @partial: faults that are part of a Page Request Group for which the last + * request hasn't been submitted yet. 
+ * @faults: holds the pending faults which need response */ struct iommu_fault_param { iommu_dev_fault_handler_t handler; void *data; - struct list_head faults; struct mutex lock; + + struct device *dev; + struct iopf_queue *queue; + struct list_head queue_list; + + struct list_head partial; + struct list_head faults; }; /** * struct dev_iommu - Collection of per-device IOMMU data * * @fault_param: IOMMU detected device fault reporting data - * @iopf_param: I/O Page Fault queue and data * @fwspec: IOMMU fwspec data * @iommu_dev: IOMMU device this device is linked to * @priv: IOMMU Driver private data @@ -703,7 +714,6 @@ struct iommu_fault_param { struct dev_iommu { struct mutex lock; struct iommu_fault_param *fault_param; - struct iopf_device_param *iopf_param; struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; From d566c807e4510640508cba14dae1fc802b98e99b Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:17 +0800 Subject: [PATCH 077/352] iommu: Remove iommu_[un]register_device_fault_handler() The individual iommu driver reports the iommu page faults by calling iommu_report_device_fault(), where a pre-registered device fault handler is called to route the fault to another fault handler installed on the corresponding iommu domain. The pre-registered device fault handler is static and won't be dynamic as the fault handler is eventually per iommu domain. Replace calling device fault handler with iommu_queue_iopf(). After this replacement, the registering and unregistering fault handler interfaces are not needed anywhere. Remove the interfaces and the related data structures to avoid dead code. Convert cookie parameter of iommu_queue_iopf() into a device pointer that is really passed. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-7-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 1ff25d798e52943d037accf15c675a6845d9776f) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 13 +--- drivers/iommu/intel/iommu.c | 24 ++---- drivers/iommu/io-pgfault.c | 6 +- drivers/iommu/iommu-sva.h | 4 +- drivers/iommu/iommu.c | 76 +------------------ include/linux/iommu.h | 23 ------ 6 files changed, 13 insertions(+), 133 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 4a27fbdb2d844..4099036fa7e35 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -470,7 +470,6 @@ bool arm_smmu_master_sva_enabled(struct arm_smmu_master *master) static int arm_smmu_master_sva_enable_iopf(struct arm_smmu_master *master) { - int ret; struct device *dev = master->dev; /* @@ -483,16 +482,7 @@ static int arm_smmu_master_sva_enable_iopf(struct arm_smmu_master *master) if (!master->iopf_enabled) return -EINVAL; - ret = iopf_queue_add_device(master->smmu->evtq.iopf, dev); - if (ret) - return ret; - - ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); - if (ret) { - iopf_queue_remove_device(master->smmu->evtq.iopf, dev); - return ret; - } - return 0; + return iopf_queue_add_device(master->smmu->evtq.iopf, dev); } static void arm_smmu_master_sva_disable_iopf(struct arm_smmu_master *master) @@ -502,7 +492,6 @@ static void arm_smmu_master_sva_disable_iopf(struct arm_smmu_master *master) if (!master->iopf_enabled) return; - iommu_unregister_device_fault_handler(dev); iopf_queue_remove_device(master->smmu->evtq.iopf, dev); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index c4c6240d14f98..349599c73ad56 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4578,23 +4578,15 @@ static int intel_iommu_enable_iopf(struct device *dev) if (ret) return ret; - ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev); - if (ret) - goto iopf_remove_device; - ret = pci_enable_pri(pdev, PRQ_DEPTH); - if (ret) - goto iopf_unregister_handler; + if (ret) { + iopf_queue_remove_device(iommu->iopf_queue, dev); + return ret; + } + info->pri_enabled = 1; return 0; - -iopf_unregister_handler: - iommu_unregister_device_fault_handler(dev); -iopf_remove_device: - iopf_queue_remove_device(iommu->iopf_queue, dev); - - return ret; } static int intel_iommu_disable_iopf(struct device *dev) @@ -4617,11 +4609,9 @@ static int intel_iommu_disable_iopf(struct device *dev) info->pri_enabled = 0; /* - * With PRI disabled and outstanding PRQs drained, unregistering - * fault handler and removing device from iopf queue should never - * fail. + * With PRI disabled and outstanding PRQs drained, removing device + * from iopf queue should never fail. */ - WARN_ON(iommu_unregister_device_fault_handler(dev)); WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); return 0; diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index f948303b2a91a..4fda01de55898 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -87,7 +87,7 @@ static void iopf_handler(struct work_struct *work) /** * iommu_queue_iopf - IO Page Fault handler * @fault: fault event - * @cookie: struct device, passed to iommu_register_device_fault_handler. + * @dev: struct device. * * Add a fault to the device workqueue, to be handled by mm. * @@ -124,14 +124,12 @@ static void iopf_handler(struct work_struct *work) * * Return: 0 on success and <0 on error. 
*/ -int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) +int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) { int ret; struct iopf_group *group; struct iopf_fault *iopf, *next; struct iommu_fault_param *iopf_param; - - struct device *dev = cookie; struct dev_iommu *param = dev->iommu; lockdep_assert_held(&param->lock); diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h index 54946b5a7cafe..de7819c796cee 100644 --- a/drivers/iommu/iommu-sva.h +++ b/drivers/iommu/iommu-sva.h @@ -13,7 +13,7 @@ struct iommu_fault; struct iopf_queue; #ifdef CONFIG_IOMMU_SVA -int iommu_queue_iopf(struct iommu_fault *fault, void *cookie); +int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev); int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); int iopf_queue_remove_device(struct iopf_queue *queue, @@ -26,7 +26,7 @@ enum iommu_page_response_code iommu_sva_handle_iopf(struct iommu_fault *fault, void *data); #else /* CONFIG_IOMMU_SVA */ -static inline int iommu_queue_iopf(struct iommu_fault *fault, void *cookie) +static inline int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) { return -ENODEV; } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 2701b9a6b0e97..0e7f30318f13c 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1360,76 +1360,6 @@ void iommu_group_put(struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_group_put); -/** - * iommu_register_device_fault_handler() - Register a device fault handler - * @dev: the device - * @handler: the fault handler - * @data: private data passed as argument to the handler - * - * When an IOMMU fault event is received, this handler gets called with the - * fault event and data as argument. The handler should return 0 on success. If - * the fault is recoverable (IOMMU_FAULT_PAGE_REQ), the consumer should also - * complete the fault by calling iommu_page_response() with one of the following - * response code: - * - IOMMU_PAGE_RESP_SUCCESS: retry the translation - * - IOMMU_PAGE_RESP_INVALID: terminate the fault - * - IOMMU_PAGE_RESP_FAILURE: terminate the fault and stop reporting - * page faults if possible. - * - * Return 0 if the fault handler was installed successfully, or an error. - */ -int iommu_register_device_fault_handler(struct device *dev, - iommu_dev_fault_handler_t handler, - void *data) -{ - struct dev_iommu *param = dev->iommu; - int ret = 0; - - if (!param || !param->fault_param) - return -EINVAL; - - mutex_lock(&param->lock); - /* Only allow one fault handler registered for each device */ - if (param->fault_param->handler) { - ret = -EBUSY; - goto done_unlock; - } - - param->fault_param->handler = handler; - param->fault_param->data = data; - -done_unlock: - mutex_unlock(&param->lock); - - return ret; -} -EXPORT_SYMBOL_GPL(iommu_register_device_fault_handler); - -/** - * iommu_unregister_device_fault_handler() - Unregister the device fault handler - * @dev: the device - * - * Remove the device fault handler installed with - * iommu_register_device_fault_handler(). - * - * Return 0 on success, or an error. 
- */ -int iommu_unregister_device_fault_handler(struct device *dev) -{ - struct dev_iommu *param = dev->iommu; - - if (!param || !param->fault_param) - return -EINVAL; - - mutex_lock(&param->lock); - param->fault_param->handler = NULL; - param->fault_param->data = NULL; - mutex_unlock(&param->lock); - - return 0; -} -EXPORT_SYMBOL_GPL(iommu_unregister_device_fault_handler); - /** * iommu_report_device_fault() - Report fault event to device driver * @dev: the device @@ -1454,10 +1384,6 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) /* we only report device fault if there is a handler registered */ mutex_lock(&param->lock); fparam = param->fault_param; - if (!fparam || !fparam->handler) { - ret = -EINVAL; - goto done_unlock; - } if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { @@ -1472,7 +1398,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) mutex_unlock(&fparam->lock); } - ret = fparam->handler(&evt->fault, fparam->data); + ret = iommu_queue_iopf(&evt->fault, dev); if (ret && evt_pending) { mutex_lock(&fparam->lock); list_del(&evt_pending->list); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a18647280d9ad..b68239c09b1b1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -128,7 +128,6 @@ struct iommu_page_response { typedef int (*iommu_fault_handler_t)(struct iommu_domain *, struct device *, unsigned long, int, void *); -typedef int (*iommu_dev_fault_handler_t)(struct iommu_fault *, void *); struct iommu_domain_geometry { dma_addr_t aperture_start; /* First address that can be mapped */ @@ -672,8 +671,6 @@ struct iommu_fault_event { /** * struct iommu_fault_param - per-device IOMMU fault data - * @handler: Callback function to handle IOMMU faults at device level - * @data: handler private data * @lock: protect pending faults list * @dev: the device that owns this param * @queue: IOPF queue @@ -683,8 +680,6 @@ struct iommu_fault_event { * @faults: holds the pending faults which need response */ struct iommu_fault_param { - iommu_dev_fault_handler_t handler; - void *data; struct mutex lock; struct device *dev; @@ -807,11 +802,6 @@ extern int iommu_group_for_each_dev(struct iommu_group *group, void *data, extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_register_device_fault_handler(struct device *dev, - iommu_dev_fault_handler_t handler, - void *data); - -extern int iommu_unregister_device_fault_handler(struct device *dev); extern int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt); @@ -1226,19 +1216,6 @@ static inline void iommu_group_put(struct iommu_group *group) { } -static inline -int iommu_register_device_fault_handler(struct device *dev, - iommu_dev_fault_handler_t handler, - void *data) -{ - return -ENODEV; -} - -static inline int iommu_unregister_device_fault_handler(struct device *dev) -{ - return 0; -} - static inline int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) { From 65025de50d6c0f46cfc5bc014698b79cf60261d6 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:18 +0800 Subject: [PATCH 078/352] iommu: Merge iommu_fault_event and iopf_fault The iommu_fault_event and iopf_fault data structures store the same information about an iopf fault. They are also used in the same way. 
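Before the merge, the two definitions reduced to the same pair of fields; the fragment below sketches the duplication with stub types so it stands alone (the real struct iommu_fault and struct list_head come from the kernel headers).

    /* Stub types so this sketch compiles standalone; the kernel's own
     * definitions are richer. */
    struct list_head { struct list_head *next, *prev; };
    struct iommu_fault { unsigned int type; /* prm etc. omitted */ };

    /* include/linux/iommu.h had: */
    struct iommu_fault_event {
        struct iommu_fault fault;
        struct list_head list;
    };

    /* drivers/iommu/io-pgfault.c had the same layout under another name: */
    struct iopf_fault {
        struct iommu_fault fault;
        struct list_head list;  /* node for pending lists */
    };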
Merge these two data structures into a single one to make the code more concise and easier to maintain. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-8-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 3f02a9dc70007c0e6299fda9c4f7a1e2277ec3d2) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 +-- drivers/iommu/intel/iommu.h | 2 +- drivers/iommu/intel/svm.c | 5 ++-- drivers/iommu/io-pgfault.c | 5 ---- drivers/iommu/iommu.c | 8 +++--- include/linux/iommu.h | 27 ++++++--------------- 6 files changed, 17 insertions(+), 34 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index f56cb94ab46a9..daef40bc587af 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -922,7 +922,7 @@ static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu, } static int arm_smmu_page_response(struct device *dev, - struct iommu_fault_event *unused, + struct iopf_fault *unused, struct iommu_page_response *resp) { struct arm_smmu_cmdq_ent cmd = {0}; @@ -1465,7 +1465,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) struct arm_smmu_master *master; bool ssid_valid = evt[0] & EVTQ_0_SSV; u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]); - struct iommu_fault_event fault_evt = { }; + struct iopf_fault fault_evt = { }; struct iommu_fault *flt = &fault_evt.fault; switch (FIELD_GET(EVTQ_0_ID, evt[0])) { diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index cd267ba64eda1..a03959ec439f1 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1096,7 +1096,7 @@ struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid); void intel_svm_check(struct intel_iommu *iommu); int intel_svm_enable_prq(struct intel_iommu *iommu); int intel_svm_finish_prq(struct intel_iommu *iommu); -int intel_svm_page_response(struct device *dev, struct iommu_fault_event *evt, +int intel_svm_page_response(struct device *dev, struct iopf_fault *evt, struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(void); void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 4d269df0082fb..8596d35be8f68 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -565,13 +565,12 @@ static int prq_to_iommu_prot(struct page_req_dsc *req) static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, struct page_req_dsc *desc) { - struct iommu_fault_event event; + struct iopf_fault event = { }; if (!dev || !dev_is_pci(dev)) return -ENODEV; /* Fill in event data for device specific processing */ - memset(&event, 0, sizeof(struct iommu_fault_event)); event.fault.type = IOMMU_FAULT_PAGE_REQ; event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; event.fault.prm.pasid = desc->pasid; @@ -744,7 +743,7 @@ static irqreturn_t prq_event_thread(int irq, void *d) } int intel_svm_page_response(struct device *dev, - struct iommu_fault_event *evt, + struct iopf_fault *evt, struct iommu_page_response *msg) { struct device_domain_info *info = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 4fda01de55898..10d48eb72608c 
100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -25,11 +25,6 @@ struct iopf_queue { struct mutex lock; }; -struct iopf_fault { - struct iommu_fault fault; - struct list_head list; -}; - struct iopf_group { struct iopf_fault last_fault; struct list_head faults; diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 0e7f30318f13c..dc62b0b59e4b0 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -1371,10 +1371,10 @@ EXPORT_SYMBOL_GPL(iommu_group_put); * * Return 0 on success, or an error. */ -int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { struct dev_iommu *param = dev->iommu; - struct iommu_fault_event *evt_pending = NULL; + struct iopf_fault *evt_pending = NULL; struct iommu_fault_param *fparam; int ret = 0; @@ -1387,7 +1387,7 @@ int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - evt_pending = kmemdup(evt, sizeof(struct iommu_fault_event), + evt_pending = kmemdup(evt, sizeof(struct iopf_fault), GFP_KERNEL); if (!evt_pending) { ret = -ENOMEM; @@ -1416,7 +1416,7 @@ int iommu_page_response(struct device *dev, { bool needs_pasid; int ret = -EINVAL; - struct iommu_fault_event *evt; + struct iopf_fault *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; const struct iommu_ops *ops = dev_iommu_ops(dev); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b68239c09b1b1..02512ee32c87f 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -40,7 +40,6 @@ struct iommu_domain_ops; struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; -struct iommu_fault_event; struct iommu_dma_cookie; struct iopf_queue; @@ -121,6 +120,11 @@ struct iommu_page_response { u32 code; }; +struct iopf_fault { + struct iommu_fault fault; + /* node for pending lists */ + struct list_head list; +}; /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 @@ -553,7 +557,7 @@ struct iommu_ops { int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); int (*page_response)(struct device *dev, - struct iommu_fault_event *evt, + struct iopf_fault *evt, struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); @@ -655,20 +659,6 @@ struct iommu_device { u32 max_pasids; }; -/** - * struct iommu_fault_event - Generic fault event - * - * Can represent recoverable faults such as a page requests or - * unrecoverable faults such as DMA or IRQ remapping faults. 
- * - * @fault: fault descriptor - * @list: pending fault event list, used for tracking responses - */ -struct iommu_fault_event { - struct iommu_fault fault; - struct list_head list; -}; - /** * struct iommu_fault_param - per-device IOMMU fault data * @lock: protect pending faults list @@ -803,8 +793,7 @@ extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_report_device_fault(struct device *dev, - struct iommu_fault_event *evt); +extern int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); extern int iommu_page_response(struct device *dev, struct iommu_page_response *msg); @@ -1217,7 +1206,7 @@ static inline void iommu_group_put(struct iommu_group *group) } static inline -int iommu_report_device_fault(struct device *dev, struct iommu_fault_event *evt) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { return -ENODEV; } From 1ee9ec19ca0ce3e9d8258006a9c885b5b38bdfdc Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:19 +0800 Subject: [PATCH 079/352] iommu: Prepare for separating SVA and IOPF Move iopf_group data structure to iommu.h to make it a minimal set of faults that a domain's page fault handler should handle. Add a new function, iopf_free_group(), to free a fault group after all faults in the group are handled. This function will be made global so that it can be called from other files, such as iommu-sva.c. Move iopf_queue data structure to iommu.h to allow the workqueue to be scheduled out of this file. This will simplify the sequential patches. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-9-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 24b5d268b5ab95c12b5ae58a054d04bfa442f58f) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/io-pgfault.c | 39 ++++++++++++++------------------------ include/linux/iommu.h | 20 ++++++++++++++++++- 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 10d48eb72608c..c7e6bbed5c05b 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -13,24 +13,17 @@ #include "iommu-sva.h" -/** - * struct iopf_queue - IO Page Fault queue - * @wq: the fault workqueue - * @devices: devices attached to this queue - * @lock: protects the device list - */ -struct iopf_queue { - struct workqueue_struct *wq; - struct list_head devices; - struct mutex lock; -}; - -struct iopf_group { - struct iopf_fault last_fault; - struct list_head faults; - struct work_struct work; - struct device *dev; -}; +static void iopf_free_group(struct iopf_group *group) +{ + struct iopf_fault *iopf, *next; + + list_for_each_entry_safe(iopf, next, &group->faults, list) { + if (!(iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) + kfree(iopf); + } + + kfree(group); +} static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, enum iommu_page_response_code status) @@ -50,9 +43,9 @@ static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, static void iopf_handler(struct work_struct *work) { + struct iopf_fault *iopf; struct iopf_group *group; struct iommu_domain *domain; - struct iopf_fault *iopf, *next; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); @@ -61,7 +54,7 @@ static void iopf_handler(struct work_struct *work) if (!domain || !domain->iopf_handler) status = IOMMU_PAGE_RESP_INVALID; - list_for_each_entry_safe(iopf, next, &group->faults, list) { + list_for_each_entry(iopf, &group->faults, list) { /* * For the moment, errors are sticky: don't handle subsequent * faults in the group if there is an error. 
@@ -69,14 +62,10 @@ static void iopf_handler(struct work_struct *work) if (status == IOMMU_PAGE_RESP_SUCCESS) status = domain->iopf_handler(&iopf->fault, domain->fault_data); - - if (!(iopf->fault.prm.flags & - IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) - kfree(iopf); } iopf_complete_group(group->dev, &group->last_fault, status); - kfree(group); + iopf_free_group(group); } /** diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 02512ee32c87f..38646929e13c0 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -41,7 +41,6 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; -struct iopf_queue; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -126,6 +125,25 @@ struct iopf_fault { struct list_head list; }; +struct iopf_group { + struct iopf_fault last_fault; + struct list_head faults; + struct work_struct work; + struct device *dev; +}; + +/** + * struct iopf_queue - IO Page Fault queue + * @wq: the fault workqueue + * @devices: devices attached to this queue + * @lock: protects the device list + */ +struct iopf_queue { + struct workqueue_struct *wq; + struct list_head devices; + struct mutex lock; +}; + /* iommu fault flags */ #define IOMMU_FAULT_READ 0x0 #define IOMMU_FAULT_WRITE 0x1 From 49459526e79ea1709ca477be8e5538cf3c2bd14e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:20 +0800 Subject: [PATCH 080/352] iommu: Make iommu_queue_iopf() more generic Make iommu_queue_iopf() more generic by making the iopf_group a minimal set of iopf's that an iopf handler of domain should handle and respond to. Add domain parameter to struct iopf_group so that the handler can retrieve and use it directly. Change iommu_queue_iopf() to forward groups of iopf's to the domain's iopf handler. This is also a necessary step to decouple the sva iopf handling code from this interface. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Reviewed-by: Yi Liu Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-10-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 351ffcb11ca0ff64e399982e279cfa131e7cb1aa) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/io-pgfault.c | 68 +++++++++++++++++++++++++++++++------- drivers/iommu/iommu-sva.c | 3 +- drivers/iommu/iommu-sva.h | 6 ++-- include/linux/iommu.h | 4 +-- 4 files changed, 61 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index c7e6bbed5c05b..13cd0929e7661 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -13,6 +13,9 @@ #include "iommu-sva.h" +enum iommu_page_response_code +iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm); + static void iopf_free_group(struct iopf_group *group) { struct iopf_fault *iopf, *next; @@ -45,29 +48,48 @@ static void iopf_handler(struct work_struct *work) { struct iopf_fault *iopf; struct iopf_group *group; - struct iommu_domain *domain; enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; group = container_of(work, struct iopf_group, work); - domain = iommu_get_domain_for_dev_pasid(group->dev, - group->last_fault.fault.prm.pasid, 0); - if (!domain || !domain->iopf_handler) - status = IOMMU_PAGE_RESP_INVALID; - list_for_each_entry(iopf, &group->faults, list) { /* * For the moment, errors are sticky: don't handle subsequent * faults in the group if there is an error. */ - if (status == IOMMU_PAGE_RESP_SUCCESS) - status = domain->iopf_handler(&iopf->fault, - domain->fault_data); + if (status != IOMMU_PAGE_RESP_SUCCESS) + break; + + status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm); } iopf_complete_group(group->dev, &group->last_fault, status); iopf_free_group(group); } +static struct iommu_domain *get_domain_for_iopf(struct device *dev, + struct iommu_fault *fault) +{ + struct iommu_domain *domain; + + if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { + domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0); + if (IS_ERR(domain)) + domain = NULL; + } else { + domain = iommu_get_domain_for_dev(dev); + } + + if (!domain || !domain->iopf_handler) { + dev_warn_ratelimited(dev, + "iopf (pasid %d) without domain attached or handler installed\n", + fault->prm.pasid); + + return NULL; + } + + return domain; +} + /** * iommu_queue_iopf - IO Page Fault handler * @fault: fault event @@ -112,6 +134,7 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) { int ret; struct iopf_group *group; + struct iommu_domain *domain; struct iopf_fault *iopf, *next; struct iommu_fault_param *iopf_param; struct dev_iommu *param = dev->iommu; @@ -143,6 +166,12 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) return 0; } + domain = get_domain_for_iopf(dev, fault); + if (!domain) { + ret = -EINVAL; + goto cleanup_partial; + } + group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) { /* @@ -157,8 +186,8 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) group->dev = dev; group->last_fault.fault = *fault; INIT_LIST_HEAD(&group->faults); + group->domain = domain; list_add(&group->last_fault.list, &group->faults); - INIT_WORK(&group->work, iopf_handler); /* See if we have partial faults for this group */ list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { @@ -167,9 +196,13 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) list_move(&iopf->list, &group->faults); } - queue_work(iopf_param->queue->wq, &group->work); - return 0; + mutex_unlock(&iopf_param->lock); + ret = domain->iopf_handler(group); + mutex_lock(&iopf_param->lock); + if (ret) + iopf_free_group(group); + return ret; cleanup_partial: list_for_each_entry_safe(iopf, next, 
&iopf_param->partial, list) { if (iopf->fault.prm.grpid == fault->prm.grpid) { @@ -181,6 +214,17 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) } EXPORT_SYMBOL_GPL(iommu_queue_iopf); +int iommu_sva_handle_iopf(struct iopf_group *group) +{ + struct iommu_fault_param *fault_param = group->dev->iommu->fault_param; + + INIT_WORK(&group->work, iopf_handler); + if (!queue_work(fault_param->queue->wq, &group->work)) + return -EBUSY; + + return 0; +} + /** * iopf_queue_flush_dev - Ensure that all queued faults have been processed * @dev: the endpoint whose faults need to be flushed. diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 65814cbc84020..bc49689815582 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -180,11 +180,10 @@ EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); * I/O page fault handler for SVA */ enum iommu_page_response_code -iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm) { vm_fault_t ret; struct vm_area_struct *vma; - struct mm_struct *mm = data; unsigned int access_flags = 0; unsigned int fault_flags = FAULT_FLAG_REMOTE; struct iommu_fault_page_request *prm = &fault->prm; diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h index de7819c796cee..27c8da115b418 100644 --- a/drivers/iommu/iommu-sva.h +++ b/drivers/iommu/iommu-sva.h @@ -22,8 +22,7 @@ int iopf_queue_flush_dev(struct device *dev); struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); -enum iommu_page_response_code -iommu_sva_handle_iopf(struct iommu_fault *fault, void *data); +int iommu_sva_handle_iopf(struct iopf_group *group); #else /* CONFIG_IOMMU_SVA */ static inline int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) @@ -62,8 +61,7 @@ static inline int iopf_queue_discard_partial(struct iopf_queue *queue) return -ENODEV; } -static inline enum iommu_page_response_code -iommu_sva_handle_iopf(struct iommu_fault *fault, void *data) +static inline int iommu_sva_handle_iopf(struct iopf_group *group) { return IOMMU_PAGE_RESP_INVALID; } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 38646929e13c0..1a765b061df23 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -130,6 +130,7 @@ struct iopf_group { struct list_head faults; struct work_struct work; struct device *dev; + struct iommu_domain *domain; }; /** @@ -209,8 +210,7 @@ struct iommu_domain { unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */ struct iommu_domain_geometry geometry; struct iommu_dma_cookie *iova_cookie; - enum iommu_page_response_code (*iopf_handler)(struct iommu_fault *fault, - void *data); + int (*iopf_handler)(struct iopf_group *group); void *fault_data; union { struct { From e035926b91c521bfe0d7067345caa1a883e682ce Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:21 +0800 Subject: [PATCH 081/352] iommu: Separate SVA and IOPF Add CONFIG_IOMMU_IOPF for page fault handling framework and select it from its real consumer. Move iopf function declaration from iommu-sva.h to iommu.h and remove iommu-sva.h as it's empty now. Consolidate all SVA related code into iommu-sva.c: - Move iommu_sva_domain_alloc() from iommu.c to iommu-sva.c. - Move sva iopf handling code from io-pgfault.c to iommu-sva.c. Consolidate iommu_report_device_fault() and iommu_page_response() into io-pgfault.c. 
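With the SVA code split out, a page fault handler built as a module only needs the exported helpers; the sketch below is modelled on the SVA handler added later in this patch, with my_handle_one_fault() standing in for whatever per-fault work the module does (a hypothetical callback, not a kernel API).

    /* Hypothetical per-fault callback supplied by the module. */
    static enum iommu_page_response_code
    my_handle_one_fault(struct iommu_fault *fault);

    /* Sketch of a modular domain->iopf_handler after this split. */
    static int my_iopf_handler(struct iopf_group *group)
    {
        struct iopf_fault *iopf;
        enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS;

        list_for_each_entry(iopf, &group->faults, list) {
            status = my_handle_one_fault(&iopf->fault);
            /* Errors are sticky: stop at the first failure. */
            if (status != IOMMU_PAGE_RESP_SUCCESS)
                break;
        }

        iopf_group_response(group, status); /* exported helper */
        iopf_free_group(group);             /* exported helper */
        return 0;
    }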
Export iopf_free_group() and iopf_group_response() for iopf handlers implemented in modules. Some functions are renamed with more meaningful names. No other intentional functionality changes. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-11-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 17c51a0ea36b800e7a5998a92d83016c82935dff) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/Kconfig | 4 + drivers/iommu/Makefile | 3 +- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 1 - drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 - drivers/iommu/intel/Kconfig | 1 + drivers/iommu/intel/iommu.c | 1 - drivers/iommu/intel/svm.c | 1 - drivers/iommu/io-pgfault.c | 188 +++++++++++++----- drivers/iommu/iommu-sva.c | 68 ++++++- drivers/iommu/iommu-sva.h | 69 ------- drivers/iommu/iommu.c | 133 ------------- include/linux/iommu.h | 98 ++++++--- 12 files changed, 277 insertions(+), 291 deletions(-) delete mode 100644 drivers/iommu/iommu-sva.h diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 9dbb55e745bd9..e9f6a5cb3400f 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -163,6 +163,9 @@ config IOMMU_SVA select IOMMU_MM_DATA bool +config IOMMU_IOPF + bool + config FSL_PAMU bool "Freescale IOMMU support" depends on PCI @@ -398,6 +401,7 @@ config ARM_SMMU_V3_SVA bool "Shared Virtual Addressing support for the ARM SMMUv3" depends on ARM_SMMU_V3 select IOMMU_SVA + select IOMMU_IOPF select MMU_NOTIFIER help Support for sharing process address spaces with devices using the diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile index 95ad9dbfbda02..542760d963ec7 100644 --- a/drivers/iommu/Makefile +++ b/drivers/iommu/Makefile @@ -26,6 +26,7 @@ obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o obj-$(CONFIG_S390_IOMMU) += s390-iommu.o obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o -obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o io-pgfault.o +obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o +obj-$(CONFIG_IOMMU_IOPF) += io-pgfault.o obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o obj-$(CONFIG_APPLE_DART) += apple-dart.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 4099036fa7e35..874d1f977d90c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -10,7 +10,6 @@ #include #include "arm-smmu-v3.h" -#include "../../iommu-sva.h" #include "../../io-pgtable-arm.h" struct arm_smmu_mmu_notifier { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index daef40bc587af..d62c790663950 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -29,7 +29,6 @@ #include "arm-smmu-v3.h" #include "../../dma-iommu.h" -#include "../../iommu-sva.h" static bool disable_bypass = true; module_param(disable_bypass, bool, 0444); diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index 012cd2541a68a..a4a125666293f 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -51,6 +51,7 @@ config INTEL_IOMMU_SVM depends on X86_64 select MMU_NOTIFIER select IOMMU_SVA + select IOMMU_IOPF help Shared Virtual Memory (SVM) provides a facility for devices to access DMA resources through process address 
space by diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 349599c73ad56..86a4c2a87a5e9 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -27,7 +27,6 @@ #include "iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" -#include "../iommu-sva.h" #include "pasid.h" #include "cap_audit.h" #include "perfmon.h" diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 8596d35be8f68..1f4bfeace98f7 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -22,7 +22,6 @@ #include "iommu.h" #include "pasid.h" #include "perf.h" -#include "../iommu-sva.h" #include "trace.h" static irqreturn_t prq_event_thread(int irq, void *d); diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 13cd0929e7661..c1e88da973cef 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -11,12 +11,9 @@ #include #include -#include "iommu-sva.h" +#include "iommu-priv.h" -enum iommu_page_response_code -iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm); - -static void iopf_free_group(struct iopf_group *group) +void iopf_free_group(struct iopf_group *group) { struct iopf_fault *iopf, *next; @@ -27,44 +24,7 @@ static void iopf_free_group(struct iopf_group *group) kfree(group); } - -static int iopf_complete_group(struct device *dev, struct iopf_fault *iopf, - enum iommu_page_response_code status) -{ - struct iommu_page_response resp = { - .pasid = iopf->fault.prm.pasid, - .grpid = iopf->fault.prm.grpid, - .code = status, - }; - - if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) && - (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) - resp.flags = IOMMU_PAGE_RESP_PASID_VALID; - - return iommu_page_response(dev, &resp); -} - -static void iopf_handler(struct work_struct *work) -{ - struct iopf_fault *iopf; - struct iopf_group *group; - enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; - - group = container_of(work, struct iopf_group, work); - list_for_each_entry(iopf, &group->faults, list) { - /* - * For the moment, errors are sticky: don't handle subsequent - * faults in the group if there is an error. - */ - if (status != IOMMU_PAGE_RESP_SUCCESS) - break; - - status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm); - } - - iopf_complete_group(group->dev, &group->last_fault, status); - iopf_free_group(group); -} +EXPORT_SYMBOL_GPL(iopf_free_group); static struct iommu_domain *get_domain_for_iopf(struct device *dev, struct iommu_fault *fault) @@ -91,7 +51,7 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, } /** - * iommu_queue_iopf - IO Page Fault handler + * iommu_handle_iopf - IO Page Fault handler * @fault: fault event * @dev: struct device. * @@ -130,7 +90,7 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, * * Return: 0 on success and <0 on error. */ -int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) +static int iommu_handle_iopf(struct iommu_fault *fault, struct device *dev) { int ret; struct iopf_group *group; @@ -212,18 +172,117 @@ int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) } return ret; } -EXPORT_SYMBOL_GPL(iommu_queue_iopf); -int iommu_sva_handle_iopf(struct iopf_group *group) +/** + * iommu_report_device_fault() - Report fault event to device driver + * @dev: the device + * @evt: fault event data + * + * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ + * handler. 
When this function fails and the fault is recoverable, it is the + * caller's responsibility to complete the fault. + * + * Return 0 on success, or an error. + */ +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { - struct iommu_fault_param *fault_param = group->dev->iommu->fault_param; + struct dev_iommu *param = dev->iommu; + struct iopf_fault *evt_pending = NULL; + struct iommu_fault_param *fparam; + int ret = 0; - INIT_WORK(&group->work, iopf_handler); - if (!queue_work(fault_param->queue->wq, &group->work)) - return -EBUSY; + if (!param || !evt) + return -EINVAL; - return 0; + /* we only report device fault if there is a handler registered */ + mutex_lock(¶m->lock); + fparam = param->fault_param; + + if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && + (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { + evt_pending = kmemdup(evt, sizeof(struct iopf_fault), + GFP_KERNEL); + if (!evt_pending) { + ret = -ENOMEM; + goto done_unlock; + } + mutex_lock(&fparam->lock); + list_add_tail(&evt_pending->list, &fparam->faults); + mutex_unlock(&fparam->lock); + } + + ret = iommu_handle_iopf(&evt->fault, dev); + if (ret && evt_pending) { + mutex_lock(&fparam->lock); + list_del(&evt_pending->list); + mutex_unlock(&fparam->lock); + kfree(evt_pending); + } +done_unlock: + mutex_unlock(¶m->lock); + return ret; +} +EXPORT_SYMBOL_GPL(iommu_report_device_fault); + +int iommu_page_response(struct device *dev, + struct iommu_page_response *msg) +{ + bool needs_pasid; + int ret = -EINVAL; + struct iopf_fault *evt; + struct iommu_fault_page_request *prm; + struct dev_iommu *param = dev->iommu; + const struct iommu_ops *ops = dev_iommu_ops(dev); + bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; + + if (!ops->page_response) + return -ENODEV; + + if (!param || !param->fault_param) + return -EINVAL; + + /* Only send response if there is a fault report pending */ + mutex_lock(¶m->fault_param->lock); + if (list_empty(¶m->fault_param->faults)) { + dev_warn_ratelimited(dev, "no pending PRQ, drop response\n"); + goto done_unlock; + } + /* + * Check if we have a matching page request pending to respond, + * otherwise return -EINVAL + */ + list_for_each_entry(evt, ¶m->fault_param->faults, list) { + prm = &evt->fault.prm; + if (prm->grpid != msg->grpid) + continue; + + /* + * If the PASID is required, the corresponding request is + * matched using the group ID, the PASID valid bit and the PASID + * value. Otherwise only the group ID matches request and + * response. + */ + needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; + if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) + continue; + + if (!needs_pasid && has_pasid) { + /* No big deal, just clear it. */ + msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; + msg->pasid = 0; + } + + ret = ops->page_response(dev, evt, msg); + list_del(&evt->list); + kfree(evt); + break; + } + +done_unlock: + mutex_unlock(¶m->fault_param->lock); + return ret; } +EXPORT_SYMBOL_GPL(iommu_page_response); /** * iopf_queue_flush_dev - Ensure that all queued faults have been processed @@ -258,6 +317,31 @@ int iopf_queue_flush_dev(struct device *dev) } EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); +/** + * iopf_group_response - Respond a group of page faults + * @group: the group of faults with the same group id + * @status: the response code + * + * Return 0 on success and <0 on error. 
+ */ +int iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) +{ + struct iopf_fault *iopf = &group->last_fault; + struct iommu_page_response resp = { + .pasid = iopf->fault.prm.pasid, + .grpid = iopf->fault.prm.grpid, + .code = status, + }; + + if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) && + (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) + resp.flags = IOMMU_PAGE_RESP_PASID_VALID; + + return iommu_page_response(group->dev, &resp); +} +EXPORT_SYMBOL_GPL(iopf_group_response); + /** * iopf_queue_discard_partial - Remove all pending partial fault * @queue: the queue whose partial faults need to be discarded diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index bc49689815582..fbab4f059d284 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -7,7 +7,7 @@ #include #include -#include "iommu-sva.h" +#include "iommu-priv.h" static DEFINE_MUTEX(iommu_sva_lock); @@ -176,10 +176,21 @@ u32 iommu_sva_get_pasid(struct iommu_sva *handle) } EXPORT_SYMBOL_GPL(iommu_sva_get_pasid); +void mm_pasid_drop(struct mm_struct *mm) +{ + struct iommu_mm_data *iommu_mm = mm->iommu_mm; + + if (!iommu_mm) + return; + + iommu_free_global_pasid(iommu_mm->pasid); + kfree(iommu_mm); +} + /* * I/O page fault handler for SVA */ -enum iommu_page_response_code +static enum iommu_page_response_code iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm) { vm_fault_t ret; @@ -233,13 +244,54 @@ iommu_sva_handle_mm(struct iommu_fault *fault, struct mm_struct *mm) return status; } -void mm_pasid_drop(struct mm_struct *mm) +static void iommu_sva_handle_iopf(struct work_struct *work) { - struct iommu_mm_data *iommu_mm = mm->iommu_mm; + struct iopf_fault *iopf; + struct iopf_group *group; + enum iommu_page_response_code status = IOMMU_PAGE_RESP_SUCCESS; + + group = container_of(work, struct iopf_group, work); + list_for_each_entry(iopf, &group->faults, list) { + /* + * For the moment, errors are sticky: don't handle subsequent + * faults in the group if there is an error. 
+ */ + if (status != IOMMU_PAGE_RESP_SUCCESS) + break; + + status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm); + } - if (!iommu_mm) - return; + iopf_group_response(group, status); + iopf_free_group(group); +} - iommu_free_global_pasid(iommu_mm->pasid); - kfree(iommu_mm); +static int iommu_sva_iopf_handler(struct iopf_group *group) +{ + struct iommu_fault_param *fault_param = group->dev->iommu->fault_param; + + INIT_WORK(&group->work, iommu_sva_handle_iopf); + if (!queue_work(fault_param->queue->wq, &group->work)) + return -EBUSY; + + return 0; +} + +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_domain *domain; + + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return NULL; + + domain->type = IOMMU_DOMAIN_SVA; + mmgrab(mm); + domain->mm = mm; + domain->owner = ops; + domain->iopf_handler = iommu_sva_iopf_handler; + + return domain; } diff --git a/drivers/iommu/iommu-sva.h b/drivers/iommu/iommu-sva.h deleted file mode 100644 index 27c8da115b418..0000000000000 --- a/drivers/iommu/iommu-sva.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * SVA library for IOMMU drivers - */ -#ifndef _IOMMU_SVA_H -#define _IOMMU_SVA_H - -#include - -/* I/O Page fault */ -struct device; -struct iommu_fault; -struct iopf_queue; - -#ifdef CONFIG_IOMMU_SVA -int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev); - -int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); -int iopf_queue_remove_device(struct iopf_queue *queue, - struct device *dev); -int iopf_queue_flush_dev(struct device *dev); -struct iopf_queue *iopf_queue_alloc(const char *name); -void iopf_queue_free(struct iopf_queue *queue); -int iopf_queue_discard_partial(struct iopf_queue *queue); -int iommu_sva_handle_iopf(struct iopf_group *group); - -#else /* CONFIG_IOMMU_SVA */ -static inline int iommu_queue_iopf(struct iommu_fault *fault, struct device *dev) -{ - return -ENODEV; -} - -static inline int iopf_queue_add_device(struct iopf_queue *queue, - struct device *dev) -{ - return -ENODEV; -} - -static inline int iopf_queue_remove_device(struct iopf_queue *queue, - struct device *dev) -{ - return -ENODEV; -} - -static inline int iopf_queue_flush_dev(struct device *dev) -{ - return -ENODEV; -} - -static inline struct iopf_queue *iopf_queue_alloc(const char *name) -{ - return NULL; -} - -static inline void iopf_queue_free(struct iopf_queue *queue) -{ -} - -static inline int iopf_queue_discard_partial(struct iopf_queue *queue) -{ - return -ENODEV; -} - -static inline int iommu_sva_handle_iopf(struct iopf_group *group) -{ - return IOMMU_PAGE_RESP_INVALID; -} -#endif /* CONFIG_IOMMU_SVA */ -#endif /* _IOMMU_SVA_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index dc62b0b59e4b0..9b94c973670b2 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -36,8 +36,6 @@ #include "dma-iommu.h" #include "iommu-priv.h" -#include "iommu-sva.h" - static struct kset *iommu_group_kset; static DEFINE_IDA(iommu_group_ida); static DEFINE_IDA(iommu_global_pasid_ida); @@ -1360,117 +1358,6 @@ void iommu_group_put(struct iommu_group *group) } EXPORT_SYMBOL_GPL(iommu_group_put); -/** - * iommu_report_device_fault() - Report fault event to device driver - * @dev: the device - * @evt: fault event data - * - * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ - * handler. 
When this function fails and the fault is recoverable, it is the - * caller's responsibility to complete the fault. - * - * Return 0 on success, or an error. - */ -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) -{ - struct dev_iommu *param = dev->iommu; - struct iopf_fault *evt_pending = NULL; - struct iommu_fault_param *fparam; - int ret = 0; - - if (!param || !evt) - return -EINVAL; - - /* we only report device fault if there is a handler registered */ - mutex_lock(&param->lock); - fparam = param->fault_param; - - if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && - (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - evt_pending = kmemdup(evt, sizeof(struct iopf_fault), - GFP_KERNEL); - if (!evt_pending) { - ret = -ENOMEM; - goto done_unlock; - } - mutex_lock(&fparam->lock); - list_add_tail(&evt_pending->list, &fparam->faults); - mutex_unlock(&fparam->lock); - } - - ret = iommu_queue_iopf(&evt->fault, dev); - if (ret && evt_pending) { - mutex_lock(&fparam->lock); - list_del(&evt_pending->list); - mutex_unlock(&fparam->lock); - kfree(evt_pending); - } -done_unlock: - mutex_unlock(&param->lock); - return ret; -} -EXPORT_SYMBOL_GPL(iommu_report_device_fault); - -int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) -{ - bool needs_pasid; - int ret = -EINVAL; - struct iopf_fault *evt; - struct iommu_fault_page_request *prm; - struct dev_iommu *param = dev->iommu; - const struct iommu_ops *ops = dev_iommu_ops(dev); - bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; - - if (!ops->page_response) - return -ENODEV; - - if (!param || !param->fault_param) - return -EINVAL; - - /* Only send response if there is a fault report pending */ - mutex_lock(&param->fault_param->lock); - if (list_empty(&param->fault_param->faults)) { - dev_warn_ratelimited(dev, "no pending PRQ, drop response\n"); - goto done_unlock; - } - /* - * Check if we have a matching page request pending to respond, - * otherwise return -EINVAL - */ - list_for_each_entry(evt, &param->fault_param->faults, list) { - prm = &evt->fault.prm; - if (prm->grpid != msg->grpid) - continue; - - /* - * If the PASID is required, the corresponding request is - * matched using the group ID, the PASID valid bit and the PASID - * value. Otherwise only the group ID matches request and - * response. - */ - needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) - continue; - - if (!needs_pasid && has_pasid) { - /* No big deal, just clear it. */ - msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; - msg->pasid = 0; - } - - ret = ops->page_response(dev, evt, msg); - list_del(&evt->list); - kfree(evt); - break; - } - -done_unlock: - mutex_unlock(&param->fault_param->lock); - return ret; -} -EXPORT_SYMBOL_GPL(iommu_page_response); - /** * iommu_group_id - Return ID for a group * @group: the group to ID @@ -3571,26 +3458,6 @@ struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, } EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev_pasid); -struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, - struct mm_struct *mm) -{ - const struct iommu_ops *ops = dev_iommu_ops(dev); - struct iommu_domain *domain; - - domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); - if (!domain) - return NULL; - - domain->type = IOMMU_DOMAIN_SVA; - mmgrab(mm); - domain->mm = mm; - domain->owner = ops; - domain->iopf_handler = iommu_sva_handle_iopf; - domain->fault_data = mm; - - return domain; -} - ioasid_t iommu_alloc_global_pasid(struct device *dev) { int ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1a765b061df23..cc6a321f7373d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -811,10 +811,6 @@ extern struct iommu_group *iommu_group_get(struct device *dev); extern struct iommu_group *iommu_group_ref_get(struct iommu_group *group); extern void iommu_group_put(struct iommu_group *group); -extern int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); -extern int iommu_page_response(struct device *dev, - struct iommu_page_response *msg); - extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); @@ -1033,8 +1029,6 @@ bool iommu_group_dma_owner_claimed(struct iommu_group *group); int iommu_device_claim_dma_owner(struct device *dev, void *owner); void iommu_device_release_dma_owner(struct device *dev); -struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, - struct mm_struct *mm); int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); void iommu_detach_device_pasid(struct iommu_domain *domain, @@ -1223,18 +1217,6 @@ static inline void iommu_group_put(struct iommu_group *group) { } -static inline -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) -{ - return -ENODEV; -} - -static inline int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) -{ - return -ENODEV; -} - static inline int iommu_group_id(struct iommu_group *group) { return -ENODEV; @@ -1383,12 +1365,6 @@ static inline int iommu_device_claim_dma_owner(struct device *dev, void *owner) return -ENODEV; } -static inline struct iommu_domain * -iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) -{ - return NULL; -} - static inline int iommu_attach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid) { @@ -1536,6 +1512,8 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); +struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1560,6 +1538,78 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} + +static inline struct iommu_domain * +iommu_sva_domain_alloc(struct device *dev, struct mm_struct
*mm) +{ + return NULL; +} #endif /* CONFIG_IOMMU_SVA */ +#ifdef CONFIG_IOMMU_IOPF +int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); +int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev); +int iopf_queue_flush_dev(struct device *dev); +struct iopf_queue *iopf_queue_alloc(const char *name); +void iopf_queue_free(struct iopf_queue *queue); +int iopf_queue_discard_partial(struct iopf_queue *queue); +void iopf_free_group(struct iopf_group *group); +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); +int iommu_page_response(struct device *dev, struct iommu_page_response *msg); +int iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status); +#else +static inline int +iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) +{ + return -ENODEV; +} + +static inline int +iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) +{ + return -ENODEV; +} + +static inline int iopf_queue_flush_dev(struct device *dev) +{ + return -ENODEV; +} + +static inline struct iopf_queue *iopf_queue_alloc(const char *name) +{ + return NULL; +} + +static inline void iopf_queue_free(struct iopf_queue *queue) +{ +} + +static inline int iopf_queue_discard_partial(struct iopf_queue *queue) +{ + return -ENODEV; +} + +static inline void iopf_free_group(struct iopf_group *group) +{ +} + +static inline int +iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) +{ + return -ENODEV; +} + +static inline int +iommu_page_response(struct device *dev, struct iommu_page_response *msg) +{ + return -ENODEV; +} + +static inline int iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) +{ + return -ENODEV; +} +#endif /* CONFIG_IOMMU_IOPF */ #endif /* __LINUX_IOMMU_H */ From d2505771d1a5da7e39289a3927f77465afb73542 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:22 +0800 Subject: [PATCH 082/352] iommu: Refine locking for per-device fault data management The per-device fault data is a data structure that is used to store information about faults that occur on a device. This data is allocated when IOPF is enabled on the device and freed when IOPF is disabled. The data is used in the paths of iopf reporting, handling, responding, and draining. The fault data is protected by two locks: - dev->iommu->lock: This lock is used to protect the allocation and freeing of the fault data. - dev->iommu->fault_parameter->lock: This lock is used to protect the fault data itself. Apply the locking mechanism to the fault reporting and responding paths. The fault_parameter->lock is also added in iopf_queue_discard_partial(). It does not fix any real issue, as iopf_queue_discard_partial() is only used in the VT-d driver's prq_event_thread(), which is a single-threaded path that reports the IOPFs. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Tested-by: Yan Zhao Tested-by: Longfang Liu Link: https://lore.kernel.org/r/20240212012227.119381-12-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit cc7338e9d807e20e60e6720a62956f0e9d46f0f8) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
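Ochs

As a side note on the two-lock scheme described in the message above, the intent is easy to model outside the kernel. The sketch below is plain C with pthreads, not kernel code, and every name in it (struct dev_state, struct fault_data, report_fault) is invented for illustration: the outer lock pins the fault-data pointer so it cannot be freed mid-access, while the inner lock serializes access to the data itself.

#include <pthread.h>

struct fault_data {
	pthread_mutex_t lock;     /* models iommu_fault_param::lock */
	int pending;              /* stands in for the faults list */
};

struct dev_state {
	pthread_mutex_t lock;     /* models dev->iommu->lock */
	struct fault_data *fault; /* NULL until IOPF is enabled */
};

/* Report path: the outer lock keeps the pointer alive, the inner
 * lock protects the data, mirroring the ordering applied above. */
static int report_fault(struct dev_state *dev)
{
	int ret = -1;

	pthread_mutex_lock(&dev->lock);
	if (dev->fault) {
		pthread_mutex_lock(&dev->fault->lock);
		dev->fault->pending++;
		pthread_mutex_unlock(&dev->fault->lock);
		ret = 0;
	}
	pthread_mutex_unlock(&dev->lock);
	return ret;
}

In this model the free path would take the outer lock first as well, so a free can never race with a reporter that has already observed a non-NULL pointer.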
--- drivers/iommu/io-pgfault.c | 61 +++++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index c1e88da973cef..5aea8402be476 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -53,7 +53,7 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, /** * iommu_handle_iopf - IO Page Fault handler * @fault: fault event - * @dev: struct device. + * @iopf_param: the fault parameter of the device. * * Add a fault to the device workqueue, to be handled by mm. * @@ -90,29 +90,21 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, * * Return: 0 on success and <0 on error. */ -static int iommu_handle_iopf(struct iommu_fault *fault, struct device *dev) +static int iommu_handle_iopf(struct iommu_fault *fault, + struct iommu_fault_param *iopf_param) { int ret; struct iopf_group *group; struct iommu_domain *domain; struct iopf_fault *iopf, *next; - struct iommu_fault_param *iopf_param; - struct dev_iommu *param = dev->iommu; + struct device *dev = iopf_param->dev; - lockdep_assert_held(&param->lock); + lockdep_assert_held(&iopf_param->lock); if (fault->type != IOMMU_FAULT_PAGE_REQ) /* Not a recoverable page fault */ return -EOPNOTSUPP; - /* - * As long as we're holding param->lock, the queue can't be unlinked - * from the device and therefore cannot disappear. - */ - iopf_param = param->fault_param; - if (!iopf_param) - return -ENODEV; - if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { iopf = kzalloc(sizeof(*iopf), GFP_KERNEL); if (!iopf) @@ -186,18 +178,19 @@ static int iommu_handle_iopf(struct iommu_fault *fault, */ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { - struct dev_iommu *param = dev->iommu; + struct iommu_fault_param *fault_param; struct iopf_fault *evt_pending = NULL; - struct iommu_fault_param *fparam; + struct dev_iommu *param = dev->iommu; int ret = 0; - if (!param || !evt) - return -EINVAL; - - /* we only report device fault if there is a handler registered */ mutex_lock(&param->lock); - fparam = param->fault_param; + fault_param = param->fault_param; + if (!fault_param) { + mutex_unlock(&param->lock); + return -EINVAL; + } + mutex_lock(&fault_param->lock); if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { evt_pending = kmemdup(evt, sizeof(struct iopf_fault), @@ -206,20 +199,18 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) ret = -ENOMEM; goto done_unlock; } - mutex_lock(&fparam->lock); - list_add_tail(&evt_pending->list, &fparam->faults); - mutex_unlock(&fparam->lock); + list_add_tail(&evt_pending->list, &fault_param->faults); } - ret = iommu_handle_iopf(&evt->fault, dev); + ret = iommu_handle_iopf(&evt->fault, fault_param); if (ret && evt_pending) { - mutex_lock(&fparam->lock); list_del(&evt_pending->list); - mutex_unlock(&fparam->lock); kfree(evt_pending); } done_unlock: + mutex_unlock(&fault_param->lock); mutex_unlock(&param->lock); + return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); @@ -232,18 +223,23 @@ int iommu_page_response(struct device *dev, struct iopf_fault *evt; struct iommu_fault_page_request *prm; struct dev_iommu *param = dev->iommu; + struct iommu_fault_param *fault_param; const struct iommu_ops *ops = dev_iommu_ops(dev); bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; if (!ops->page_response) return -ENODEV; - if (!param || !param->fault_param) + mutex_lock(&param->lock); + fault_param = param->fault_param; + if (!fault_param) { + mutex_unlock(&param->lock); return -EINVAL; + } /* Only send response if there is a fault report pending */ - mutex_lock(&param->fault_param->lock); - if (list_empty(&param->fault_param->faults)) { + mutex_lock(&fault_param->lock); + if (list_empty(&fault_param->faults)) { dev_warn_ratelimited(dev, "no pending PRQ, drop response\n"); goto done_unlock; } @@ -251,7 +247,7 @@ int iommu_page_response(struct device *dev, * Check if we have a matching page request pending to respond, * otherwise return -EINVAL */ - list_for_each_entry(evt, &param->fault_param->faults, list) { + list_for_each_entry(evt, &fault_param->faults, list) { prm = &evt->fault.prm; if (prm->grpid != msg->grpid) continue; @@ -279,7 +275,8 @@ int iommu_page_response(struct device *dev, } done_unlock: - mutex_unlock(&param->fault_param->lock); + mutex_unlock(&fault_param->lock); + mutex_unlock(&param->lock); return ret; } EXPORT_SYMBOL_GPL(iommu_page_response); @@ -362,11 +359,13 @@ int iopf_queue_discard_partial(struct iopf_queue *queue) mutex_lock(&queue->lock); list_for_each_entry(iopf_param, &queue->devices, queue_list) { + mutex_lock(&iopf_param->lock); list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { list_del(&iopf->list); kfree(iopf); } + mutex_unlock(&iopf_param->lock); } mutex_unlock(&queue->lock); return 0; From 5efd3be27d6c4e9285c1bd39c16f336a90040785 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:23 +0800 Subject: [PATCH 083/352] iommu: Use refcount for fault data access The per-device fault data structure stores information about faults occurring on a device. Its lifetime spans from IOPF enablement to disablement. Multiple paths, including IOPF reporting, handling, and responding, may access it concurrently. Previously, a mutex protected the fault data from use after free. But this is not performance-friendly due to the critical nature of IOPF handling paths. Refine this with a refcount-based approach. The fault data pointer is obtained within an RCU read region with a refcount. The fault data pointer is returned for usage only when the pointer is valid and a refcount is successfully obtained. The fault data is freed with kfree_rcu(), ensuring data is only freed after all RCU critical regions complete. An IOPF handling work item starts once an iopf group is created. The handling work continues until iommu_page_response() is called to respond to the iopf and the iopf group is freed. During this time, the device fault parameter should always be available. Add a pointer to the device fault parameter in the iopf_group structure and hold the reference until the iopf_group is freed. Make iommu_page_response() static as it is only used in io-pgfault.c. Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20240212012227.119381-13-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit a74c077b9021b36c785095c571336e5b204d3c2d) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs
--- drivers/iommu/io-pgfault.c | 127 +++++++++++++++++++++++-------------- drivers/iommu/iommu-sva.c | 2 +- include/linux/iommu.h | 17 +++-- 3 files changed, 88 insertions(+), 58 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 5aea8402be476..ce7058892b598 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -13,6 +13,32 @@ #include "iommu-priv.h" +/* + * Return the fault parameter of a device if it exists. Otherwise, return NULL. + * On a successful return, the caller takes a reference of this parameter and + * should put it after use by calling iopf_put_dev_fault_param(). + */ +static struct iommu_fault_param *iopf_get_dev_fault_param(struct device *dev) +{ + struct dev_iommu *param = dev->iommu; + struct iommu_fault_param *fault_param; + + rcu_read_lock(); + fault_param = rcu_dereference(param->fault_param); + if (fault_param && !refcount_inc_not_zero(&fault_param->users)) + fault_param = NULL; + rcu_read_unlock(); + + return fault_param; +} + +/* Caller must hold a reference of the fault parameter. */ +static void iopf_put_dev_fault_param(struct iommu_fault_param *fault_param) +{ + if (refcount_dec_and_test(&fault_param->users)) + kfree_rcu(fault_param, rcu); +} + void iopf_free_group(struct iopf_group *group) { struct iopf_fault *iopf, *next; @@ -22,6 +48,8 @@ void iopf_free_group(struct iopf_group *group) kfree(iopf); } + /* Pair with iommu_report_device_fault(). */ + iopf_put_dev_fault_param(group->fault_param); kfree(group); } EXPORT_SYMBOL_GPL(iopf_free_group); @@ -135,7 +163,7 @@ static int iommu_handle_iopf(struct iommu_fault *fault, goto cleanup_partial; } - group->dev = dev; + group->fault_param = iopf_param; group->last_fault.fault = *fault; INIT_LIST_HEAD(&group->faults); group->domain = domain; @@ -178,64 +206,61 @@ static int iommu_handle_iopf(struct iommu_fault *fault, */ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { + bool last_prq = evt->fault.type == IOMMU_FAULT_PAGE_REQ && + (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE); struct iommu_fault_param *fault_param; - struct iopf_fault *evt_pending = NULL; - struct dev_iommu *param = dev->iommu; - int ret = 0; + struct iopf_fault *evt_pending; + int ret; - mutex_lock(&param->lock); - fault_param = param->fault_param; - if (!fault_param) { - mutex_unlock(&param->lock); + fault_param = iopf_get_dev_fault_param(dev); + if (!fault_param) return -EINVAL; - } mutex_lock(&fault_param->lock); - if (evt->fault.type == IOMMU_FAULT_PAGE_REQ && - (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { + if (last_prq) { evt_pending = kmemdup(evt, sizeof(struct iopf_fault), GFP_KERNEL); if (!evt_pending) { ret = -ENOMEM; - goto done_unlock; + goto err_unlock; } list_add_tail(&evt_pending->list, &fault_param->faults); } ret = iommu_handle_iopf(&evt->fault, fault_param); - if (ret && evt_pending) { + if (ret) + goto err_free; + + mutex_unlock(&fault_param->lock); + /* The reference count of fault_param is now held by iopf_group. */ + if (!last_prq) + iopf_put_dev_fault_param(fault_param); + + return 0; +err_free: + if (last_prq) { list_del(&evt_pending->list); kfree(evt_pending); } -done_unlock: +err_unlock: mutex_unlock(&fault_param->lock); - mutex_unlock(&param->lock); + iopf_put_dev_fault_param(fault_param); return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); -int iommu_page_response(struct device *dev, - struct iommu_page_response *msg) +static int iommu_page_response(struct iopf_group *group, + struct iommu_page_response *msg) { bool needs_pasid; int ret = -EINVAL; struct iopf_fault *evt; struct iommu_fault_page_request *prm; - struct dev_iommu *param = dev->iommu; - struct iommu_fault_param *fault_param; + struct device *dev = group->fault_param->dev; const struct iommu_ops *ops = dev_iommu_ops(dev); bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; - - if (!ops->page_response) - return -ENODEV; - - mutex_lock(&param->lock); - fault_param = param->fault_param; - if (!fault_param) { - mutex_unlock(&param->lock); - return -EINVAL; - } + struct iommu_fault_param *fault_param = group->fault_param; /* Only send response if there is a fault report pending */ mutex_lock(&fault_param->lock); @@ -276,10 +301,9 @@ int iommu_page_response(struct device *dev, done_unlock: mutex_unlock(&fault_param->lock); - mutex_unlock(&param->lock); + return ret; } -EXPORT_SYMBOL_GPL(iommu_page_response); /** * iopf_queue_flush_dev - Ensure that all queued faults have been processed @@ -295,22 +319,20 @@ EXPORT_SYMBOL_GPL(iommu_page_response); */ int iopf_queue_flush_dev(struct device *dev) { - int ret = 0; struct iommu_fault_param *iopf_param; - struct dev_iommu *param = dev->iommu; - if (!param) + /* + * It's a driver bug to be here after iopf_queue_remove_device(). + * Therefore, it's safe to dereference the fault parameter without + * holding the lock. + */ + iopf_param = rcu_dereference_check(dev->iommu->fault_param, true); + if (WARN_ON(!iopf_param)) return -ENODEV; - mutex_lock(&param->lock); - iopf_param = param->fault_param; - if (iopf_param) - flush_workqueue(iopf_param->queue->wq); - else - ret = -ENODEV; - mutex_unlock(&param->lock); + flush_workqueue(iopf_param->queue->wq); - return ret; + return 0; } EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); @@ -335,7 +357,7 @@ int iopf_group_response(struct iopf_group *group, (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) resp.flags = IOMMU_PAGE_RESP_PASID_VALID; - return iommu_page_response(group->dev, &resp); + return iommu_page_response(group, &resp); } EXPORT_SYMBOL_GPL(iopf_group_response); @@ -384,10 +406,15 @@ int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) int ret = 0; struct dev_iommu *param = dev->iommu; struct iommu_fault_param *fault_param; + const struct iommu_ops *ops = dev_iommu_ops(dev); + + if (!ops->page_response) + return -ENODEV; mutex_lock(&queue->lock); mutex_lock(&param->lock); - if (param->fault_param) { + if (rcu_dereference_check(param->fault_param, + lockdep_is_held(&param->lock))) { ret = -EBUSY; goto done_unlock; } @@ -402,10 +429,11 @@ int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) INIT_LIST_HEAD(&fault_param->faults); INIT_LIST_HEAD(&fault_param->partial); fault_param->dev = dev; + refcount_set(&fault_param->users, 1); list_add(&fault_param->queue_list, &queue->devices); fault_param->queue = queue; - param->fault_param = fault_param; + rcu_assign_pointer(param->fault_param, fault_param); done_unlock: mutex_unlock(&param->lock); @@ -429,10 +457,12 @@ int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) int ret = 0; struct iopf_fault *iopf, *next; struct dev_iommu *param = dev->iommu; - struct iommu_fault_param *fault_param = param->fault_param; + struct iommu_fault_param *fault_param; mutex_lock(&queue->lock); mutex_lock(&param->lock); + fault_param = rcu_dereference_check(param->fault_param, + lockdep_is_held(&param->lock)); if (!fault_param) { ret = -ENODEV; goto unlock; } @@ -454,8 +484,9 @@ int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) list_for_each_entry_safe(iopf, next, &fault_param->partial, list) kfree(iopf); - param->fault_param = NULL; - kfree(fault_param); + /* dec the ref owned by iopf_queue_add_device() */ + rcu_assign_pointer(param->fault_param, NULL); + iopf_put_dev_fault_param(fault_param); unlock: mutex_unlock(&param->lock); mutex_unlock(&queue->lock); diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index fbab4f059d284..640acc804e8cd 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -268,7 +268,7 @@ static void iommu_sva_handle_iopf(struct work_struct *work) static int iommu_sva_iopf_handler(struct iopf_group *group) { - struct iommu_fault_param *fault_param = group->dev->iommu->fault_param; + struct iommu_fault_param *fault_param = group->fault_param; INIT_WORK(&group->work, iommu_sva_handle_iopf); if (!queue_work(fault_param->queue->wq, &group->work)) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index cc6a321f7373d..afbbfd3967aba 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -41,6 +41,7 @@ struct iommu_dirty_ops; struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; +struct iommu_fault_param; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -129,8 +130,9 @@ struct iopf_group { struct iopf_fault last_fault; struct
list_head faults; struct work_struct work; - struct device *dev; struct iommu_domain *domain; + /* The device's fault data parameter. */ + struct iommu_fault_param *fault_param; }; /** @@ -680,6 +682,8 @@ struct iommu_device { /** * struct iommu_fault_param - per-device IOMMU fault data * @lock: protect pending faults list + * @users: user counter to manage the lifetime of the data + * @rcu: rcu head for kfree_rcu() * @dev: the device that owns this param * @queue: IOPF queue * @queue_list: index into queue->devices @@ -689,6 +693,8 @@ struct iommu_device { */ struct iommu_fault_param { struct mutex lock; + refcount_t users; + struct rcu_head rcu; struct device *dev; struct iopf_queue *queue; @@ -716,7 +722,7 @@ struct iommu_fault_param { */ struct dev_iommu { struct mutex lock; - struct iommu_fault_param *fault_param; + struct iommu_fault_param __rcu *fault_param; struct iommu_fwspec *fwspec; struct iommu_device *iommu_dev; void *priv; @@ -1555,7 +1561,6 @@ void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); -int iommu_page_response(struct device *dev, struct iommu_page_response *msg); int iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status); #else @@ -1600,12 +1605,6 @@ iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) return -ENODEV; } -static inline int -iommu_page_response(struct device *dev, struct iommu_page_response *msg) -{ - return -ENODEV; -} - static inline int iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status) { From 1609b5801a496fa3ec2e5fc277643d7a9bdcc935 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:24 +0800 Subject: [PATCH 084/352] iommu: Improve iopf_queue_remove_device() Convert iopf_queue_remove_device() to return void instead of an error code, as the return value is never used. This removal helper is designed to never fail, so there's no need for error handling. Ack all outstanding page requests from the device with the response code of IOMMU_PAGE_RESP_INVALID, indicating the device should not attempt any retry. Add comments to this helper explaining the steps involved in removing a device from the iopf queue and disabling its PRI. The individual drivers are expected to be adjusted accordingly. Here we just define the expected behaviors of the individual iommu driver from the core's perspective. Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20240212012227.119381-14-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 0095bf83554f8e7a681961656608101bdf40e9ef) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/intel/iommu.c | 7 +---- drivers/iommu/io-pgfault.c | 57 ++++++++++++++++++++++++------------- include/linux/iommu.h | 5 ++-- 3 files changed, 40 insertions(+), 29 deletions(-)
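The acknowledge-all-outstanding-requests behaviour described in the message above is worth a small illustration. The sketch below is standalone C with invented names (struct pending_fault, respond, drain_pending), not the kernel implementation: every fault still outstanding is answered with an "invalid, do not retry" response, after which nothing is pending and PRI can be disabled safely. Allocation and freeing of the nodes is elided here.

#include <stddef.h>

enum resp_code { RESP_SUCCESS, RESP_INVALID };

struct pending_fault {
	struct pending_fault *next;
	int grpid;
};

/* Stand-in for ops->page_response(). */
static void respond(int grpid, enum resp_code code)
{
	(void)grpid;
	(void)code;
}

static void drain_pending(struct pending_fault **head)
{
	struct pending_fault *f = *head;

	while (f) {
		struct pending_fault *next = f->next;

		respond(f->grpid, RESP_INVALID); /* device must not retry */
		f = next;
	}
	*head = NULL; /* nothing outstanding; safe to disable PRI */
}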
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 86a4c2a87a5e9..605cb77ff4ed5 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4606,12 +4606,7 @@ static int intel_iommu_disable_iopf(struct device *dev) */ pci_disable_pri(to_pci_dev(dev)); info->pri_enabled = 0; - - /* - * With PRI disabled and outstanding PRQs drained, removing device - * from iopf queue should never fail. - */ - WARN_ON(iopf_queue_remove_device(iommu->iopf_queue, dev)); + iopf_queue_remove_device(iommu->iopf_queue, dev); return 0; } diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index ce7058892b598..ece09552e5cf9 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -448,41 +448,60 @@ EXPORT_SYMBOL_GPL(iopf_queue_add_device); * @queue: IOPF queue * @dev: device to remove * - * Caller makes sure that no more faults are reported for this device. + * Removing a device from an iopf_queue. It's recommended to follow these + * steps when removing a device: * - * Return: 0 on success and <0 on error. + * - Disable new PRI reception: Turn off PRI generation in the IOMMU hardware + * and flush any hardware page request queues. This should be done before + * calling into this helper. + * - Acknowledge all outstanding PRQs to the device: Respond to all outstanding + * page requests with IOMMU_PAGE_RESP_INVALID, indicating the device should + * not retry. This helper function handles this. + * - Disable PRI on the device: After calling this helper, the caller could + * then disable PRI on the device. + * + * Calling iopf_queue_remove_device() essentially disassociates the device. + * The fault_param might still exist, but iommu_page_response() will do + * nothing. The device fault parameter reference count has been properly + * passed from iommu_report_device_fault() to the fault handling work, and + * will eventually be released after iommu_page_response(). */ -int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) +void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) { - int ret = 0; struct iopf_fault *iopf, *next; + struct iommu_page_response resp; struct dev_iommu *param = dev->iommu; struct iommu_fault_param *fault_param; + const struct iommu_ops *ops = dev_iommu_ops(dev); mutex_lock(&queue->lock); mutex_lock(&param->lock); fault_param = rcu_dereference_check(param->fault_param, lockdep_is_held(&param->lock)); - if (!fault_param) { - ret = -ENODEV; - goto unlock; - } - if (fault_param->queue != queue) { - ret = -EINVAL; + if (WARN_ON(!fault_param || fault_param->queue != queue)) goto unlock; - } - if (!list_empty(&fault_param->faults)) { - ret = -EBUSY; - goto unlock; - } + mutex_lock(&fault_param->lock); + list_for_each_entry_safe(iopf, next, &fault_param->partial, list) + kfree(iopf); - list_del(&fault_param->queue_list); + list_for_each_entry_safe(iopf, next, &fault_param->faults, list) { + memset(&resp, 0, sizeof(struct iommu_page_response)); + resp.pasid = iopf->fault.prm.pasid; + resp.grpid = iopf->fault.prm.grpid; + resp.code = IOMMU_PAGE_RESP_INVALID; - /* Just in case some faults are still stuck */ - list_for_each_entry_safe(iopf, next, &fault_param->partial, list) + if (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID) + resp.flags = IOMMU_PAGE_RESP_PASID_VALID; + + ops->page_response(dev, iopf, &resp); + list_del(&iopf->list); + kfree(iopf); + } + mutex_unlock(&fault_param->lock); + + list_del(&fault_param->queue_list); /* dec the ref owned by iopf_queue_add_device() */ rcu_assign_pointer(param->fault_param, NULL); @@ -490,8 +509,6 @@ int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) unlock: mutex_unlock(&param->lock); mutex_unlock(&queue->lock); - - return ret; } EXPORT_SYMBOL_GPL(iopf_queue_remove_device); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index afbbfd3967aba..53cdd6a5a6b59 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1554,7 +1554,7 @@ iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) #ifdef CONFIG_IOMMU_IOPF int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev); -int iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev); +void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev); int iopf_queue_flush_dev(struct device *dev); struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); @@ -1570,10 +1570,9 @@ iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) return -ENODEV; } -static inline int +static inline void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) { - return -ENODEV; } static inline int iopf_queue_flush_dev(struct device *dev) From af5dac0a43de1feb7f7693119cd78e6ef60a7f03 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:25 +0800 Subject: [PATCH 085/352] iommu: Track iopf group instead of last fault Previously, before a group of page faults was passed to the domain's iopf handler, the last page fault of the group was kept in the list of iommu_fault_param::faults. In the page fault response path, the group's last page fault was used to look up the list, and the page faults were responded to the device only if there was a matched fault. The previous approach seems unnecessarily complex and not performance-friendly. Put the page fault group itself onto the outstanding fault list. It can be removed in the page fault response path or in the iopf_queue_remove_device() path. The pending list is protected by iommu_fault_param::lock. To allow checking for the group's presence in the list using list_empty(), the iopf group should be removed from the list with list_del_init(). IOMMU_PAGE_RESP_PASID_VALID is set in the code but not used anywhere. Remove it to make the code clean. IOMMU_PAGE_RESP_PASID_VALID is set in the response message indicating that the response message includes a valid PASID value. Actually, we should keep this hardware detail in the individual driver. When the page fault handling framework in IOMMU and IOMMUFD subsystems includes a valid PASID in the fault message, the response message should always contain the same PASID value. Individual drivers should be responsible for deciding whether to include the PASID in the messages they provide for the hardware. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Tested-by: Yan Zhao Link: https://lore.kernel.org/r/20240212012227.119381-15-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 19911232713573a2ebea84a25bd4d71d024ed86b) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs
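One detail of the message above deserves a concrete illustration: the list_del_init()/list_empty() idiom. The snippet below is a plain-C model with invented names (struct list_node, node_init, node_unlinked, node_del_init), not the kernel's list.h: after a node is deleted with the _init variant it points back at itself, so an emptiness test on the node itself cheaply answers 'is this group still on the pending list?'.

#include <stdbool.h>

struct list_node {
	struct list_node *prev, *next;
};

static void node_init(struct list_node *n)
{
	n->prev = n->next = n;
}

/* What list_empty() checks when applied to the node itself. */
static bool node_unlinked(const struct list_node *n)
{
	return n->next == n;
}

/* Model of list_del_init(): unlink, then re-point the node at itself
 * so node_unlinked() returns true afterwards. */
static void node_del_init(struct list_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	node_init(n);
}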
--- drivers/iommu/io-pgfault.c | 242 +++++++++++++------------------ include/linux/iommu.h | 6 +- 2 files changed, 86 insertions(+), 162 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index ece09552e5cf9..05e49e2e6a52e 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -78,12 +78,33 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, return domain; } +/* Non-last request of a group. Postpone until the last one.
*/ +static int report_partial_fault(struct iommu_fault_param *fault_param, + struct iommu_fault *fault) +{ + struct iopf_fault *iopf; + + iopf = kzalloc(sizeof(*iopf), GFP_KERNEL); + if (!iopf) + return -ENOMEM; + + iopf->fault = *fault; + + mutex_lock(&fault_param->lock); + list_add(&iopf->list, &fault_param->partial); + mutex_unlock(&fault_param->lock); + + return 0; +} + /** - * iommu_handle_iopf - IO Page Fault handler - * @fault: fault event - * @iopf_param: the fault parameter of the device. + * iommu_report_device_fault() - Report fault event to device driver + * @dev: the device + * @evt: fault event data * - * Add a fault to the device workqueue, to be handled by mm. + * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ + * handler. When this function fails and the fault is recoverable, it is the + * caller's responsibility to complete the fault. * * This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard * them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't @@ -118,34 +139,37 @@ static struct iommu_domain *get_domain_for_iopf(struct device *dev, * * Return: 0 on success and <0 on error. */ -static int iommu_handle_iopf(struct iommu_fault *fault, - struct iommu_fault_param *iopf_param) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { - int ret; - struct iopf_group *group; - struct iommu_domain *domain; + struct iommu_fault *fault = &evt->fault; + struct iommu_fault_param *iopf_param; struct iopf_fault *iopf, *next; - struct device *dev = iopf_param->dev; - - lockdep_assert_held(&iopf_param->lock); + struct iommu_domain *domain; + struct iopf_group *group; + int ret; if (fault->type != IOMMU_FAULT_PAGE_REQ) - /* Not a recoverable page fault */ return -EOPNOTSUPP; - if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - iopf = kzalloc(sizeof(*iopf), GFP_KERNEL); - if (!iopf) - return -ENOMEM; - - iopf->fault = *fault; + iopf_param = iopf_get_dev_fault_param(dev); + if (!iopf_param) + return -ENODEV; - /* Non-last request of a group. Postpone until the last one */ - list_add(&iopf->list, &iopf_param->partial); + if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { + ret = report_partial_fault(iopf_param, fault); + iopf_put_dev_fault_param(iopf_param); - return 0; + return ret; } + /* + * This is the last page fault of a group. Allocate an iopf group and + * pass it to domain's page fault handler. The group holds a reference + * count of the fault parameter. It will be released after response or + * error path of this function. If an error is returned, the caller + * will send a response to the hardware. We need to clean up before + * leaving, otherwise partial faults will be stuck. + */ domain = get_domain_for_iopf(dev, fault); if (!domain) { ret = -EINVAL; @@ -154,11 +178,6 @@ static int iommu_handle_iopf(struct iommu_fault *fault, group = kzalloc(sizeof(*group), GFP_KERNEL); if (!group) { - /* - * The caller will send a response to the hardware. But we do - * need to clean up before leaving, otherwise partial faults - * will be stuck. 
- */ ret = -ENOMEM; goto cleanup_partial; } @@ -166,145 +185,45 @@ static int iommu_handle_iopf(struct iommu_fault *fault, group->fault_param = iopf_param; group->last_fault.fault = *fault; INIT_LIST_HEAD(&group->faults); + INIT_LIST_HEAD(&group->pending_node); group->domain = domain; list_add(&group->last_fault.list, &group->faults); /* See if we have partial faults for this group */ + mutex_lock(&iopf_param->lock); list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { if (iopf->fault.prm.grpid == fault->prm.grpid) /* Insert *before* the last fault */ list_move(&iopf->list, &group->faults); } - + list_add(&group->pending_node, &iopf_param->faults); mutex_unlock(&iopf_param->lock); + ret = domain->iopf_handler(group); - mutex_lock(&iopf_param->lock); - if (ret) + if (ret) { + mutex_lock(&iopf_param->lock); + list_del_init(&group->pending_node); + mutex_unlock(&iopf_param->lock); iopf_free_group(group); + } return ret; + cleanup_partial: + mutex_lock(&iopf_param->lock); list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { if (iopf->fault.prm.grpid == fault->prm.grpid) { list_del(&iopf->list); kfree(iopf); } } - return ret; -} - -/** - * iommu_report_device_fault() - Report fault event to device driver - * @dev: the device - * @evt: fault event data - * - * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ - * handler. When this function fails and the fault is recoverable, it is the - * caller's responsibility to complete the fault. - * - * Return 0 on success, or an error. - */ -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) -{ - bool last_prq = evt->fault.type == IOMMU_FAULT_PAGE_REQ && - (evt->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE); - struct iommu_fault_param *fault_param; - struct iopf_fault *evt_pending; - int ret; - - fault_param = iopf_get_dev_fault_param(dev); - if (!fault_param) - return -EINVAL; - - mutex_lock(&fault_param->lock); - if (last_prq) { - evt_pending = kmemdup(evt, sizeof(struct iopf_fault), - GFP_KERNEL); - if (!evt_pending) { - ret = -ENOMEM; - goto err_unlock; - } - list_add_tail(&evt_pending->list, &fault_param->faults); - } - - ret = iommu_handle_iopf(&evt->fault, fault_param); - if (ret) - goto err_free; - - mutex_unlock(&fault_param->lock); - /* The reference count of fault_param is now held by iopf_group. 
*/ - if (!last_prq) - iopf_put_dev_fault_param(fault_param); - - return 0; -err_free: - if (last_prq) { - list_del(&evt_pending->list); - kfree(evt_pending); - } -err_unlock: - mutex_unlock(&fault_param->lock); - iopf_put_dev_fault_param(fault_param); + mutex_unlock(&iopf_param->lock); + iopf_put_dev_fault_param(iopf_param); return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); -static int iommu_page_response(struct iopf_group *group, - struct iommu_page_response *msg) -{ - bool needs_pasid; - int ret = -EINVAL; - struct iopf_fault *evt; - struct iommu_fault_page_request *prm; - struct device *dev = group->fault_param->dev; - const struct iommu_ops *ops = dev_iommu_ops(dev); - bool has_pasid = msg->flags & IOMMU_PAGE_RESP_PASID_VALID; - struct iommu_fault_param *fault_param = group->fault_param; - - /* Only send response if there is a fault report pending */ - mutex_lock(&fault_param->lock); - if (list_empty(&fault_param->faults)) { - dev_warn_ratelimited(dev, "no pending PRQ, drop response\n"); - goto done_unlock; - } - /* - * Check if we have a matching page request pending to respond, - * otherwise return -EINVAL - */ - list_for_each_entry(evt, &fault_param->faults, list) { - prm = &evt->fault.prm; - if (prm->grpid != msg->grpid) - continue; - - /* - * If the PASID is required, the corresponding request is - * matched using the group ID, the PASID valid bit and the PASID - * value. Otherwise only the group ID matches request and - * response. - */ - needs_pasid = prm->flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID; - if (needs_pasid && (!has_pasid || msg->pasid != prm->pasid)) - continue; - - if (!needs_pasid && has_pasid) { - /* No big deal, just clear it. */ - msg->flags &= ~IOMMU_PAGE_RESP_PASID_VALID; - msg->pasid = 0; - } - - ret = ops->page_response(dev, evt, msg); - list_del(&evt->list); - kfree(evt); - break; - } - -done_unlock: - mutex_unlock(&fault_param->lock); - - return ret; -} - /** * iopf_queue_flush_dev - Ensure that all queued faults have been processed * @dev: the endpoint whose faults need to be flushed. 
@@ -346,18 +265,26 @@ EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); int iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status) { + struct iommu_fault_param *fault_param = group->fault_param; struct iopf_fault *iopf = &group->last_fault; + struct device *dev = group->fault_param->dev; + const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_page_response resp = { .pasid = iopf->fault.prm.pasid, .grpid = iopf->fault.prm.grpid, .code = status, }; + int ret = -EINVAL; - if ((iopf->fault.prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) && - (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID)) - resp.flags = IOMMU_PAGE_RESP_PASID_VALID; + /* Only send response if there is a fault report pending */ + mutex_lock(&fault_param->lock); + if (!list_empty(&group->pending_node)) { + ret = ops->page_response(dev, &group->last_fault, &resp); + list_del_init(&group->pending_node); + } + mutex_unlock(&fault_param->lock); - return iommu_page_response(group, &resp); + return ret; } EXPORT_SYMBOL_GPL(iopf_group_response); @@ -468,8 +395,9 @@ EXPORT_SYMBOL_GPL(iopf_queue_add_device); */ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) { - struct iopf_fault *iopf, *next; - struct iommu_page_response resp; + struct iopf_fault *partial_iopf; + struct iopf_fault *next; + struct iopf_group *group, *temp; struct dev_iommu *param = dev->iommu; struct iommu_fault_param *fault_param; const struct iommu_ops *ops = dev_iommu_ops(dev); @@ -483,21 +411,19 @@ void iopf_queue_remove_device(struct iopf_queue *queue, struct device *dev) goto unlock; mutex_lock(&fault_param->lock); - list_for_each_entry_safe(iopf, next, &fault_param->partial, list) - kfree(iopf); - - list_for_each_entry_safe(iopf, next, &fault_param->faults, list) { - memset(&resp, 0, sizeof(struct iommu_page_response)); - resp.pasid = iopf->fault.prm.pasid; - resp.grpid = iopf->fault.prm.grpid; - resp.code = IOMMU_PAGE_RESP_INVALID; + list_for_each_entry_safe(partial_iopf, next, &fault_param->partial, list) + kfree(partial_iopf); - if (iopf->fault.prm.flags & IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID) - resp.flags = IOMMU_PAGE_RESP_PASID_VALID; + list_for_each_entry_safe(group, temp, &fault_param->faults, pending_node) { + struct iopf_fault *iopf = &group->last_fault; + struct iommu_page_response resp = { + .pasid = iopf->fault.prm.pasid, + .grpid = iopf->fault.prm.grpid, + .code = IOMMU_PAGE_RESP_INVALID + }; ops->page_response(dev, iopf, &resp); - list_del(&iopf->list); - kfree(iopf); + list_del_init(&group->pending_node); } mutex_unlock(&fault_param->lock); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 53cdd6a5a6b59..1be504b1d6e67 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -106,15 +106,11 @@ enum iommu_page_response_code { /** * struct iommu_page_response - Generic page response information - * @flags: encodes whether the corresponding fields are valid - * (IOMMU_FAULT_PAGE_RESPONSE_* values) * @pasid: Process Address Space ID * @grpid: Page Request Group Index * @code: response code from &enum iommu_page_response_code */ struct iommu_page_response { -#define IOMMU_PAGE_RESP_PASID_VALID (1 << 0) - u32 flags; u32 pasid; u32 grpid; u32 code; @@ -129,6 +125,8 @@ struct iopf_fault { struct iopf_group { struct iopf_fault last_fault; struct list_head faults; + /* list node for iommu_fault_param::faults */ + struct list_head pending_node; struct work_struct work; struct iommu_domain *domain; /* The device's fault data parameter. 
*/ From f610f88ee8c3f0fe4001257dcc2143298327851f Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:26 +0800 Subject: [PATCH 086/352] iommu: Make iopf_group_response() return void The iopf_group_response() should return void, as nothing can do anything with the failure. This implies that ops->page_response() must also return void; this is consistent with what the drivers do. The failure paths, which are all integrity validations of the fault, should be WARN_ON'd, not return codes. If the iommu core fails to enqueue the fault, it should respond to the fault directly by calling ops->page_response() instead of returning an error number and relying on the iommu drivers to do so. Consolidate the error fault handling code in the core. Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240212012227.119381-16-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit b554e396e51ce3d378a560666f85c6836a8323fd) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 50 +++----- drivers/iommu/intel/iommu.h | 4 +- drivers/iommu/intel/svm.c | 18 +-- drivers/iommu/io-pgfault.c | 132 +++++++++++--------- include/linux/iommu.h | 14 +-- 5 files changed, 98 insertions(+), 120 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d62c790663950..e2f76106e05b8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -920,31 +920,29 @@ static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu, return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true); } -static int arm_smmu_page_response(struct device *dev, - struct iopf_fault *unused, - struct iommu_page_response *resp) +static void arm_smmu_page_response(struct device *dev, struct iopf_fault *unused, + struct iommu_page_response *resp) { struct arm_smmu_cmdq_ent cmd = {0}; struct arm_smmu_master *master = dev_iommu_priv_get(dev); int sid = master->streams[0].id; - if (master->stall_enabled) { - cmd.opcode = CMDQ_OP_RESUME; - cmd.resume.sid = sid; - cmd.resume.stag = resp->grpid; - switch (resp->code) { - case IOMMU_PAGE_RESP_INVALID: - case IOMMU_PAGE_RESP_FAILURE: - cmd.resume.resp = CMDQ_RESUME_0_RESP_ABORT; - break; - case IOMMU_PAGE_RESP_SUCCESS: - cmd.resume.resp = CMDQ_RESUME_0_RESP_RETRY; - break; - default: - return -EINVAL; - } - } else { - return -ENODEV; + if (WARN_ON(!master->stall_enabled)) + return; + + cmd.opcode = CMDQ_OP_RESUME; + cmd.resume.sid = sid; + cmd.resume.stag = resp->grpid; + switch (resp->code) { + case IOMMU_PAGE_RESP_INVALID: + case IOMMU_PAGE_RESP_FAILURE: + cmd.resume.resp = CMDQ_RESUME_0_RESP_ABORT; + break; + case IOMMU_PAGE_RESP_SUCCESS: + cmd.resume.resp = CMDQ_RESUME_0_RESP_RETRY; + break; + default: + break; } arm_smmu_cmdq_issue_cmd(master->smmu, &cmd); @@ -954,8 +952,6 @@ static int arm_smmu_page_response(struct device *dev, * terminated... at some point in the future. PRI_RESP is fire and * forget.
*/ - - return 0; } /* Context descriptor manipulation functions */ @@ -1516,16 +1512,6 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) } ret = iommu_report_device_fault(master->dev, &fault_evt); - if (ret && flt->type == IOMMU_FAULT_PAGE_REQ) { - /* Nobody cared, abort the access */ - struct iommu_page_response resp = { - .pasid = flt->prm.pasid, - .grpid = flt->prm.grpid, - .code = IOMMU_PAGE_RESP_FAILURE, - }; - arm_smmu_page_response(master->dev, &fault_evt, &resp); - } - out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index a03959ec439f1..404d2476a8774 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1096,8 +1096,8 @@ struct device *device_rbtree_find(struct intel_iommu *iommu, u16 rid); void intel_svm_check(struct intel_iommu *iommu); int intel_svm_enable_prq(struct intel_iommu *iommu); int intel_svm_finish_prq(struct intel_iommu *iommu); -int intel_svm_page_response(struct device *dev, struct iopf_fault *evt, - struct iommu_page_response *msg); +void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); struct iommu_domain *intel_svm_domain_alloc(void); void intel_svm_remove_dev_pasid(struct device *dev, ioasid_t pasid); void intel_drain_pasid_prq(struct device *dev, u32 pasid); diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 1f4bfeace98f7..e25cd14c383e8 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -741,9 +741,8 @@ static irqreturn_t prq_event_thread(int irq, void *d) return IRQ_RETVAL(handled); } -int intel_svm_page_response(struct device *dev, - struct iopf_fault *evt, - struct iommu_page_response *msg) +void intel_svm_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) { struct device_domain_info *info = dev_iommu_priv_get(dev); struct intel_iommu *iommu = info->iommu; @@ -752,7 +751,6 @@ int intel_svm_page_response(struct device *dev, bool private_present; bool pasid_present; bool last_page; - int ret = 0; u16 sid; prm = &evt->fault.prm; @@ -761,16 +759,6 @@ int intel_svm_page_response(struct device *dev, private_present = prm->flags & IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA; last_page = prm->flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; - if (!pasid_present) { - ret = -EINVAL; - goto out; - } - - if (prm->pasid == 0 || prm->pasid >= PASID_MAX) { - ret = -EINVAL; - goto out; - } - /* * Per VT-d spec. v3.0 ch7.7, system software must respond * with page group response if private data is present (PDP) @@ -799,8 +787,6 @@ int intel_svm_page_response(struct device *dev, qi_submit_sync(iommu, &desc, 1, 0); } -out: - return ret; } static int intel_svm_set_dev_pasid(struct iommu_domain *domain, diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 05e49e2e6a52e..6a325bff8164e 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -39,7 +39,7 @@ static void iopf_put_dev_fault_param(struct iommu_fault_param *fault_param) kfree_rcu(fault_param, rcu); } -void iopf_free_group(struct iopf_group *group) +static void __iopf_free_group(struct iopf_group *group) { struct iopf_fault *iopf, *next; @@ -50,6 +50,11 @@ void iopf_free_group(struct iopf_group *group) /* Pair with iommu_report_device_fault(). 
*/ iopf_put_dev_fault_param(group->fault_param); +} + +void iopf_free_group(struct iopf_group *group) +{ + __iopf_free_group(group); kfree(group); } EXPORT_SYMBOL_GPL(iopf_free_group); @@ -97,14 +102,49 @@ static int report_partial_fault(struct iommu_fault_param *fault_param, return 0; } +static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, + struct iopf_fault *evt, + struct iopf_group *abort_group) +{ + struct iopf_fault *iopf, *next; + struct iopf_group *group; + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) { + /* + * We always need to construct the group as we need it to abort + * the request at the driver if it can't be handled. + */ + group = abort_group; + } + + group->fault_param = iopf_param; + group->last_fault.fault = evt->fault; + INIT_LIST_HEAD(&group->faults); + INIT_LIST_HEAD(&group->pending_node); + list_add(&group->last_fault.list, &group->faults); + + /* See if we have partial faults for this group */ + mutex_lock(&iopf_param->lock); + list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { + if (iopf->fault.prm.grpid == evt->fault.prm.grpid) + /* Insert *before* the last fault */ + list_move(&iopf->list, &group->faults); + } + list_add(&group->pending_node, &iopf_param->faults); + mutex_unlock(&iopf_param->lock); + + return group; +} + /** * iommu_report_device_fault() - Report fault event to device driver * @dev: the device * @evt: fault event data * * Called by IOMMU drivers when a fault is detected, typically in a threaded IRQ - * handler. When this function fails and the fault is recoverable, it is the - * caller's responsibility to complete the fault. + * handler. If this function fails then ops->page_response() was called to + * complete evt if required. * * This module doesn't handle PCI PASID Stop Marker; IOMMU drivers must discard * them before reporting faults. A PASID Stop Marker (LRW = 0b100) doesn't @@ -143,22 +183,18 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { struct iommu_fault *fault = &evt->fault; struct iommu_fault_param *iopf_param; - struct iopf_fault *iopf, *next; - struct iommu_domain *domain; + struct iopf_group abort_group = {}; struct iopf_group *group; int ret; - if (fault->type != IOMMU_FAULT_PAGE_REQ) - return -EOPNOTSUPP; - iopf_param = iopf_get_dev_fault_param(dev); - if (!iopf_param) + if (WARN_ON(!iopf_param)) return -ENODEV; if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { ret = report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); - + /* A request that is not the last does not need to be ack'd */ return ret; } @@ -170,56 +206,33 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) * will send a response to the hardware. We need to clean up before * leaving, otherwise partial faults will be stuck. 
*/ - domain = get_domain_for_iopf(dev, fault); - if (!domain) { - ret = -EINVAL; - goto cleanup_partial; - } - - group = kzalloc(sizeof(*group), GFP_KERNEL); - if (!group) { + group = iopf_group_alloc(iopf_param, evt, &abort_group); + if (group == &abort_group) { ret = -ENOMEM; - goto cleanup_partial; + goto err_abort; } - group->fault_param = iopf_param; - group->last_fault.fault = *fault; - INIT_LIST_HEAD(&group->faults); - INIT_LIST_HEAD(&group->pending_node); - group->domain = domain; - list_add(&group->last_fault.list, &group->faults); - - /* See if we have partial faults for this group */ - mutex_lock(&iopf_param->lock); - list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { - if (iopf->fault.prm.grpid == fault->prm.grpid) - /* Insert *before* the last fault */ - list_move(&iopf->list, &group->faults); - } - list_add(&group->pending_node, &iopf_param->faults); - mutex_unlock(&iopf_param->lock); - - ret = domain->iopf_handler(group); - if (ret) { - mutex_lock(&iopf_param->lock); - list_del_init(&group->pending_node); - mutex_unlock(&iopf_param->lock); - iopf_free_group(group); + group->domain = get_domain_for_iopf(dev, fault); + if (!group->domain) { + ret = -EINVAL; + goto err_abort; } - return ret; - -cleanup_partial: - mutex_lock(&iopf_param->lock); - list_for_each_entry_safe(iopf, next, &iopf_param->partial, list) { - if (iopf->fault.prm.grpid == fault->prm.grpid) { - list_del(&iopf->list); - kfree(iopf); - } - } - mutex_unlock(&iopf_param->lock); - iopf_put_dev_fault_param(iopf_param); + /* + * On success iopf_handler must call iopf_group_response() and + * iopf_free_group() + */ + ret = group->domain->iopf_handler(group); + if (ret) + goto err_abort; + return 0; +err_abort: + iopf_group_response(group, IOMMU_PAGE_RESP_FAILURE); + if (group == &abort_group) + __iopf_free_group(group); + else + iopf_free_group(group); return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); @@ -259,11 +272,9 @@ EXPORT_SYMBOL_GPL(iopf_queue_flush_dev); * iopf_group_response - Respond a group of page faults * @group: the group of faults with the same group id * @status: the response code - * - * Return 0 on success and <0 on error. 
*/ -int iopf_group_response(struct iopf_group *group, - enum iommu_page_response_code status) +void iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) { struct iommu_fault_param *fault_param = group->fault_param; struct iopf_fault *iopf = &group->last_fault; @@ -274,17 +285,14 @@ int iopf_group_response(struct iopf_group *group, .grpid = iopf->fault.prm.grpid, .code = status, }; - int ret = -EINVAL; /* Only send response if there is a fault report pending */ mutex_lock(&fault_param->lock); if (!list_empty(&group->pending_node)) { - ret = ops->page_response(dev, &group->last_fault, &resp); + ops->page_response(dev, &group->last_fault, &resp); list_del_init(&group->pending_node); } mutex_unlock(&fault_param->lock); - - return ret; } EXPORT_SYMBOL_GPL(iopf_group_response); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 1be504b1d6e67..2f273b4c794e3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -574,9 +574,8 @@ struct iommu_ops { int (*dev_enable_feat)(struct device *dev, enum iommu_dev_features f); int (*dev_disable_feat)(struct device *dev, enum iommu_dev_features f); - int (*page_response)(struct device *dev, - struct iopf_fault *evt, - struct iommu_page_response *msg); + void (*page_response)(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid); @@ -1559,8 +1558,8 @@ void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); -int iopf_group_response(struct iopf_group *group, - enum iommu_page_response_code status); +void iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status); #else static inline int iopf_queue_add_device(struct iopf_queue *queue, struct device *dev) @@ -1602,10 +1601,9 @@ iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) return -ENODEV; } -static inline int iopf_group_response(struct iopf_group *group, - enum iommu_page_response_code status) +static inline void iopf_group_response(struct iopf_group *group, + enum iommu_page_response_code status) { - return -ENODEV; } #endif /* CONFIG_IOMMU_IOPF */ #endif /* __LINUX_IOMMU_H */ From 92bd654e6ebd3723167a47487f904479f637baa3 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 12 Feb 2024 09:22:27 +0800 Subject: [PATCH 087/352] iommu: Make iommu_report_device_fault() return void As iommu_report_device_fault() has been converted to auto-respond to a page fault if it fails to enqueue it, there's no need to return a code in any case. Make it return void. Suggested-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240212012227.119381-17-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit 3dfa64aecbafc288216b2790438d395add192c30) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 ++-- drivers/iommu/intel/svm.c | 19 ++++++---------- drivers/iommu/io-pgfault.c | 25 +++++++-------------- include/linux/iommu.h | 5 ++--- 4 files changed, 19 insertions(+), 34 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e2f76106e05b8..5a2c60075e8e2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1455,7 +1455,7 @@ arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) /* IRQ and event handlers */ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) { - int ret; + int ret = 0; u32 perm = 0; struct arm_smmu_master *master; bool ssid_valid = evt[0] & EVTQ_0_SSV; @@ -1511,7 +1511,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) goto out_unlock; } - ret = iommu_report_device_fault(master->dev, &fault_evt); + iommu_report_device_fault(master->dev, &fault_evt); out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index e25cd14c383e8..ee58f0d962272 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -561,14 +561,11 @@ static int prq_to_iommu_prot(struct page_req_dsc *req) return prot; } -static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, - struct page_req_dsc *desc) +static void intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, + struct page_req_dsc *desc) { struct iopf_fault event = { }; - if (!dev || !dev_is_pci(dev)) - return -ENODEV; - /* Fill in event data for device specific processing */ event.fault.type = IOMMU_FAULT_PAGE_REQ; event.fault.prm.addr = (u64)desc->addr << VTD_PAGE_SHIFT; @@ -601,7 +598,7 @@ static int intel_svm_prq_report(struct intel_iommu *iommu, struct device *dev, event.fault.prm.private_data[0] = ktime_to_ns(ktime_get()); } - return iommu_report_device_fault(dev, &event); + iommu_report_device_fault(dev, &event); } static void handle_bad_prq_event(struct intel_iommu *iommu, @@ -705,12 +702,10 @@ static irqreturn_t prq_event_thread(int irq, void *d) goto bad_req; } - if (intel_svm_prq_report(iommu, dev, req)) - handle_bad_prq_event(iommu, req, QI_RESP_INVALID); - else - trace_prq_report(iommu, dev, req->qw_0, req->qw_1, - req->priv_data[0], req->priv_data[1], - iommu->prq_seq_number++); + intel_svm_prq_report(iommu, dev, req); + trace_prq_report(iommu, dev, req->qw_0, req->qw_1, + req->priv_data[0], req->priv_data[1], + iommu->prq_seq_number++); mutex_unlock(&iommu->iopf_lock); prq_advance: head = (head + sizeof(*req)) & PRQ_RING_MASK; diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 6a325bff8164e..06d78fcc79fdb 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -176,26 +176,22 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, * freed after the device has stopped generating page faults (or the iommu * hardware has been set to block the page faults) and the pending page faults * have been flushed. - * - * Return: 0 on success and <0 on error. 
*/ -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) +void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { struct iommu_fault *fault = &evt->fault; struct iommu_fault_param *iopf_param; struct iopf_group abort_group = {}; struct iopf_group *group; - int ret; iopf_param = iopf_get_dev_fault_param(dev); if (WARN_ON(!iopf_param)) - return -ENODEV; + return; if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - ret = report_partial_fault(iopf_param, fault); + report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); /* A request that is not the last does not need to be ack'd */ - return ret; } /* @@ -207,25 +203,21 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) * leaving, otherwise partial faults will be stuck. */ group = iopf_group_alloc(iopf_param, evt, &abort_group); - if (group == &abort_group) { - ret = -ENOMEM; + if (group == &abort_group) goto err_abort; - } group->domain = get_domain_for_iopf(dev, fault); - if (!group->domain) { - ret = -EINVAL; + if (!group->domain) goto err_abort; - } /* * On success iopf_handler must call iopf_group_response() and * iopf_free_group() */ - ret = group->domain->iopf_handler(group); - if (ret) + if (group->domain->iopf_handler(group)) goto err_abort; - return 0; + + return; err_abort: iopf_group_response(group, IOMMU_PAGE_RESP_FAILURE); @@ -233,7 +225,6 @@ int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) __iopf_free_group(group); else iopf_free_group(group); - return ret; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2f273b4c794e3..36375e7651f48 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1557,7 +1557,7 @@ struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); -int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); +void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); void iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status); #else @@ -1595,10 +1595,9 @@ static inline void iopf_free_group(struct iopf_group *group) { } -static inline int +static inline void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { - return -ENODEV; } static inline void iopf_group_response(struct iopf_group *group, From 0a9d38f76b56715a0e361033297c0c54cb0071b9 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 11 Feb 2024 21:48:08 +0900 Subject: [PATCH 088/352] treewide: replace or remove redundant def_bool in Kconfig files 'def_bool X' is a shorthand for 'bool' plus 'default X'. 'def_bool' is redundant where 'bool' is already present, so 'def_bool X' can be replaced with 'default X', or removed if X is 'n'. Signed-off-by: Masahiro Yamada (cherry picked from commit cd14b01846612f3f3277e97bfbecba4c8cee5ce9) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- arch/parisc/Kconfig | 4 ++-- arch/riscv/kernel/tests/Kconfig.debug | 2 +- arch/x86/kvm/Kconfig | 2 +- arch/x86/xen/Kconfig | 1 - drivers/acpi/Kconfig | 1 - drivers/iommu/intel/Kconfig | 2 +- drivers/md/Kconfig | 1 - init/Kconfig | 2 +- lib/Kconfig.debug | 2 +- mm/Kconfig | 7 +++---- net/dccp/ccids/Kconfig | 2 +- 11 files changed, 11 insertions(+), 15 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 42822265c59bb..0b16b1c0dfd4f 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -237,9 +237,9 @@ config PARISC_HUGE_KERNEL def_bool y if !MODULES || UBSAN || FTRACE || COMPILE_TEST config MLONGCALLS - def_bool y if PARISC_HUGE_KERNEL bool "Enable the -mlong-calls compiler option for big kernels" if !PARISC_HUGE_KERNEL depends on PA8X00 + default PARISC_HUGE_KERNEL help If you configure the kernel to include many drivers built-in instead as modules, the kernel executable may become too big, so that the @@ -254,9 +254,9 @@ config MLONGCALLS Enabling this option will probably slow down your kernel. config 64BIT - def_bool y if "$(ARCH)" = "parisc64" bool "64-bit kernel" if "$(ARCH)" = "parisc" depends on PA8X00 + default "$(ARCH)" = "parisc64" help Enable this if you want to support 64bit kernel on PA-RISC platform. diff --git a/arch/riscv/kernel/tests/Kconfig.debug b/arch/riscv/kernel/tests/Kconfig.debug index 5dba64e8e977c..78cea5d2c2702 100644 --- a/arch/riscv/kernel/tests/Kconfig.debug +++ b/arch/riscv/kernel/tests/Kconfig.debug @@ -6,7 +6,7 @@ config AS_HAS_ULEB128 menuconfig RUNTIME_KERNEL_TESTING_MENU bool "arch/riscv/kernel runtime Testing" - def_bool y + default y help Enable riscv kernel runtime testing. diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 65ed14b6540bb..f8b4d67d10927 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -120,8 +120,8 @@ config KVM_AMD will be called kvm-amd. config KVM_AMD_SEV - def_bool y bool "AMD Secure Encrypted Virtualization (SEV) support" + default y depends on KVM_AMD && X86_64 depends on CRYPTO_DEV_SP_PSP && !(KVM_AMD=y && CRYPTO_DEV_CCP_DD=m) help diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index a65fc2ae15b49..77e788e928cd4 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -81,7 +81,6 @@ config XEN_PVH bool "Xen PVH guest support" depends on XEN && XEN_PVHVM && ACPI select PVH - def_bool n help Support for running as a Xen PVH guest. diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 3c3f8037ebedd..8ce591679d50e 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -474,7 +474,6 @@ config ACPI_BGRT config ACPI_REDUCED_HARDWARE_ONLY bool "Hardware-reduced ACPI support only" if EXPERT - def_bool n help This config item changes the way the ACPI code is built. When this option is selected, the kernel will use a specialized version of diff --git a/drivers/iommu/intel/Kconfig b/drivers/iommu/intel/Kconfig index a4a125666293f..38a7220a72782 100644 --- a/drivers/iommu/intel/Kconfig +++ b/drivers/iommu/intel/Kconfig @@ -98,8 +98,8 @@ config INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON the default value. config INTEL_IOMMU_PERF_EVENTS - def_bool y bool "Intel IOMMU performance events" + default y depends on INTEL_IOMMU && PERF_EVENTS help Selecting this option will enable the performance monitoring diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index a743e2c572fc8..0392154bbcab8 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -519,7 +519,6 @@ config DM_VERITY If unsure, say N. 
config DM_VERITY_VERIFY_ROOTHASH_SIG - def_bool n bool "Verity data device root hash signature verification support" depends on DM_VERITY select SYSTEM_DATA_VERIFICATION diff --git a/init/Kconfig b/init/Kconfig index e094e57ec456a..635d6d8a79e32 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1513,7 +1513,7 @@ config MULTIUSER config SGETMASK_SYSCALL bool "sgetmask/ssetmask syscalls support" if EXPERT - def_bool PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH + default PARISC || M68K || PPC || MIPS || X86 || SPARC || MICROBLAZE || SUPERH help sys_sgetmask and sys_ssetmask are obsolete system calls no longer supported in libc but still enabled by default in some diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6352d59c57461..86ae7b1f7dab4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2128,7 +2128,7 @@ config KCOV_IRQ_AREA_SIZE menuconfig RUNTIME_TESTING_MENU bool "Runtime Testing" - def_bool y + default y if RUNTIME_TESTING_MENU diff --git a/mm/Kconfig b/mm/Kconfig index ffc3a2ba3a8cd..4914eb3216fcc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -599,7 +599,7 @@ config MEMORY_BALLOON # support for memory balloon compaction config BALLOON_COMPACTION bool "Allow for balloon memory compaction/migration" - def_bool y + default y depends on COMPACTION && MEMORY_BALLOON help Memory fragmentation introduced by ballooning might reduce @@ -614,7 +614,7 @@ config BALLOON_COMPACTION # support for memory compaction config COMPACTION bool "Allow for memory compaction" - def_bool y + default y select MIGRATION depends on MMU help @@ -637,7 +637,6 @@ config COMPACT_UNEVICTABLE_DEFAULT # support for free page reporting config PAGE_REPORTING bool "Free page reporting" - def_bool n help Free page reporting allows for the incremental acquisition of free pages from the buddy allocator for the purpose of reporting @@ -649,7 +648,7 @@ config PAGE_REPORTING # config MIGRATION bool "Page migration" - def_bool y + default y depends on (NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION || CMA) && MMU help Allows the migration of the physical location of pages of processes diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index a3eeb84d16f9c..e3d388c33d256 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -13,7 +13,7 @@ config IP_DCCP_CCID2_DEBUG config IP_DCCP_CCID3 bool "CCID-3 (TCP-Friendly)" - def_bool y if (IP_DCCP = y || IP_DCCP = m) + default IP_DCCP = y || IP_DCCP = m help CCID-3 denotes TCP-Friendly Rate Control (TFRC), an equation-based rate-controlled congestion control mechanism. TFRC is designed to From 73aa81bea7bb52a76389ca95deb455e1b59cb588 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Wed, 31 Jan 2024 14:35:16 +0200 Subject: [PATCH 089/352] iommu/arm-smmu-qcom: Add X1E80100 MDSS compatible Add the X1E80100 MDSS compatible to clients compatible list, as it also needs the workarounds. Signed-off-by: Abel Vesa Link: https://lore.kernel.org/r/20240131-x1e80100-iommu-arm-smmu-qcom-v1-1-c1240419c718@linaro.org Signed-off-by: Will Deacon (cherry picked from commit 12721e66005798b2a07bd8309060c230e52ba2b0) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs
---
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 8b04ece00420d..5c7cfc51b57c0 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -260,6 +260,7 @@ static const struct of_device_id qcom_smmu_client_of_match[] __maybe_unused = {
 	{ .compatible = "qcom,sm6375-mdss" },
 	{ .compatible = "qcom,sm8150-mdss" },
 	{ .compatible = "qcom,sm8250-mdss" },
+	{ .compatible = "qcom,x1e80100-mdss" },
 	{ }
 };

From 469be24138357bdfb597654718d817f77464ad3d Mon Sep 17 00:00:00 2001
From: Kunwu Chan
Date: Mon, 15 Jan 2024 14:34:34 +0800
Subject: [PATCH 090/352] vfio/pci: WARN_ON driver_override kasprintf failure

kasprintf() returns a pointer to dynamically allocated memory which can
be NULL upon failure. This is a blocking notifier callback, so errno
isn't a proper return value. Use WARN_ON() to flag small allocation
failures.

Signed-off-by: Kunwu Chan
Link: https://lore.kernel.org/r/20240115063434.20278-1-chentao@kylinos.cn
Signed-off-by: Alex Williamson
(cherry picked from commit 19032628bd7ce8a39cdf0521b6418bf88c25ec80)
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/vfio_pci_core.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
index 142c9e494e506..ed10ff38e0dc5 100644
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -2067,6 +2067,7 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb,
 			 pci_name(pdev));
 		pdev->driver_override = kasprintf(GFP_KERNEL, "%s",
 						  vdev->vdev.ops->name);
+		WARN_ON(!pdev->driver_override);
 	} else if (action == BUS_NOTIFY_BOUND_DRIVER &&
 		   pdev->is_virtfn && physfn == vdev->pdev) {
 		struct pci_driver *drv = pci_dev_driver(pdev);

From a059d662cfe3bae6f7ab4b35e706080a042e67f2 Mon Sep 17 00:00:00 2001
From: "Ricardo B. Marliere"
Date: Thu, 8 Feb 2024 17:02:04 -0300
Subject: [PATCH 091/352] vfio: mdev: make mdev_bus_type const

Now that the driver core can properly handle constant struct bus_type,
move the mdev_bus_type variable to be a constant structure as well,
placing it into read-only memory which cannot be modified at runtime.

Cc: Greg Kroah-Hartman
Suggested-by: Greg Kroah-Hartman
Signed-off-by: Ricardo B. Marliere
Reviewed-by: Greg Kroah-Hartman
Reviewed-by: Jason Gunthorpe
Reviewed-by: Kirti Wankhede
Link: https://lore.kernel.org/r/20240208-bus_cleanup-vfio-v1-1-ed5da3019949@marliere.net
Signed-off-by: Alex Williamson
(cherry picked from commit 77943f4d2de0c5fa284013b97967e6c271c04310)
Signed-off-by: Matthew R.
Ochs
---
 drivers/vfio/mdev/mdev_driver.c  | 2 +-
 drivers/vfio/mdev/mdev_private.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c
index 7825d83a55f8c..b98322966b3ed 100644
--- a/drivers/vfio/mdev/mdev_driver.c
+++ b/drivers/vfio/mdev/mdev_driver.c
@@ -40,7 +40,7 @@ static int mdev_match(struct device *dev, struct device_driver *drv)
 	return 0;
 }

-struct bus_type mdev_bus_type = {
+const struct bus_type mdev_bus_type = {
 	.name = "mdev",
 	.probe = mdev_probe,
 	.remove = mdev_remove,
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
index af457b27f6074..63a1316b08b72 100644
--- a/drivers/vfio/mdev/mdev_private.h
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -13,7 +13,7 @@ int mdev_bus_register(void);
 void mdev_bus_unregister(void);

-extern struct bus_type mdev_bus_type;
+extern const struct bus_type mdev_bus_type;
 extern const struct attribute_group *mdev_device_groups[];

 #define to_mdev_type_attr(_attr) \

From a3a31d05c34108f5fc34516c900e3ec6f2e7fb28 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Tue, 20 Feb 2024 17:20:53 +0530
Subject: [PATCH 092/352] vfio/pci: rename and export do_io_rw()

do_io_rw() is used to read/write to the device MMIO. The Grace Hopper
VFIO PCI variant driver requires this functionality to read/write to
its memory. Rename it as a vfio_pci_core function and export it as GPL.

Reviewed-by: Kevin Tian
Reviewed-by: Yishai Hadas
Signed-off-by: Ankit Agrawal
Link: https://lore.kernel.org/r/20240220115055.23546-2-ankita@nvidia.com
Signed-off-by: Alex Williamson
(cherry picked from commit 4de676d494cd8fb2b4c65e58c19ebbdb36673957)
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/vfio_pci_rdwr.c | 16 +++++++++-------
 include/linux/vfio_pci_core.h    |  5 ++++-
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_rdwr.c b/drivers/vfio/pci/vfio_pci_rdwr.c
index 07fea08ea8a21..03b8f7ada1ac2 100644
--- a/drivers/vfio/pci/vfio_pci_rdwr.c
+++ b/drivers/vfio/pci/vfio_pci_rdwr.c
@@ -96,10 +96,10 @@ VFIO_IOREAD(32)
  * reads with -1. This is intended for handling MSI-X vector tables and
  * leftover space for ROM BARs.
  */
-static ssize_t do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
-			void __iomem *io, char __user *buf,
-			loff_t off, size_t count, size_t x_start,
-			size_t x_end, bool iswrite)
+ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
+			       void __iomem *io, char __user *buf,
+			       loff_t off, size_t count, size_t x_start,
+			       size_t x_end, bool iswrite)
 {
 	ssize_t done = 0;
 	int ret;
@@ -201,6 +201,7 @@ static ssize_t do_io_rw(struct vfio_pci_core_device *vdev,
 	return done;
 }
+EXPORT_SYMBOL_GPL(vfio_pci_core_do_io_rw);

 int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar)
 {
@@ -279,8 +280,8 @@ ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
 		x_end = vdev->msix_offset + vdev->msix_size;
 	}

-	done = do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
-			count, x_start, x_end, iswrite);
+	done = vfio_pci_core_do_io_rw(vdev, res->flags & IORESOURCE_MEM, io, buf, pos,
+				      count, x_start, x_end, iswrite);

 	if (done >= 0)
 		*ppos += done;
@@ -348,7 +349,8 @@ ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
 	 * probing, so we don't currently worry about access in relation
 	 * to the memory enable bit in the command register.
	 */
-	done = do_io_rw(vdev, false, iomem, buf, off, count, 0, 0, iswrite);
+	done = vfio_pci_core_do_io_rw(vdev, false, iomem, buf, off, count,
+				      0, 0, iswrite);

 	vga_put(vdev->pdev, rsrc);

diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index 85e84b92751b6..cf9480a31f3e0 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -130,7 +130,10 @@ void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
 int vfio_pci_core_setup_barmap(struct vfio_pci_core_device *vdev, int bar);
 pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
 						pci_channel_state_t state);
-
+ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem,
+			       void __iomem *io, char __user *buf,
+			       loff_t off, size_t count, size_t x_start,
+			       size_t x_end, bool iswrite);
 #define VFIO_IOWRITE_DECLATION(size) \
 int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev,	\
 				bool test_mem, u##size val, void __iomem *io);

From 49e319b61bb349bdbe75d581044e3b4d499ecf35 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Tue, 20 Feb 2024 17:20:54 +0530
Subject: [PATCH 093/352] vfio/pci: rename and export range_intersect_range

range_intersect_range determines an overlap between two ranges. If
there is an overlap, the helper function returns the overlapping offset
and size.

The VFIO PCI variant driver emulates the PCI config space BAR offset
registers. These offsets may be accessed for read/write with a variety
of lengths including sub-word sizes from sub-word offsets. The driver
makes use of this helper function to read/write the targeted part of
the emulated register.

Make this a vfio_pci_core function, rename it, and export it as GPL.
Also update references in the virtio driver.

Reviewed-by: Kevin Tian
Reviewed-by: Yishai Hadas
Signed-off-by: Ankit Agrawal
Link: https://lore.kernel.org/r/20240220115055.23546-3-ankita@nvidia.com
Signed-off-by: Alex Williamson
(cherry picked from commit 30e920e1debb437e5aea7a4ccdab61634354297a)
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/vfio_pci_config.c | 42 +++++++++++++++++
 drivers/vfio/pci/virtio/main.c     | 72 +++++++++++-------------------
 include/linux/vfio_pci_core.h      |  5 +++
 3 files changed, 73 insertions(+), 46 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c
index 7e2e62ab0869c..97422aafaa7b5 100644
--- a/drivers/vfio/pci/vfio_pci_config.c
+++ b/drivers/vfio/pci/vfio_pci_config.c
@@ -1966,3 +1966,45 @@ ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev, char __user *buf,
 	return done;
 }
+
+/**
+ * vfio_pci_core_range_intersect_range() - Determine overlap between a buffer
+ *					   and register offset ranges.
+ * @buf_start:		start offset of the buffer
+ * @buf_cnt:		number of buffer bytes
+ * @reg_start:		start register offset
+ * @reg_cnt:		number of register bytes
+ * @buf_offset:		start offset of overlap in the buffer
+ * @intersect_count:	number of overlapping bytes
+ * @register_offset:	start offset of overlap in register
+ *
+ * Returns: true if there is overlap, false if not.
+ * The overlap start and size is returned through function args.
+ */ +bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, + loff_t reg_start, size_t reg_cnt, + loff_t *buf_offset, + size_t *intersect_count, + size_t *register_offset) +{ + if (buf_start <= reg_start && + buf_start + buf_cnt > reg_start) { + *buf_offset = reg_start - buf_start; + *intersect_count = min_t(size_t, reg_cnt, + buf_start + buf_cnt - reg_start); + *register_offset = 0; + return true; + } + + if (buf_start > reg_start && + buf_start < reg_start + reg_cnt) { + *buf_offset = 0; + *intersect_count = min_t(size_t, buf_cnt, + reg_start + reg_cnt - buf_start); + *register_offset = buf_start - reg_start; + return true; + } + + return false; +} +EXPORT_SYMBOL_GPL(vfio_pci_core_range_intersect_range); diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index d5af683837d34..b5d3a8c5bbc9a 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -132,33 +132,6 @@ virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev, return ret ? ret : count; } -static bool range_intersect_range(loff_t range1_start, size_t count1, - loff_t range2_start, size_t count2, - loff_t *start_offset, - size_t *intersect_count, - size_t *register_offset) -{ - if (range1_start <= range2_start && - range1_start + count1 > range2_start) { - *start_offset = range2_start - range1_start; - *intersect_count = min_t(size_t, count2, - range1_start + count1 - range2_start); - *register_offset = 0; - return true; - } - - if (range1_start > range2_start && - range1_start < range2_start + count2) { - *start_offset = 0; - *intersect_count = min_t(size_t, count1, - range2_start + count2 - range1_start); - *register_offset = range1_start - range2_start; - return true; - } - - return false; -} - static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) @@ -178,16 +151,18 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, if (ret < 0) return ret; - if (range_intersect_range(pos, count, PCI_DEVICE_ID, sizeof(val16), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET); if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) return -EFAULT; } if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) && - range_intersect_range(pos, count, PCI_COMMAND, sizeof(val16), - ©_offset, ©_count, ®ister_offset)) { + vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset, copy_count)) return -EFAULT; @@ -197,16 +172,18 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, return -EFAULT; } - if (range_intersect_range(pos, count, PCI_REVISION_ID, sizeof(val8), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID, + sizeof(val8), ©_offset, + ©_count, ®ister_offset)) { /* Transional needs to have revision 0 */ val8 = 0; if (copy_to_user(buf + copy_offset, &val8, copy_count)) return -EFAULT; } - if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, sizeof(val32), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, + sizeof(val32), ©_offset, + ©_count, ®ister_offset)) { u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1); u32 pci_base_addr_0 = 
le32_to_cpu(virtvdev->pci_base_addr_0); @@ -215,8 +192,9 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, return -EFAULT; } - if (range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, sizeof(val16), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { /* * Transitional devices use the PCI subsystem device id as * virtio device id, same as legacy driver always did. @@ -227,8 +205,9 @@ static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, return -EFAULT; } - if (range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, sizeof(val16), - ©_offset, ©_count, ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET); if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) @@ -270,19 +249,20 @@ static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev, loff_t copy_offset; size_t copy_count; - if (range_intersect_range(pos, count, PCI_COMMAND, sizeof(virtvdev->pci_cmd), - ©_offset, ©_count, - ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, + sizeof(virtvdev->pci_cmd), + ©_offset, ©_count, + ®ister_offset)) { if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset, buf + copy_offset, copy_count)) return -EFAULT; } - if (range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, - sizeof(virtvdev->pci_base_addr_0), - ©_offset, ©_count, - ®ister_offset)) { + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, + sizeof(virtvdev->pci_base_addr_0), + ©_offset, ©_count, + ®ister_offset)) { if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset, buf + copy_offset, copy_count)) diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h index cf9480a31f3e0..a2c8b8bba7119 100644 --- a/include/linux/vfio_pci_core.h +++ b/include/linux/vfio_pci_core.h @@ -134,6 +134,11 @@ ssize_t vfio_pci_core_do_io_rw(struct vfio_pci_core_device *vdev, bool test_mem, void __iomem *io, char __user *buf, loff_t off, size_t count, size_t x_start, size_t x_end, bool iswrite); +bool vfio_pci_core_range_intersect_range(loff_t buf_start, size_t buf_cnt, + loff_t reg_start, size_t reg_cnt, + loff_t *buf_offset, + size_t *intersect_count, + size_t *register_offset); #define VFIO_IOWRITE_DECLATION(size) \ int vfio_pci_core_iowrite##size(struct vfio_pci_core_device *vdev, \ bool test_mem, u##size val, void __iomem *io); From fb0c8a79d3f7ede1044dc4cdc8802ee063e7ab9f Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Tue, 20 Feb 2024 17:20:55 +0530 Subject: [PATCH 094/352] vfio/nvgrace-gpu: Add vfio pci variant module for grace hopper NVIDIA's upcoming Grace Hopper Superchip provides a PCI-like device for the on-chip GPU that is the logical OS representation of the internal proprietary chip-to-chip cache coherent interconnect. The device is peculiar compared to a real PCI device in that whilst there is a real 64b PCI BAR1 (comprising region 2 & region 3) on the device, it is not used to access device memory once the faster chip-to-chip interconnect is initialized (occurs at the time of host system boot). The device memory is accessed instead using the chip-to-chip interconnect that is exposed as a contiguous physically addressable region on the host. 
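For orientation, here is a condensed sketch of the aperture discovery
that the following paragraphs describe. It is based on
nvgrace_gpu_fetch_memory_property() later in this patch; the DSD
property names are taken verbatim from the driver, while its overflow
checks and finer error handling are trimmed for brevity:

#include <linux/pci.h>
#include <linux/property.h>

/*
 * Condensed sketch of the device memory discovery, not the literal
 * driver code: read the aperture base and size from the ACPI DSD
 * properties published by the system firmware.
 */
static int gpu_mem_properties(struct pci_dev *pdev, u64 *base, u64 *size)
{
	int ret;

	ret = device_property_read_u64(&pdev->dev,
				       "nvidia,gpu-mem-base-pa", base);
	if (ret)
		return ret;

	ret = device_property_read_u64(&pdev->dev,
				       "nvidia,gpu-mem-size", size);
	if (ret)
		return ret;

	/* A zero length means the C2C link did not come up. */
	return *size ? 0 : -ENOMEM;
}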
This device memory aperture can be obtained from the host ACPI table
using device_property_read_u64(), according to the FW specification.
Since the device memory is cache coherent with the CPU, it can be
mmapped into the user VMA with a cacheable mapping using
remap_pfn_range() and used like regular RAM. The device memory is not
added to the host kernel, but mapped directly as this reduces memory
wastage due to struct pages.

There is also a requirement of a minimum reserved 1G uncached region
(termed as resmem) to support the Multi-Instance GPU (MIG) feature [1].
This is to work around a HW defect. Based on [2], the requisite
properties (uncached, unaligned access) can be achieved through a VM
mapping (S1) of NORMAL_NC and a host (S2) mapping with
MemAttr[2:0]=0b101. To provide a different non-cached property to the
reserved 1G region, it needs to be carved out from the device memory
and mapped as a separate region in the Qemu VMA with
pgprot_writecombine(). pgprot_writecombine() sets the Qemu VMA page
properties (pgprot) as NORMAL_NC.

Provide a VFIO PCI variant driver that adapts the unique device memory
representation into a more standard PCI representation facing
userspace.

The variant driver exposes these two regions - the non-cached reserved
(resmem) and the cached rest of the device memory (termed as usemem) -
as separate VFIO 64b BAR regions. This is divergent from the baremetal
approach, where the device memory is exposed as a device memory region.
The decision for a different approach was taken in view of the fact
that it would necessitate additional code in Qemu to discover and
insert those regions in the VM IPA, along with the additional VM ACPI
DSDT changes to communicate the device memory region IPA to the VM
workloads. Moreover, this behavior would have to be added to a variety
of emulators (beyond top of tree Qemu) out there desiring Grace Hopper
support.

Since the device implements 64-bit BAR0, the VFIO PCI variant driver
maps the uncached carved-out region to the next available PCI BAR
(i.e. comprising regions 2 and 3). The cached device memory aperture is
assigned BAR regions 4 and 5. Qemu will then naturally generate a PCI
device in the VM with the uncached aperture reported as the BAR2 region
and the cacheable one as BAR4. The variant driver provides emulation
for these fake BARs' PCI config space offset registers.

The hardware ensures that the system does not crash when the memory is
accessed with the memory enable turned off. It synthesizes ~0 reads and
drops writes on such access. So there is no need to support the
disablement/enablement of the BAR through the PCI_COMMAND config space
register.

The memory layout on the host looks like the following:
               devmem (memlength)
|--------------------------------------------------|
|-------------cached------------------------|--NC--|
|                                            |
usemem.memphys                        resmem.memphys

PCI BARs need to be aligned to a power-of-2, but the actual memory on
the device may not be. A read or write access to the physical address
from the last device PFN up to the next power-of-2 aligned physical
address results in reading ~0 and dropped writes.

Note that the GPU device driver [5] is capable of knowing the exact
device memory size through separate means. The device memory size is
primarily kept in the system ACPI tables for use by the VFIO PCI
variant module.

Note that the usemem memory is added by the VM Nvidia device driver [5]
to the VM kernel as memblocks. Hence make the usable memory size
memblock (MEMBLK_SIZE) aligned.
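In code form, the carve-out described above reduces to the following
sketch, simplified from nvgrace_gpu_init_nvdev_struct() in this patch;
the real code additionally checks every step for arithmetic overflow:

/*
 * Simplified sketch of the usemem/resmem split; nvdev, RESMEM_SIZE and
 * MEMBLK_SIZE are defined by this patch.
 */
static void carve_out_regions(struct nvgrace_gpu_pci_core_device *nvdev,
			      u64 memphys, u64 memlength)
{
	nvdev->usemem.memphys = memphys;
	/* Keep usemem MEMBLK_SIZE aligned; resmem takes the tail. */
	nvdev->usemem.memlength = round_down(memlength - RESMEM_SIZE,
					     MEMBLK_SIZE);
	nvdev->resmem.memphys = nvdev->usemem.memphys +
				nvdev->usemem.memlength;
	nvdev->resmem.memlength = memlength - nvdev->usemem.memlength;

	/* Both regions are reported to userspace as power-of-2 sized BARs. */
	nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength);
	nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength);
}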
MEMBLK_SIZE is a hardwired ABI value between the GPU FW and the VFIO
driver. The VM device driver makes use of the same value in its
calculation to determine the USEMEM size.

Currently there is no provision in KVM for an S2 mapping with
MemAttr[2:0]=0b101, but there is an ongoing effort to provide the same
[3]. As previously mentioned, resmem is mapped with
pgprot_writecombine(), which sets the Qemu VMA page properties (pgprot)
as NORMAL_NC. Using the proposed changes in [3] and [4], KVM marks the
region with MemAttr[2:0]=0b101 in S2.

If the device memory properties are not present, the driver registers
the vfio-pci-core function pointers. Since there are no ACPI memory
properties generated for the VM, the variant driver inside the VM will
only use the vfio-pci-core ops and hence try to map the BARs as
non-cached. This is not a problem as the CPUs have FWB enabled, which
blocks the VM mapping's ability to override the cacheability set by the
host mapping.

This goes along with a qemu series [6] that provides the necessary
implementation of the Grace Hopper Superchip firmware specification so
that the guest operating system can see the correct ACPI modeling for
the coherent GPU device. Verified with the CUDA workload in the VM.

[1] https://www.nvidia.com/en-in/technologies/multi-instance-gpu/
[2] section D8.5.5 of https://developer.arm.com/documentation/ddi0487/latest/
[3] https://lore.kernel.org/all/20240211174705.31992-1-ankita@nvidia.com/
[4] https://lore.kernel.org/all/20230907181459.18145-2-ankita@nvidia.com/
[5] https://github.com/NVIDIA/open-gpu-kernel-modules
[6] https://lore.kernel.org/all/20231203060245.31593-1-ankita@nvidia.com/

Reviewed-by: Kevin Tian
Reviewed-by: Yishai Hadas
Reviewed-by: Zhi Wang
Signed-off-by: Aniket Agashe
Signed-off-by: Ankit Agrawal
Link: https://lore.kernel.org/r/20240220115055.23546-4-ankita@nvidia.com
Signed-off-by: Alex Williamson
(cherry picked from commit 701ab935859fcfd4a8c8a97f3ee4fb5294a9d481)
Signed-off-by: Matthew R.
Ochs --- MAINTAINERS | 6 + drivers/vfio/pci/Kconfig | 2 + drivers/vfio/pci/Makefile | 2 + drivers/vfio/pci/nvgrace-gpu/Kconfig | 10 + drivers/vfio/pci/nvgrace-gpu/Makefile | 3 + drivers/vfio/pci/nvgrace-gpu/main.c | 879 ++++++++++++++++++++++++++ 6 files changed, 902 insertions(+) create mode 100644 drivers/vfio/pci/nvgrace-gpu/Kconfig create mode 100644 drivers/vfio/pci/nvgrace-gpu/Makefile create mode 100644 drivers/vfio/pci/nvgrace-gpu/main.c diff --git a/MAINTAINERS b/MAINTAINERS index 5c71554a5392e..268d88f7423e7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23108,6 +23108,12 @@ L: virtualization@lists.linux-foundation.org S: Maintained F: drivers/vfio/pci/virtio +VFIO NVIDIA GRACE GPU DRIVER +M: Ankit Agrawal +L: kvm@vger.kernel.org +S: Supported +F: drivers/vfio/pci/nvgrace-gpu/ + VFIO PCI DEVICE SPECIFIC DRIVERS R: Jason Gunthorpe R: Yishai Hadas diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 18c397df566d8..15821a2d77d25 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -67,4 +67,6 @@ source "drivers/vfio/pci/pds/Kconfig" source "drivers/vfio/pci/virtio/Kconfig" +source "drivers/vfio/pci/nvgrace-gpu/Kconfig" + endmenu diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index 046139a4eca5b..ce7a61f1d912b 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -15,3 +15,5 @@ obj-$(CONFIG_HISI_ACC_VFIO_PCI) += hisilicon/ obj-$(CONFIG_PDS_VFIO_PCI) += pds/ obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio/ + +obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu/ diff --git a/drivers/vfio/pci/nvgrace-gpu/Kconfig b/drivers/vfio/pci/nvgrace-gpu/Kconfig new file mode 100644 index 0000000000000..a7f624b37e410 --- /dev/null +++ b/drivers/vfio/pci/nvgrace-gpu/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config NVGRACE_GPU_VFIO_PCI + tristate "VFIO support for the GPU in the NVIDIA Grace Hopper Superchip" + depends on ARM64 || (COMPILE_TEST && 64BIT) + select VFIO_PCI_CORE + help + VFIO support for the GPU in the NVIDIA Grace Hopper Superchip is + required to assign the GPU device to userspace using KVM/qemu/etc. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/nvgrace-gpu/Makefile b/drivers/vfio/pci/nvgrace-gpu/Makefile new file mode 100644 index 0000000000000..3ca8c187897a9 --- /dev/null +++ b/drivers/vfio/pci/nvgrace-gpu/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu-vfio-pci.o +nvgrace-gpu-vfio-pci-y := main.o diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c new file mode 100644 index 0000000000000..25814006352d0 --- /dev/null +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -0,0 +1,879 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include + +/* + * The device memory usable to the workloads running in the VM is cached + * and showcased as a 64b device BAR (comprising of BAR4 and BAR5 region) + * to the VM and is represented as usemem. + * Moreover, the VM GPU device driver needs a non-cacheable region to + * support the MIG feature. This region is also exposed as a 64b BAR + * (comprising of BAR2 and BAR3 region) and represented as resmem. 
+ */ +#define RESMEM_REGION_INDEX VFIO_PCI_BAR2_REGION_INDEX +#define USEMEM_REGION_INDEX VFIO_PCI_BAR4_REGION_INDEX + +/* Memory size expected as non cached and reserved by the VM driver */ +#define RESMEM_SIZE SZ_1G + +/* A hardwired and constant ABI value between the GPU FW and VFIO driver. */ +#define MEMBLK_SIZE SZ_512M + +/* + * The state of the two device memory region - resmem and usemem - is + * saved as struct mem_region. + */ +struct mem_region { + phys_addr_t memphys; /* Base physical address of the region */ + size_t memlength; /* Region size */ + size_t bar_size; /* Reported region BAR size */ + __le64 bar_val; /* Emulated BAR offset registers */ + union { + void *memaddr; + void __iomem *ioaddr; + }; /* Base virtual address of the region */ +}; + +struct nvgrace_gpu_pci_core_device { + struct vfio_pci_core_device core_device; + /* Cached and usable memory for the VM. */ + struct mem_region usemem; + /* Non cached memory carved out from the end of device memory */ + struct mem_region resmem; + /* Lock to control device memory kernel mapping */ + struct mutex remap_lock; +}; + +static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + + nvdev->resmem.bar_val = 0; + nvdev->usemem.bar_val = 0; +} + +/* Choose the structure corresponding to the fake BAR with a given index. */ +static struct mem_region * +nvgrace_gpu_memregion(int index, + struct nvgrace_gpu_pci_core_device *nvdev) +{ + if (index == USEMEM_REGION_INDEX) + return &nvdev->usemem; + + if (index == RESMEM_REGION_INDEX) + return &nvdev->resmem; + + return NULL; +} + +static int nvgrace_gpu_open_device(struct vfio_device *core_vdev) +{ + struct vfio_pci_core_device *vdev = + container_of(core_vdev, struct vfio_pci_core_device, vdev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + int ret; + + ret = vfio_pci_core_enable(vdev); + if (ret) + return ret; + + if (nvdev->usemem.memlength) { + nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); + mutex_init(&nvdev->remap_lock); + } + + vfio_pci_core_finish_enable(vdev); + + return 0; +} + +static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + + /* Unmap the mapping to the device memory cached region */ + if (nvdev->usemem.memaddr) { + memunmap(nvdev->usemem.memaddr); + nvdev->usemem.memaddr = NULL; + } + + /* Unmap the mapping to the device memory non-cached region */ + if (nvdev->resmem.ioaddr) { + iounmap(nvdev->resmem.ioaddr); + nvdev->resmem.ioaddr = NULL; + } + + mutex_destroy(&nvdev->remap_lock); + + vfio_pci_core_close_device(core_vdev); +} + +static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, + struct vm_area_struct *vma) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + struct mem_region *memregion; + unsigned long start_pfn; + u64 req_len, pgoff, end; + unsigned int index; + int ret = 0; + + index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT); + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return vfio_pci_core_mmap(core_vdev, vma); + + /* + * Request to mmap the BAR. 
Map to the CPU accessible memory on the + * GPU using the memory information gathered from the system ACPI + * tables. + */ + pgoff = vma->vm_pgoff & + ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); + + if (check_sub_overflow(vma->vm_end, vma->vm_start, &req_len) || + check_add_overflow(PHYS_PFN(memregion->memphys), pgoff, &start_pfn) || + check_add_overflow(PFN_PHYS(pgoff), req_len, &end)) + return -EOVERFLOW; + + /* + * Check that the mapping request does not go beyond available device + * memory size + */ + if (end > memregion->memlength) + return -EINVAL; + + /* + * The carved out region of the device memory needs the NORMAL_NC + * property. Communicate as such to the hypervisor. + */ + if (index == RESMEM_REGION_INDEX) + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + + /* + * Perform a PFN map to the memory and back the device BAR by the + * GPU memory. + * + * The available GPU memory size may not be power-of-2 aligned. The + * remainder is only backed by vfio_device_ops read/write handlers. + * + * During device reset, the GPU is safely disconnected to the CPU + * and access to the BAR will be immediately returned preventing + * machine check. + */ + ret = remap_pfn_range(vma, vma->vm_start, start_pfn, + req_len, vma->vm_page_prot); + if (ret) + return ret; + + vma->vm_pgoff = start_pfn; + + return 0; +} + +static long +nvgrace_gpu_ioctl_get_region_info(struct vfio_device *core_vdev, + unsigned long arg) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + struct vfio_info_cap caps = { .buf = NULL, .size = 0 }; + struct vfio_region_info_cap_sparse_mmap *sparse; + struct vfio_region_info info; + struct mem_region *memregion; + u32 size; + int ret; + + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + /* + * Request to determine the BAR region information. Send the + * GPU memory information. + */ + memregion = nvgrace_gpu_memregion(info.index, nvdev); + if (!memregion) + return vfio_pci_core_ioctl(core_vdev, + VFIO_DEVICE_GET_REGION_INFO, arg); + + size = struct_size(sparse, areas, 1); + + /* + * Setup for sparse mapping for the device memory. Only the + * available device memory on the hardware is shown as a + * mappable region. + */ + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; + + sparse->nr_areas = 1; + sparse->areas[0].offset = 0; + sparse->areas[0].size = memregion->memlength; + sparse->header.id = VFIO_REGION_INFO_CAP_SPARSE_MMAP; + sparse->header.version = 1; + + ret = vfio_info_add_capability(&caps, &sparse->header, size); + kfree(sparse); + if (ret) + return ret; + + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + /* + * The region memory size may not be power-of-2 aligned. + * Given that the memory as a BAR and may not be + * aligned, roundup to the next power-of-2. 
+ */ + info.size = memregion->bar_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE | + VFIO_REGION_INFO_FLAG_MMAP; + + if (caps.size) { + info.flags |= VFIO_REGION_INFO_FLAG_CAPS; + if (info.argsz < sizeof(info) + caps.size) { + info.argsz = sizeof(info) + caps.size; + info.cap_offset = 0; + } else { + vfio_info_cap_shift(&caps, sizeof(info)); + if (copy_to_user((void __user *)arg + + sizeof(info), caps.buf, + caps.size)) { + kfree(caps.buf); + return -EFAULT; + } + info.cap_offset = sizeof(info); + } + kfree(caps.buf); + } + return copy_to_user((void __user *)arg, &info, minsz) ? + -EFAULT : 0; +} + +static long nvgrace_gpu_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case VFIO_DEVICE_GET_REGION_INFO: + return nvgrace_gpu_ioctl_get_region_info(core_vdev, arg); + case VFIO_DEVICE_IOEVENTFD: + return -ENOTTY; + case VFIO_DEVICE_RESET: + nvgrace_gpu_init_fake_bar_emu_regs(core_vdev); + fallthrough; + default: + return vfio_pci_core_ioctl(core_vdev, cmd, arg); + } +} + +static __le64 +nvgrace_gpu_get_read_value(size_t bar_size, u64 flags, __le64 val64) +{ + u64 tmp_val; + + tmp_val = le64_to_cpu(val64); + tmp_val &= ~(bar_size - 1); + tmp_val |= flags; + + return cpu_to_le64(tmp_val); +} + +/* + * Both the usable (usemem) and the reserved (resmem) device memory region + * are exposed as a 64b fake device BARs in the VM. These fake BARs must + * respond to the accesses on their respective PCI config space offsets. + * + * resmem BAR owns PCI_BASE_ADDRESS_2 & PCI_BASE_ADDRESS_3. + * usemem BAR owns PCI_BASE_ADDRESS_4 & PCI_BASE_ADDRESS_5. + */ +static ssize_t +nvgrace_gpu_read_config_emu(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + struct mem_region *memregion = NULL; + __le64 val64; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + int ret; + + ret = vfio_pci_core_read(core_vdev, buf, count, ppos); + if (ret < 0) + return ret; + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, + sizeof(val64), + ©_offset, ©_count, + ®ister_offset)) + memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); + else if (vfio_pci_core_range_intersect_range(pos, count, + PCI_BASE_ADDRESS_4, + sizeof(val64), + ©_offset, ©_count, + ®ister_offset)) + memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); + + if (memregion) { + val64 = nvgrace_gpu_get_read_value(memregion->bar_size, + PCI_BASE_ADDRESS_MEM_TYPE_64 | + PCI_BASE_ADDRESS_MEM_PREFETCH, + memregion->bar_val); + if (copy_to_user(buf + copy_offset, + (void *)&val64 + register_offset, copy_count)) { + /* + * The position has been incremented in + * vfio_pci_core_read. Reset the offset back to the + * starting position. 
+ */ + *ppos -= count; + return -EFAULT; + } + } + + return count; +} + +static ssize_t +nvgrace_gpu_write_config_emu(struct vfio_device *core_vdev, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + u64 pos = *ppos & VFIO_PCI_OFFSET_MASK; + struct mem_region *memregion = NULL; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_2, + sizeof(u64), ©_offset, + ©_count, ®ister_offset)) + memregion = nvgrace_gpu_memregion(RESMEM_REGION_INDEX, nvdev); + else if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_4, + sizeof(u64), ©_offset, + ©_count, ®ister_offset)) + memregion = nvgrace_gpu_memregion(USEMEM_REGION_INDEX, nvdev); + + if (memregion) { + if (copy_from_user((void *)&memregion->bar_val + register_offset, + buf + copy_offset, copy_count)) + return -EFAULT; + *ppos += copy_count; + return copy_count; + } + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +/* + * Ad hoc map the device memory in the module kernel VA space. Primarily needed + * as vfio does not require the userspace driver to only perform accesses through + * mmaps of the vfio-pci BAR regions and such accesses should be supported using + * vfio_device_ops read/write implementations. + * + * The usemem region is cacheable memory and hence is memremaped. + * The resmem region is non-cached and is mapped using ioremap_wc (NORMAL_NC). + */ +static int +nvgrace_gpu_map_device_mem(int index, + struct nvgrace_gpu_pci_core_device *nvdev) +{ + struct mem_region *memregion; + int ret = 0; + + memregion = nvgrace_gpu_memregion(index, nvdev); + if (!memregion) + return -EINVAL; + + mutex_lock(&nvdev->remap_lock); + + if (memregion->memaddr) + goto unlock; + + if (index == USEMEM_REGION_INDEX) + memregion->memaddr = memremap(memregion->memphys, + memregion->memlength, + MEMREMAP_WB); + else + memregion->ioaddr = ioremap_wc(memregion->memphys, + memregion->memlength); + + if (!memregion->memaddr) + ret = -ENOMEM; + +unlock: + mutex_unlock(&nvdev->remap_lock); + + return ret; +} + +/* + * Read the data from the device memory (mapped either through ioremap + * or memremap) into the user buffer. + */ +static int +nvgrace_gpu_map_and_read(struct nvgrace_gpu_pci_core_device *nvdev, + char __user *buf, size_t mem_count, loff_t *ppos) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; + int ret; + + if (!mem_count) + return 0; + + /* + * Handle read on the BAR regions. Map to the target device memory + * physical address and copy to the request read buffer. + */ + ret = nvgrace_gpu_map_device_mem(index, nvdev); + if (ret) + return ret; + + if (index == USEMEM_REGION_INDEX) { + if (copy_to_user(buf, + (u8 *)nvdev->usemem.memaddr + offset, + mem_count)) + ret = -EFAULT; + } else { + /* + * The hardware ensures that the system does not crash when + * the device memory is accessed with the memory enable + * turned off. It synthesizes ~0 on such read. So there is + * no need to check or support the disablement/enablement of + * BAR through PCI_COMMAND config space register. Pass + * test_mem flag as false. + */ + ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, + nvdev->resmem.ioaddr, + buf, offset, mem_count, + 0, 0, false); + } + + return ret; +} + +/* + * Read count bytes from the device memory at an offset. 
The actual device + * memory size (available) may not be a power-of-2. So the driver fakes + * the size to a power-of-2 (reported) when exposing to a user space driver. + * + * Reads starting beyond the reported size generate -EINVAL; reads extending + * beyond the actual device size is filled with ~0; reads extending beyond + * the reported size are truncated. + */ +static ssize_t +nvgrace_gpu_read_mem(struct nvgrace_gpu_pci_core_device *nvdev, + char __user *buf, size_t count, loff_t *ppos) +{ + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct mem_region *memregion; + size_t mem_count, i; + u8 val = 0xFF; + int ret; + + /* No need to do NULL check as caller does. */ + memregion = nvgrace_gpu_memregion(index, nvdev); + + if (offset >= memregion->bar_size) + return -EINVAL; + + /* Clip short the read request beyond reported BAR size */ + count = min(count, memregion->bar_size - (size_t)offset); + + /* + * Determine how many bytes to be actually read from the device memory. + * Read request beyond the actual device memory size is filled with ~0, + * while those beyond the actual reported size is skipped. + */ + if (offset >= memregion->memlength) + mem_count = 0; + else + mem_count = min(count, memregion->memlength - (size_t)offset); + + ret = nvgrace_gpu_map_and_read(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + + /* + * Only the device memory present on the hardware is mapped, which may + * not be power-of-2 aligned. A read to an offset beyond the device memory + * size is filled with ~0. + */ + for (i = mem_count; i < count; i++) { + ret = put_user(val, (unsigned char __user *)(buf + i)); + if (ret) + return ret; + } + + *ppos += count; + return count; +} + +static ssize_t +nvgrace_gpu_read(struct vfio_device *core_vdev, + char __user *buf, size_t count, loff_t *ppos) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + + if (nvgrace_gpu_memregion(index, nvdev)) + return nvgrace_gpu_read_mem(nvdev, buf, count, ppos); + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return nvgrace_gpu_read_config_emu(core_vdev, buf, count, ppos); + + return vfio_pci_core_read(core_vdev, buf, count, ppos); +} + +/* + * Write the data to the device memory (mapped either through ioremap + * or memremap) from the user buffer. + */ +static int +nvgrace_gpu_map_and_write(struct nvgrace_gpu_pci_core_device *nvdev, + const char __user *buf, size_t mem_count, + loff_t *ppos) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + int ret; + + if (!mem_count) + return 0; + + ret = nvgrace_gpu_map_device_mem(index, nvdev); + if (ret) + return ret; + + if (index == USEMEM_REGION_INDEX) { + if (copy_from_user((u8 *)nvdev->usemem.memaddr + pos, + buf, mem_count)) + return -EFAULT; + } else { + /* + * The hardware ensures that the system does not crash when + * the device memory is accessed with the memory enable + * turned off. It drops such writes. So there is no need to + * check or support the disablement/enablement of BAR + * through PCI_COMMAND config space register. Pass test_mem + * flag as false. + */ + ret = vfio_pci_core_do_io_rw(&nvdev->core_device, false, + nvdev->resmem.ioaddr, + (char __user *)buf, pos, mem_count, + 0, 0, true); + } + + return ret; +} + +/* + * Write count bytes to the device memory at a given offset. 
The actual device + * memory size (available) may not be a power-of-2. So the driver fakes the + * size to a power-of-2 (reported) when exposing to a user space driver. + * + * Writes extending beyond the reported size are truncated; writes starting + * beyond the reported size generate -EINVAL. + */ +static ssize_t +nvgrace_gpu_write_mem(struct nvgrace_gpu_pci_core_device *nvdev, + size_t count, loff_t *ppos, const char __user *buf) +{ + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + u64 offset = *ppos & VFIO_PCI_OFFSET_MASK; + struct mem_region *memregion; + size_t mem_count; + int ret = 0; + + /* No need to do NULL check as caller does. */ + memregion = nvgrace_gpu_memregion(index, nvdev); + + if (offset >= memregion->bar_size) + return -EINVAL; + + /* Clip short the write request beyond reported BAR size */ + count = min(count, memregion->bar_size - (size_t)offset); + + /* + * Determine how many bytes to be actually written to the device memory. + * Do not write to the offset beyond available size. + */ + if (offset >= memregion->memlength) + goto exitfn; + + /* + * Only the device memory present on the hardware is mapped, which may + * not be power-of-2 aligned. Drop access outside the available device + * memory on the hardware. + */ + mem_count = min(count, memregion->memlength - (size_t)offset); + + ret = nvgrace_gpu_map_and_write(nvdev, buf, mem_count, ppos); + if (ret) + return ret; + +exitfn: + *ppos += count; + return count; +} + +static ssize_t +nvgrace_gpu_write(struct vfio_device *core_vdev, + const char __user *buf, size_t count, loff_t *ppos) +{ + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_vdev, struct nvgrace_gpu_pci_core_device, + core_device.vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + + if (nvgrace_gpu_memregion(index, nvdev)) + return nvgrace_gpu_write_mem(nvdev, count, ppos, buf); + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return nvgrace_gpu_write_config_emu(core_vdev, buf, count, ppos); + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +static const struct vfio_device_ops nvgrace_gpu_pci_ops = { + .name = "nvgrace-gpu-vfio-pci", + .init = vfio_pci_core_init_dev, + .release = vfio_pci_core_release_dev, + .open_device = nvgrace_gpu_open_device, + .close_device = nvgrace_gpu_close_device, + .ioctl = nvgrace_gpu_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = nvgrace_gpu_read, + .write = nvgrace_gpu_write, + .mmap = nvgrace_gpu_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { + .name = "nvgrace-gpu-vfio-pci-core", + .init = vfio_pci_core_init_dev, + .release = vfio_pci_core_release_dev, + .open_device = nvgrace_gpu_open_device, + .close_device = vfio_pci_core_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +static int +nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, + u64 
*pmemphys, u64 *pmemlength) +{ + int ret; + + /* + * The memory information is present in the system ACPI tables as DSD + * properties nvidia,gpu-mem-base-pa and nvidia,gpu-mem-size. + */ + ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-base-pa", + pmemphys); + if (ret) + return ret; + + if (*pmemphys > type_max(phys_addr_t)) + return -EOVERFLOW; + + ret = device_property_read_u64(&pdev->dev, "nvidia,gpu-mem-size", + pmemlength); + if (ret) + return ret; + + if (*pmemlength > type_max(size_t)) + return -EOVERFLOW; + + /* + * If the C2C link is not up due to an error, the coherent device + * memory size is returned as 0. Fail in such case. + */ + if (*pmemlength == 0) + return -ENOMEM; + + return ret; +} + +static int +nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, + struct nvgrace_gpu_pci_core_device *nvdev, + u64 memphys, u64 memlength) +{ + int ret = 0; + + /* + * The VM GPU device driver needs a non-cacheable region to support + * the MIG feature. Since the device memory is mapped as NORMAL cached, + * carve out a region from the end with a different NORMAL_NC + * property (called as reserved memory and represented as resmem). This + * region then is exposed as a 64b BAR (region 2 and 3) to the VM, while + * exposing the rest (termed as usable memory and represented using usemem) + * as cacheable 64b BAR (region 4 and 5). + * + * devmem (memlength) + * |-------------------------------------------------| + * | | + * usemem.memphys resmem.memphys + */ + nvdev->usemem.memphys = memphys; + + /* + * The device memory exposed to the VM is added to the kernel by the + * VM driver module in chunks of memory block size. Only the usable + * memory (usemem) is added to the kernel for usage by the VM + * workloads. Make the usable memory size memblock aligned. + */ + if (check_sub_overflow(memlength, RESMEM_SIZE, + &nvdev->usemem.memlength)) { + ret = -EOVERFLOW; + goto done; + } + + /* + * The USEMEM part of the device memory has to be MEMBLK_SIZE + * aligned. This is a hardwired ABI value between the GPU FW and + * VFIO driver. The VM device driver is also aware of it and make + * use of the value for its calculation to determine USEMEM size. + */ + nvdev->usemem.memlength = round_down(nvdev->usemem.memlength, + MEMBLK_SIZE); + if (nvdev->usemem.memlength == 0) { + ret = -EINVAL; + goto done; + } + + if ((check_add_overflow(nvdev->usemem.memphys, + nvdev->usemem.memlength, + &nvdev->resmem.memphys)) || + (check_sub_overflow(memlength, nvdev->usemem.memlength, + &nvdev->resmem.memlength))) { + ret = -EOVERFLOW; + goto done; + } + + /* + * The memory regions are exposed as BARs. Calculate and save + * the BAR size for them. + */ + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); + nvdev->resmem.bar_size = roundup_pow_of_two(nvdev->resmem.memlength); +done: + return ret; +} + +static int nvgrace_gpu_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops; + struct nvgrace_gpu_pci_core_device *nvdev; + u64 memphys, memlength; + int ret; + + ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); + if (!ret) + ops = &nvgrace_gpu_pci_ops; + + nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev, + &pdev->dev, ops); + if (IS_ERR(nvdev)) + return PTR_ERR(nvdev); + + dev_set_drvdata(&pdev->dev, &nvdev->core_device); + + if (ops == &nvgrace_gpu_pci_ops) { + /* + * Device memory properties are identified in the host ACPI + * table. 
Set the nvgrace_gpu_pci_core_device structure. + */ + ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev, + memphys, memlength); + if (ret) + goto out_put_vdev; + } + + ret = vfio_pci_core_register_device(&nvdev->core_device); + if (ret) + goto out_put_vdev; + + return ret; + +out_put_vdev: + vfio_put_device(&nvdev->core_device.vdev); + return ret; +} + +static void nvgrace_gpu_remove(struct pci_dev *pdev) +{ + struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + + vfio_pci_core_unregister_device(core_device); + vfio_put_device(&core_device->vdev); +} + +static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { + /* GH200 120GB */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, + /* GH200 480GB */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, + {} +}; + +MODULE_DEVICE_TABLE(pci, nvgrace_gpu_vfio_pci_table); + +static struct pci_driver nvgrace_gpu_vfio_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = nvgrace_gpu_vfio_pci_table, + .probe = nvgrace_gpu_probe, + .remove = nvgrace_gpu_remove, + .err_handler = &vfio_pci_core_err_handlers, + .driver_managed_dma = true, +}; + +module_pci_driver(nvgrace_gpu_vfio_pci_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ankit Agrawal "); +MODULE_AUTHOR("Aniket Agashe "); +MODULE_DESCRIPTION("VFIO NVGRACE GPU PF - User Level driver for NVIDIA devices with CPU coherently accessible device memory"); From b9be74a1e7aacd6003cd37098587c0beefcdadbd Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sat, 24 Feb 2024 20:35:43 +0530 Subject: [PATCH 095/352] KVM: arm64: Introduce new flag for non-cacheable IO memory Currently, KVM for ARM64 maps at stage 2 memory that is considered device (i.e. it is not RAM) with DEVICE_nGnRE memory attributes; this setting overrides (as per the ARM architecture [1]) any device MMIO mapping present at stage 1, resulting in a set-up whereby a guest operating system cannot determine device MMIO mapping memory attributes on its own but it is always overridden by the KVM stage 2 default. This set-up does not allow guest operating systems to select device memory attributes independently from KVM stage-2 mappings (refer to [1], "Combining stage 1 and stage 2 memory type attributes"), which turns out to be an issue in that guest operating systems (e.g. Linux) may request to map devices MMIO regions with memory attributes that guarantee better performance (e.g. gathering attribute - that for some devices can generate larger PCIe memory writes TLPs) and specific operations (e.g. unaligned transactions) such as the NormalNC memory type. The default device stage 2 mapping was chosen in KVM for ARM64 since it was considered safer (i.e. it would not allow guests to trigger uncontained failures ultimately crashing the machine) but this turned out to be asynchronous (SError) defeating the purpose. Failures containability is a property of the platform and is independent from the memory type used for MMIO device memory mappings. Actually, DEVICE_nGnRE memory type is even more problematic than Normal-NC memory type in terms of faults containability in that e.g. aborts triggered on DEVICE_nGnRE loads cannot be made, architecturally, synchronous (i.e. that would imply that the processor should issue at most 1 load transaction at a time - it cannot pipeline them - otherwise the synchronous abort semantics would break the no-speculation attribute attached to DEVICE_XXX memory). 
This means that regardless of the combined stage1+stage2 mappings a platform is safe if and only if device transactions cannot trigger uncontained failures and that in turn relies on platform capabilities and the device type being assigned (i.e. PCIe AER/DPC error containment and RAS architecture[3]); therefore the default KVM device stage 2 memory attributes play no role in making device assignment safer for a given platform (if the platform design adheres to design guidelines outlined in [3]) and therefore can be relaxed. For all these reasons, relax the KVM stage 2 device memory attributes from DEVICE_nGnRE to Normal-NC. The NormalNC was chosen over a different Normal memory type default at stage-2 (e.g. Normal Write-through) to avoid cache allocation/snooping. Relaxing S2 KVM device MMIO mappings to Normal-NC is not expected to trigger any issue on guest device reclaim use cases either (i.e. device MMIO unmap followed by a device reset) at least for PCIe devices, in that in PCIe a device reset is architected and carried out through PCI config space transactions that are naturally ordered with respect to MMIO transactions according to the PCI ordering rules. Having Normal-NC S2 default puts guests in control (thanks to stage1+stage2 combined memory attributes rules [1]) of device MMIO regions memory mappings, according to the rules described in [1] and summarized here ([(S1) - stage1], [(S2) - stage 2]): S1 | S2 | Result NORMAL-WB | NORMAL-NC | NORMAL-NC NORMAL-WT | NORMAL-NC | NORMAL-NC NORMAL-NC | NORMAL-NC | NORMAL-NC DEVICE | NORMAL-NC | DEVICE It is worth noting that currently, to map devices MMIO space to user space in a device pass-through use case the VFIO framework applies memory attributes derived from pgprot_noncached() settings applied to VMAs, which result in device-nGnRnE memory attributes for the stage-1 VMM mappings. This means that a userspace mapping for device MMIO space carried out with the current VFIO framework and a guest OS mapping for the same MMIO space may result in a mismatched alias as described in [2]. Defaulting KVM device stage-2 mappings to Normal-NC attributes does not change anything in this respect, in that the mismatched aliases would only affect (refer to [2] for a detailed explanation) ordering between the userspace and GuestOS mappings resulting stream of transactions (i.e. it does not cause loss of property for either stream of transactions on its own), which is harmless given that the userspace and GuestOS access to the device is carried out through independent transactions streams. A Normal-NC flag is not present today. So add a new kvm_pgtable_prot (KVM_PGTABLE_PROT_NORMAL_NC) flag for it, along with its corresponding PTE value 0x5 (0b101) determined from [1]. Lastly, adapt the stage2 PTE property setter function (stage2_set_prot_attr) to handle the NormalNC attribute. The entire discussion leading to this patch series may be followed through the following links. 
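As a reasoning aid, the combining rules tabulated earlier in this message
reduce to a tiny function. The following is an illustrative model only, not
kernel code; the enum and helper names are hypothetical:

/* Sketch: ARM stage-1/stage-2 memory attribute combining when stage 2
 * is Normal-NC, per the S1/S2 table above. Hypothetical names. */
enum mem_attr { MEM_NORMAL_WB, MEM_NORMAL_WT, MEM_NORMAL_NC, MEM_DEVICE };

static enum mem_attr combine_with_s2_normal_nc(enum mem_attr s1)
{
	/* Device at stage 1 is preserved; any Normal type at stage 1
	 * resolves to Normal-NC, so the combined result is never
	 * cacheable. */
	return (s1 == MEM_DEVICE) ? MEM_DEVICE : MEM_NORMAL_NC;
}

In particular, a guest driver that asks for a write-combining mapping (for
example via ioremap_wc(), which on arm64 yields a Normal-NC stage-1 mapping)
now actually receives Normal-NC semantics instead of being silently
downgraded to Device by the stage-2 default.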
Link: https://lore.kernel.org/all/20230907181459.18145-3-ankita@nvidia.com Link: https://lore.kernel.org/r/20231205033015.10044-1-ankita@nvidia.com [1] section D8.5.5 - DDI0487J_a_a-profile_architecture_reference_manual.pdf [2] section B2.8 - DDI0487J_a_a-profile_architecture_reference_manual.pdf [3] sections 1.7.7.3/1.8.5.2/appendix C - DEN0029H_SBSA_7.1.pdf Suggested-by: Jason Gunthorpe Acked-by: Catalin Marinas Acked-by: Will Deacon Reviewed-by: Marc Zyngier Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20240224150546.368-2-ankita@nvidia.com Signed-off-by: Oliver Upton (cherry picked from commit c034ec84e8795cf379bd47cc8871445f070a0110) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- arch/arm64/include/asm/kvm_pgtable.h | 2 ++ arch/arm64/include/asm/memory.h | 2 ++ arch/arm64/kvm/hyp/pgtable.c | 24 +++++++++++++++++++----- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index cfdf40f734b12..19278dfe79782 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -197,6 +197,7 @@ enum kvm_pgtable_stage2_flags { * @KVM_PGTABLE_PROT_W: Write permission. * @KVM_PGTABLE_PROT_R: Read permission. * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + * @KVM_PGTABLE_PROT_NORMAL_NC: Normal noncacheable attributes. * @KVM_PGTABLE_PROT_SW0: Software bit 0. * @KVM_PGTABLE_PROT_SW1: Software bit 1. * @KVM_PGTABLE_PROT_SW2: Software bit 2. @@ -208,6 +209,7 @@ enum kvm_pgtable_prot { KVM_PGTABLE_PROT_R = BIT(2), KVM_PGTABLE_PROT_DEVICE = BIT(3), + KVM_PGTABLE_PROT_NORMAL_NC = BIT(4), KVM_PGTABLE_PROT_SW0 = BIT(55), KVM_PGTABLE_PROT_SW1 = BIT(56), diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index d82305ab420f7..449ca2ff1df60 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -173,6 +173,7 @@ * Memory types for Stage-2 translation */ #define MT_S2_NORMAL 0xf +#define MT_S2_NORMAL_NC 0x5 #define MT_S2_DEVICE_nGnRE 0x1 /* @@ -180,6 +181,7 @@ * Stage-2 enforces Normal-WB and Device-nGnRE */ #define MT_S2_FWB_NORMAL 6 +#define MT_S2_FWB_NORMAL_NC 5 #define MT_S2_FWB_DEVICE_nGnRE 1 #ifdef CONFIG_ARM64_4K_PAGES diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index ce5cef7d73c41..b7cb9b67a063f 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -717,15 +717,29 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu, static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot, kvm_pte_t *ptep) { - bool device = prot & KVM_PGTABLE_PROT_DEVICE; - kvm_pte_t attr = device ? 
KVM_S2_MEMATTR(pgt, DEVICE_nGnRE) : - KVM_S2_MEMATTR(pgt, NORMAL); + kvm_pte_t attr; u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS; + switch (prot & (KVM_PGTABLE_PROT_DEVICE | + KVM_PGTABLE_PROT_NORMAL_NC)) { + case KVM_PGTABLE_PROT_DEVICE | KVM_PGTABLE_PROT_NORMAL_NC: + return -EINVAL; + case KVM_PGTABLE_PROT_DEVICE: + if (prot & KVM_PGTABLE_PROT_X) + return -EINVAL; + attr = KVM_S2_MEMATTR(pgt, DEVICE_nGnRE); + break; + case KVM_PGTABLE_PROT_NORMAL_NC: + if (prot & KVM_PGTABLE_PROT_X) + return -EINVAL; + attr = KVM_S2_MEMATTR(pgt, NORMAL_NC); + break; + default: + attr = KVM_S2_MEMATTR(pgt, NORMAL); + } + if (!(prot & KVM_PGTABLE_PROT_X)) attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN; - else if (device) - return -EINVAL; if (prot & KVM_PGTABLE_PROT_R) attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; From ae8a6b0a9f6ad97350473b74a1112cb3a4e52f9d Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sat, 24 Feb 2024 20:35:44 +0530 Subject: [PATCH 096/352] mm: Introduce new flag to indicate wc safe The VM_ALLOW_ANY_UNCACHED flag is implemented for ARM64, allowing KVM stage 2 device mapping attributes to use NormalNC rather than DEVICE_nGnRE, which allows guest mappings supporting write-combining attributes (WC). ARM does not architecturally guarantee this is safe, and indeed some MMIO regions like the GICv2 VCPU interface can trigger uncontained faults if NormalNC is used. Even worse, the expectation is that there are platforms where even DEVICE_nGnRE can allow uncontained faults in corner cases. Unfortunately existing ARM IP requires platform integration to take responsibility to prevent this. To safely use VFIO in KVM the platform must guarantee full safety in the guest where no action taken against a MMIO mapping can trigger an uncontained failure. The assumption is that most VFIO PCI platforms support this for both mapping types, at least in common flows, based on some expectations of how PCI IP is integrated. This can be enabled more broadly, for instance into vfio-platform drivers, but only after the platform vendor completes auditing for safety. The VMA flag VM_ALLOW_ANY_UNCACHED was found to be the simplest and cleanest way to communicate the information from VFIO to KVM that mapping the region in S2 as NormalNC is safe. KVM consumes it to activate the code that does the S2 mapping as NormalNC. Suggested-by: Catalin Marinas Reviewed-by: Jason Gunthorpe Reviewed-by: Marc Zyngier Acked-by: David Hildenbrand Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20240224150546.368-3-ankita@nvidia.com Signed-off-by: Oliver Upton (cherry picked from commit 5c656fcdd6c60f71fccb07fe7b9d8d7e6c9811ff) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- include/linux/mm.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index adbab01e2e90a..dedd899d538ff 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -391,6 +391,20 @@ extern unsigned int kobjsize(const void *objp); # define VM_UFFD_MINOR VM_NONE #endif /* CONFIG_HAVE_ARCH_USERFAULTFD_MINOR */ +/* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. 
+ */
+#ifdef CONFIG_64BIT
+#define VM_ALLOW_ANY_UNCACHED_BIT 39
+#define VM_ALLOW_ANY_UNCACHED BIT(VM_ALLOW_ANY_UNCACHED_BIT)
+#else
+#define VM_ALLOW_ANY_UNCACHED VM_NONE
+#endif
+
 /* Bits set in the VMA until the stack is in its final location */
 #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY)

From 17ae616201c2c54269ef79c9da0d036fc9e0ece1 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Sat, 24 Feb 2024 20:35:45 +0530
Subject: [PATCH 097/352] KVM: arm64: Set io memory s2 pte as normalnc for
 vfio pci device

To provide the VM with the ability to get device IO memory with NormalNC
property, map device MMIO in KVM for ARM64 at stage2 as NormalNC.

Having NormalNC S2 default puts guests in control (based on [1],
"Combining stage 1 and stage 2 memory type attributes") of device
MMIO regions memory mappings. The rules are summarized below:
([(S1) - stage1], [(S2) - stage 2])

S1         |  S2         |  Result
NORMAL-WB  |  NORMAL-NC  |  NORMAL-NC
NORMAL-WT  |  NORMAL-NC  |  NORMAL-NC
NORMAL-NC  |  NORMAL-NC  |  NORMAL-NC
DEVICE     |  NORMAL-NC  |  DEVICE

Still this cannot be generalized to non-PCI devices such as GICv2.
There is insufficient information and uncertainty in the behavior of
non-PCI drivers. A driver must indicate support using the new flag
VM_ALLOW_ANY_UNCACHED.

Adapt KVM to make use of the flag VM_ALLOW_ANY_UNCACHED as an indicator
to activate the S2 setting to NormalNC.

[1] section D8.5.5 of DDI0487J_a_a-profile_architecture_reference_manual.pdf

Suggested-by: Catalin Marinas
Acked-by: Jason Gunthorpe
Reviewed-by: Catalin Marinas
Reviewed-by: Marc Zyngier
Signed-off-by: Ankit Agrawal
Link: https://lore.kernel.org/r/20240224150546.368-4-ankita@nvidia.com
Signed-off-by: Oliver Upton
(cherry picked from commit 8c47ce3e1d2c285349edf426b98e8460ce3e2f33)
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs --- arch/arm64/kvm/mmu.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 92270acfc00d4..29db0d71cdd7e 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1381,7 +1381,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, int ret = 0; bool write_fault, writable, force_pte = false; bool exec_fault, mte_allowed; - bool device = false; + bool device = false, vfio_allow_any_uc = false; unsigned long mmu_seq; struct kvm *kvm = vcpu->kvm; struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache; @@ -1472,6 +1472,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, gfn = fault_ipa >> PAGE_SHIFT; mte_allowed = kvm_vma_mte_allowed(vma); + vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; + /* Don't use the VMA after the unlock -- it may have vanished */ vma = NULL; @@ -1557,10 +1559,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) prot |= KVM_PGTABLE_PROT_X; - if (device) - prot |= KVM_PGTABLE_PROT_DEVICE; - else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) + if (device) { + if (vfio_allow_any_uc) + prot |= KVM_PGTABLE_PROT_NORMAL_NC; + else + prot |= KVM_PGTABLE_PROT_DEVICE; + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { prot |= KVM_PGTABLE_PROT_X; + } /* * Under the premise of getting a FSC_PERM fault, we just need to relax From 95d2c7d1927f0cf360525f2476a9afd2f2fa439f Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sat, 24 Feb 2024 20:35:46 +0530 Subject: [PATCH 098/352] vfio: Convey kvm that the vfio-pci device is wc safe The VM_ALLOW_ANY_UNCACHED flag is implemented for ARM64, allowing KVM stage 2 device mapping attributes to use Normal-NC rather than DEVICE_nGnRE, which allows guest mappings supporting write-combining attributes (WC). ARM does not architecturally guarantee this is safe, and indeed some MMIO regions like the GICv2 VCPU interface can trigger uncontained faults if Normal-NC is used. To safely use VFIO in KVM the platform must guarantee full safety in the guest where no action taken against a MMIO mapping can trigger an uncontained failure. The expectation is that most VFIO PCI platforms support this for both mapping types, at least in common flows, based on some expectations of how PCI IP is integrated. So make vfio-pci set the VM_ALLOW_ANY_UNCACHED flag. Suggested-by: Catalin Marinas Acked-by: Jason Gunthorpe Acked-by: Catalin Marinas Acked-by: Alex Williamson Reviewed-by: David Hildenbrand Reviewed-by: Marc Zyngier Signed-off-by: Ankit Agrawal Link: https://lore.kernel.org/r/20240224150546.368-5-ankita@nvidia.com Signed-off-by: Oliver Upton (cherry picked from commit a39d3a966a090989b89c0287a67cd98c85ae2f52) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/vfio/pci/vfio_pci_core.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index ed10ff38e0dc5..d8c95cc16be81 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1882,8 +1882,25 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma /* * See remap_pfn_range(), called from vfio_pci_fault() but we can't * change vm_flags within the fault handler. Set them now. 
+ *
+ * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
+ * allowing KVM stage 2 device mapping attributes to use Normal-NC
+ * rather than DEVICE_nGnRE, which allows guest mappings
+ * supporting write-combining attributes (WC). ARM does not
+ * architecturally guarantee this is safe, and indeed some MMIO
+ * regions like the GICv2 VCPU interface can trigger uncontained
+ * faults if Normal-NC is used.
+ *
+ * To safely use VFIO in KVM the platform must guarantee full
+ * safety in the guest where no action taken against a MMIO
+ * mapping can trigger an uncontained failure. The assumption is
+ * that most VFIO PCI platforms support this for both mapping types,
+ * at least in common flows, based on some expectations of how
+ * PCI IP is integrated. Hence VM_ALLOW_ANY_UNCACHED is set in
+ * the VMA flags.
 */
- vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+ vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED | VM_IO | VM_PFNMAP |
+ VM_DONTEXPAND | VM_DONTDUMP);
 vma->vm_ops = &vfio_pci_mmap_ops;

 return 0;

From 63d5049fbaf990f5861ffe1f97365e6cae60bc11 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Mon, 26 Feb 2024 13:07:12 -0400
Subject: [PATCH 099/352] iommu/arm-smmu-v3: Make STE programming independent
 of the callers

As the comment in arm_smmu_write_strtab_ent() explains, this routine has
been limited to only work correctly in certain scenarios that the caller
must ensure. Generally the caller must put the STE into ABORT or BYPASS
before attempting to program it to something else.

The iommu core APIs would ideally expect the driver to do a hitless change
of iommu_domain in a number of cases:

- RESV_DIRECT support wants IDENTITY -> DMA -> IDENTITY to be hitless for
  the RESV ranges

- PASID upgrade has IDENTITY on the RID with no PASID then a PASID paging
  domain installed. The RID should not be impacted

- PASID downgrade has IDENTITY on the RID and all PASIDs removed. The RID
  should not be impacted

- RID does PAGING -> BLOCKING with active PASID, PASIDs should not be
  impacted

- NESTING -> NESTING for carrying all the above hitless cases in a VM into
  the hypervisor. To comprehensively emulate the HW in a VM we should assume
  the VM OS is running logic like this and expecting hitless updates to be
  relayed to real HW.

For CD updates arm_smmu_write_ctx_desc() has a similar comment explaining
how limited it is, and the driver does have a need for hitless CD updates:

- SMMUv3 BTM S1 ASID re-label

- SVA mm release should change the CD to answer not-present to all requests
  without allowing logging (EPD0)

The next patches/series are going to start removing some of this logic from
the callers, and add more complex state combinations than currently. At the
end everything that can be hitless will be hitless, including all of the
above.

Introduce arm_smmu_write_ste() which will run through the multi-qword
programming sequence to avoid creating an incoherent 'torn' STE in the HW
caches. It automatically detects which of two algorithms to use:

1) The disruptive V=0 update described in the spec which disrupts the entry
   and does three syncs to make the change:
   - Write V=0 to QWORD 0
   - Write the entire STE except QWORD 0
   - Write QWORD 0

2) A hitless update algorithm that follows the same rationale that the
   driver already uses.
It is safe to change IGNORED bits that HW doesn't use: - Write the target value into all currently unused bits - Write a single QWORD, this makes the new STE live atomically - Ensure now unused bits are 0 The detection of which path to use and the implementation of the hitless update rely on a "used bitmask" describing what bits the HW is actually using based on the V/CFG/etc bits. This flows from the spec language, typically indicated as IGNORED. Knowing which bits the HW is using we can update the bits it does not use and then compute how many QWORDS need to be changed. If only one qword needs to be updated the hitless algorithm is possible. Later patches will include CD updates in this mechanism so make the implementation generic using a struct arm_smmu_entry_writer and struct arm_smmu_entry_writer_ops to abstract the differences between STE and CD to be plugged in. At this point it generates the same sequence of updates as the current code, except that zeroing the VMID on entry to BYPASS/ABORT will do an extra sync (this seems to be an existing bug). Going forward this will use a V=0 transition instead of cycling through ABORT if a hitfull change is required. This seems more appropriate as ABORT will fail DMAs without any logging, but dropping a DMA due to transient V=0 is probably signaling a bug, so the C_BAD_STE is valuable. Add STRTAB_STE_1_SHCFG_INCOMING to s2_cfg, this was editing the STE in place and subtly inherited the value of data[1] from abort/bypass. Signed-off-by: Michael Shavit Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 7da51af9125c624318c8099de13c5ddefd47e9e8) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 275 +++++++++++++++----- 1 file changed, 211 insertions(+), 64 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 5a2c60075e8e2..050ef96129bb4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -47,6 +47,9 @@ enum arm_smmu_msi_index { ARM_SMMU_MAX_MSIS, }; +static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, + ioasid_t sid); + static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = { [EVTQ_MSI_INDEX] = { ARM_SMMU_EVTQ_IRQ_CFG0, @@ -966,6 +969,199 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } +/* + * Based on the value of ent report which bits of the STE the HW will access. It + * would be nice if this was complete according to the spec, but minimally it + * has to capture the bits this driver uses. 
+ */ +static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent, + struct arm_smmu_ste *used_bits) +{ + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent->data[0])); + + used_bits->data[0] = cpu_to_le64(STRTAB_STE_0_V); + if (!(ent->data[0] & cpu_to_le64(STRTAB_STE_0_V))) + return; + + used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_CFG); + + /* S1 translates */ + if (cfg & BIT(0)) { + used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT | + STRTAB_STE_0_S1CTXPTR_MASK | + STRTAB_STE_0_S1CDMAX); + used_bits->data[1] |= + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW | + STRTAB_STE_1_EATS); + used_bits->data[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); + } + + /* S2 translates */ + if (cfg & BIT(1)) { + used_bits->data[1] |= + cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG); + used_bits->data[2] |= + cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | + STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | + STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); + used_bits->data[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK); + } + + if (cfg == STRTAB_STE_0_CFG_BYPASS) + used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); +} + +/* + * Figure out if we can do a hitless update of entry to become target. Returns a + * bit mask where 1 indicates that qword needs to be set disruptively. + * unused_update is an intermediate value of entry that has unused bits set to + * their new values. + */ +static u8 arm_smmu_entry_qword_diff(const struct arm_smmu_ste *entry, + const struct arm_smmu_ste *target, + struct arm_smmu_ste *unused_update) +{ + struct arm_smmu_ste target_used = {}; + struct arm_smmu_ste cur_used = {}; + u8 used_qword_diff = 0; + unsigned int i; + + arm_smmu_get_ste_used(entry, &cur_used); + arm_smmu_get_ste_used(target, &target_used); + + for (i = 0; i != ARRAY_SIZE(target_used.data); i++) { + /* + * Check that masks are up to date, the make functions are not + * allowed to set a bit to 1 if the used function doesn't say it + * is used. + */ + WARN_ON_ONCE(target->data[i] & ~target_used.data[i]); + + /* Bits can change because they are not currently being used */ + unused_update->data[i] = (entry->data[i] & cur_used.data[i]) | + (target->data[i] & ~cur_used.data[i]); + /* + * Each bit indicates that a used bit in a qword needs to be + * changed after unused_update is applied. + */ + if ((unused_update->data[i] & target_used.data[i]) != + target->data[i]) + used_qword_diff |= 1 << i; + } + return used_qword_diff; +} + +static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid, + struct arm_smmu_ste *entry, + const struct arm_smmu_ste *target, unsigned int start, + unsigned int len) +{ + bool changed = false; + unsigned int i; + + for (i = start; len != 0; len--, i++) { + if (entry->data[i] != target->data[i]) { + WRITE_ONCE(entry->data[i], target->data[i]); + changed = true; + } + } + + if (changed) + arm_smmu_sync_ste_for_sid(smmu, sid); + return changed; +} + +/* + * Update the STE/CD to the target configuration. The transition from the + * current entry to the target entry takes place over multiple steps that + * attempts to make the transition hitless if possible. This function takes care + * not to create a situation where the HW can perceive a corrupted entry. HW is + * only required to have a 64 bit atomicity with stores from the CPU, while + * entries are many 64 bit values big. 
+ * + * The difference between the current value and the target value is analyzed to + * determine which of three updates are required - disruptive, hitless or no + * change. + * + * In the most general disruptive case we can make any update in three steps: + * - Disrupting the entry (V=0) + * - Fill now unused qwords, execpt qword 0 which contains V + * - Make qword 0 have the final value and valid (V=1) with a single 64 + * bit store + * + * However this disrupts the HW while it is happening. There are several + * interesting cases where a STE/CD can be updated without disturbing the HW + * because only a small number of bits are changing (S1DSS, CONFIG, etc) or + * because the used bits don't intersect. We can detect this by calculating how + * many 64 bit values need update after adjusting the unused bits and skip the + * V=0 process. This relies on the IGNORED behavior described in the + * specification. + */ +static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, + struct arm_smmu_ste *entry, + const struct arm_smmu_ste *target) +{ + unsigned int num_entry_qwords = ARRAY_SIZE(target->data); + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_ste unused_update; + u8 used_qword_diff; + + used_qword_diff = + arm_smmu_entry_qword_diff(entry, target, &unused_update); + if (hweight8(used_qword_diff) == 1) { + /* + * Only one qword needs its used bits to be changed. This is a + * hitless update, update all bits the current STE is ignoring + * to their new values, then update a single "critical qword" to + * change the STE and finally 0 out any bits that are now unused + * in the target configuration. + */ + unsigned int critical_qword_index = ffs(used_qword_diff) - 1; + + /* + * Skip writing unused bits in the critical qword since we'll be + * writing it in the next step anyways. This can save a sync + * when the only change is in that qword. + */ + unused_update.data[critical_qword_index] = + entry->data[critical_qword_index]; + entry_set(smmu, sid, entry, &unused_update, 0, num_entry_qwords); + entry_set(smmu, sid, entry, target, critical_qword_index, 1); + entry_set(smmu, sid, entry, target, 0, num_entry_qwords); + } else if (used_qword_diff) { + /* + * At least two qwords need their inuse bits to be changed. This + * requires a breaking update, zero the V bit, write all qwords + * but 0, then set qword 0 + */ + unused_update.data[0] = entry->data[0] & (~STRTAB_STE_0_V); + entry_set(smmu, sid, entry, &unused_update, 0, 1); + entry_set(smmu, sid, entry, target, 1, num_entry_qwords - 1); + entry_set(smmu, sid, entry, target, 0, 1); + } else { + /* + * No inuse bit changed. Sanity check that all unused bits are 0 + * in the entry. The target was already sanity checked by + * compute_qword_diff(). 
+ */ + WARN_ON_ONCE( + entry_set(smmu, sid, entry, target, 0, num_entry_qwords)); + } + + /* It's likely that we'll want to use the new STE soon */ + if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) { + struct arm_smmu_cmdq_ent + prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG, + .prefetch = { + .sid = sid, + } }; + + arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); + } +} + static void arm_smmu_sync_cd(struct arm_smmu_master *master, int ssid, bool leaf) { @@ -1249,34 +1445,12 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, struct arm_smmu_ste *dst) { - /* - * This is hideously complicated, but we only really care about - * three cases at the moment: - * - * 1. Invalid (all zero) -> bypass/fault (init) - * 2. Bypass/fault -> translation/bypass (attach) - * 3. Translation/bypass -> bypass/fault (detach) - * - * Given that we can't update the STE atomically and the SMMU - * doesn't read the thing in a defined order, that leaves us - * with the following maintenance requirements: - * - * 1. Update Config, return (init time STEs aren't live) - * 2. Write everything apart from dword 0, sync, write dword 0, sync - * 3. Update Config, sync - */ - u64 val = le64_to_cpu(dst->data[0]); - bool ste_live = false; + u64 val; struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = NULL; struct arm_smmu_s2_cfg *s2_cfg = NULL; struct arm_smmu_domain *smmu_domain = master->domain; - struct arm_smmu_cmdq_ent prefetch_cmd = { - .opcode = CMDQ_OP_PREFETCH_CFG, - .prefetch = { - .sid = sid, - }, - }; + struct arm_smmu_ste target = {}; if (smmu_domain) { switch (smmu_domain->stage) { @@ -1291,22 +1465,6 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, } } - if (val & STRTAB_STE_0_V) { - switch (FIELD_GET(STRTAB_STE_0_CFG, val)) { - case STRTAB_STE_0_CFG_BYPASS: - break; - case STRTAB_STE_0_CFG_S1_TRANS: - case STRTAB_STE_0_CFG_S2_TRANS: - ste_live = true; - break; - case STRTAB_STE_0_CFG_ABORT: - BUG_ON(!disable_bypass); - break; - default: - BUG(); /* STE corruption */ - } - } - /* Nuke the existing STE_0 value, as we're going to rewrite it */ val = STRTAB_STE_0_V; @@ -1317,16 +1475,11 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, else val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); - dst->data[0] = cpu_to_le64(val); - dst->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + target.data[0] = cpu_to_le64(val); + target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - dst->data[2] = 0; /* Nuke the VMID */ - /* - * The SMMU can perform negative caching, so we must sync - * the STE regardless of whether the old value was live. - */ - if (smmu) - arm_smmu_sync_ste_for_sid(smmu, sid); + target.data[2] = 0; /* Nuke the VMID */ + arm_smmu_write_ste(master, sid, dst, &target); return; } @@ -1334,8 +1487,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? 
STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; - BUG_ON(ste_live); - dst->data[1] = cpu_to_le64( + target.data[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | @@ -1344,7 +1496,7 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, if (smmu->features & ARM_SMMU_FEAT_STALLS && !master->stall_enabled) - dst->data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); + target.data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | @@ -1353,8 +1505,9 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, } if (s2_cfg) { - BUG_ON(ste_live); - dst->data[2] = cpu_to_le64( + target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); + target.data[2] = cpu_to_le64( FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | #ifdef __BIG_ENDIAN @@ -1363,23 +1516,17 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2R); - dst->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); + target.data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); } if (master->ats_enabled) - dst->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, + target.data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, STRTAB_STE_1_EATS_TRANS)); - arm_smmu_sync_ste_for_sid(smmu, sid); - /* See comment in arm_smmu_write_ctx_desc() */ - WRITE_ONCE(dst->data[0], cpu_to_le64(val)); - arm_smmu_sync_ste_for_sid(smmu, sid); - - /* It's likely that we'll want to use the new STE soon */ - if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) - arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); + target.data[0] = cpu_to_le64(val); + arm_smmu_write_ste(master, sid, dst, &target); } static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab, From 92cbb5a498a5eeca914426fcf9e76b851c5ac8b0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:13 -0400 Subject: [PATCH 100/352] iommu/arm-smmu-v3: Consolidate the STE generation for abort/bypass This allows writing the flow of arm_smmu_write_strtab_ent() around abort and bypass domains more naturally. Note that the core code no longer supplies NULL domains, though there is still a flow in the driver that end up in arm_smmu_write_strtab_ent() with NULL. A later patch will remove it. Remove the duplicate calculation of the STE in arm_smmu_init_bypass_stes() and remove the force parameter. arm_smmu_rmr_install_bypass_ste() can now simply invoke arm_smmu_make_bypass_ste() directly. Rename arm_smmu_init_bypass_stes() to arm_smmu_init_initial_stes() to better reflect its purpose. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 7686aa5f8d61388eeaa64730363ffc8df20a481d) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 97 ++++++++++++--------- 1 file changed, 55 insertions(+), 42 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 050ef96129bb4..34e1d83fff478 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1442,6 +1442,24 @@ static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } +static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) +{ + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT)); +} + +static void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target) +{ + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS)); + target->data[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); +} + static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, struct arm_smmu_ste *dst) { @@ -1452,37 +1470,31 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, struct arm_smmu_domain *smmu_domain = master->domain; struct arm_smmu_ste target = {}; - if (smmu_domain) { - switch (smmu_domain->stage) { - case ARM_SMMU_DOMAIN_S1: - cd_table = &master->cd_table; - break; - case ARM_SMMU_DOMAIN_S2: - s2_cfg = &smmu_domain->s2_cfg; - break; - default: - break; - } - } - - /* Nuke the existing STE_0 value, as we're going to rewrite it */ - val = STRTAB_STE_0_V; - - /* Bypass/fault */ - if (!smmu_domain || !(cd_table || s2_cfg)) { - if (!smmu_domain && disable_bypass) - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); + if (!smmu_domain) { + if (disable_bypass) + arm_smmu_make_abort_ste(&target); else - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); + arm_smmu_make_bypass_ste(&target); + arm_smmu_write_ste(master, sid, dst, &target); + return; + } - target.data[0] = cpu_to_le64(val); - target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, - STRTAB_STE_1_SHCFG_INCOMING)); - target.data[2] = 0; /* Nuke the VMID */ + switch (smmu_domain->stage) { + case ARM_SMMU_DOMAIN_S1: + cd_table = &master->cd_table; + break; + case ARM_SMMU_DOMAIN_S2: + s2_cfg = &smmu_domain->s2_cfg; + break; + case ARM_SMMU_DOMAIN_BYPASS: + arm_smmu_make_bypass_ste(&target); arm_smmu_write_ste(master, sid, dst, &target); return; } + /* Nuke the existing STE_0 value, as we're going to rewrite it */ + val = STRTAB_STE_0_V; + if (cd_table) { u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; @@ -1529,22 +1541,20 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, arm_smmu_write_ste(master, sid, dst, &target); } -static void arm_smmu_init_bypass_stes(struct arm_smmu_ste *strtab, - unsigned int nent, bool force) +/* + * This can safely directly manipulate the STE memory without a sync sequence + * because the STE table has not been installed in the SMMU yet. 
+ */ +static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, + unsigned int nent) { unsigned int i; - u64 val = STRTAB_STE_0_V; - - if (disable_bypass && !force) - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT); - else - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS); for (i = 0; i < nent; ++i) { - strtab->data[0] = cpu_to_le64(val); - strtab->data[1] = cpu_to_le64(FIELD_PREP( - STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); - strtab->data[2] = 0; + if (disable_bypass) + arm_smmu_make_abort_ste(strtab); + else + arm_smmu_make_bypass_ste(strtab); strtab++; } } @@ -1572,7 +1582,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return -ENOMEM; } - arm_smmu_init_bypass_stes(desc->l2ptr, 1 << STRTAB_SPLIT, false); + arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); arm_smmu_write_strtab_l1_desc(strtab, desc); return 0; } @@ -3165,7 +3175,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); cfg->strtab_base_cfg = reg; - arm_smmu_init_bypass_stes(strtab, cfg->num_l1_ents, false); + arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents); return 0; } @@ -3876,7 +3886,6 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) iort_get_rmr_sids(dev_fwnode(smmu->dev), &rmr_list); list_for_each_entry(e, &rmr_list, list) { - struct arm_smmu_ste *step; struct iommu_iort_rmr_data *rmr; int ret, i; @@ -3889,8 +3898,12 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) continue; } - step = arm_smmu_get_step_for_sid(smmu, rmr->sids[i]); - arm_smmu_init_bypass_stes(step, 1, true); + /* + * STE table is not programmed to HW, see + * arm_smmu_initial_bypass_stes() + */ + arm_smmu_make_bypass_ste( + arm_smmu_get_step_for_sid(smmu, rmr->sids[i])); } } From b1ffdc4dc03c5eced7b18985893eea90a9c03945 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:14 -0400 Subject: [PATCH 101/352] iommu/arm-smmu-v3: Move the STE generation for S1 and S2 domains into functions This is preparation to move the STE calculation higher up in to the call chain and remove arm_smmu_write_strtab_ent(). These new functions will be called directly from attach_dev. Reviewed-by: Moritz Fischer Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit efe15df08727d483bd247ff905a828f0de955de6) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 138 ++++++++++++-------- 1 file changed, 83 insertions(+), 55 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 34e1d83fff478..8f826da7036ca 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1460,13 +1460,89 @@ static void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target) FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); } +static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master) +{ + struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; + struct arm_smmu_device *smmu = master->smmu; + + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | + FIELD_PREP(STRTAB_STE_0_S1FMT, cd_table->s1fmt) | + (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | + FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax)); + + target->data[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | + FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | + FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | + FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | + ((smmu->features & ARM_SMMU_FEAT_STALLS && + !master->stall_enabled) ? + STRTAB_STE_1_S1STALLD : + 0) | + FIELD_PREP(STRTAB_STE_1_EATS, + master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + + if (smmu->features & ARM_SMMU_FEAT_E2H) { + /* + * To support BTM the streamworld needs to match the + * configuration of the CPU so that the ASID broadcasts are + * properly matched. This means either S/NS-EL2-E2H (hypervisor) + * or NS-EL1 (guest). Since an SVA domain can be installed in a + * PASID this should always use a BTM compatible configuration + * if the HW supports it. + */ + target->data[1] |= cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_STRW, STRTAB_STE_1_STRW_EL2)); + } else { + target->data[1] |= cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_STRW, STRTAB_STE_1_STRW_NSEL1)); + + /* + * VMID 0 is reserved for stage-2 bypass EL1 STEs, see + * arm_smmu_domain_alloc_id() + */ + target->data[2] = + cpu_to_le64(FIELD_PREP(STRTAB_STE_2_S2VMID, 0)); + } +} + +static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain) +{ + struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg; + + memset(target, 0, sizeof(*target)); + target->data[0] = cpu_to_le64( + STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS)); + + target->data[1] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_1_EATS, + master->ats_enabled ? 
STRTAB_STE_1_EATS_TRANS : 0) | + FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); + + target->data[2] = cpu_to_le64( + FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | + FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | + STRTAB_STE_2_S2AA64 | +#ifdef __BIG_ENDIAN + STRTAB_STE_2_S2ENDI | +#endif + STRTAB_STE_2_S2PTW | + STRTAB_STE_2_S2R); + + target->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); +} + static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, struct arm_smmu_ste *dst) { - u64 val; - struct arm_smmu_device *smmu = master->smmu; - struct arm_smmu_ctx_desc_cfg *cd_table = NULL; - struct arm_smmu_s2_cfg *s2_cfg = NULL; struct arm_smmu_domain *smmu_domain = master->domain; struct arm_smmu_ste target = {}; @@ -1481,63 +1557,15 @@ static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: - cd_table = &master->cd_table; + arm_smmu_make_cdtable_ste(&target, master); break; case ARM_SMMU_DOMAIN_S2: - s2_cfg = &smmu_domain->s2_cfg; + arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); break; case ARM_SMMU_DOMAIN_BYPASS: arm_smmu_make_bypass_ste(&target); - arm_smmu_write_ste(master, sid, dst, &target); - return; - } - - /* Nuke the existing STE_0 value, as we're going to rewrite it */ - val = STRTAB_STE_0_V; - - if (cd_table) { - u64 strw = smmu->features & ARM_SMMU_FEAT_E2H ? - STRTAB_STE_1_STRW_EL2 : STRTAB_STE_1_STRW_NSEL1; - - target.data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | - FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | - FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | - FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | - FIELD_PREP(STRTAB_STE_1_STRW, strw)); - - if (smmu->features & ARM_SMMU_FEAT_STALLS && - !master->stall_enabled) - target.data[1] |= cpu_to_le64(STRTAB_STE_1_S1STALLD); - - val |= (cd_table->cdtab_dma & STRTAB_STE_0_S1CTXPTR_MASK) | - FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S1_TRANS) | - FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax) | - FIELD_PREP(STRTAB_STE_0_S1FMT, cd_table->s1fmt); - } - - if (s2_cfg) { - target.data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, - STRTAB_STE_1_SHCFG_INCOMING)); - target.data[2] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | - FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | -#ifdef __BIG_ENDIAN - STRTAB_STE_2_S2ENDI | -#endif - STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2AA64 | - STRTAB_STE_2_S2R); - - target.data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); - - val |= FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_S2_TRANS); + break; } - - if (master->ats_enabled) - target.data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_EATS, - STRTAB_STE_1_EATS_TRANS)); - - target.data[0] = cpu_to_le64(val); arm_smmu_write_ste(master, sid, dst, &target); } From 4d8181f992d59b4515def8c76d434c20704bb63d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:15 -0400 Subject: [PATCH 102/352] iommu/arm-smmu-v3: Build the whole STE in arm_smmu_make_s2_domain_ste() Half the code was living in arm_smmu_domain_finalise_s2(), just move it here and take the values directly from the pgtbl_ops instead of storing copies. 
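The diff below leans heavily on the FIELD_PREP()/GENMASK() idiom for
building the VTCR field of the STE directly from the page-table
configuration. As a minimal standalone illustration (the HYP_* masks are
invented for this sketch, not the driver's definitions):

#include <linux/bitfield.h>
#include <linux/bits.h>
#include <linux/types.h>

/* Sketch: packing sub-fields into a 64-bit table entry. GENMASK_ULL()
 * names a bit span; FIELD_PREP() shifts a value into that span. */
#define HYP_VTCR_TSZ	GENMASK_ULL(5, 0)
#define HYP_VTCR_SL	GENMASK_ULL(7, 6)

static u64 hyp_pack_vtcr(u8 tsz, u8 sl)
{
	return FIELD_PREP(HYP_VTCR_TSZ, tsz) | FIELD_PREP(HYP_VTCR_SL, sl);
}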
Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 71b0aa10b18dd589a899449457e5ab7c1da00d01) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 27 ++++++++++++--------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 -- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8f826da7036ca..bb81c881b722a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1515,6 +1515,11 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, struct arm_smmu_domain *smmu_domain) { struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg; + const struct io_pgtable_cfg *pgtbl_cfg = + &io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops)->cfg; + typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr = + &pgtbl_cfg->arm_lpae_s2_cfg.vtcr; + u64 vtcr_val; memset(target, 0, sizeof(*target)); target->data[0] = cpu_to_le64( @@ -1527,9 +1532,16 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); + vtcr_val = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, vtcr->irgn) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, vtcr->orgn) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, vtcr->sh) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, vtcr->tg) | + FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, vtcr->ps); target->data[2] = cpu_to_le64( FIELD_PREP(STRTAB_STE_2_S2VMID, s2_cfg->vmid) | - FIELD_PREP(STRTAB_STE_2_VTCR, s2_cfg->vtcr) | + FIELD_PREP(STRTAB_STE_2_VTCR, vtcr_val) | STRTAB_STE_2_S2AA64 | #ifdef __BIG_ENDIAN STRTAB_STE_2_S2ENDI | @@ -1537,7 +1549,8 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); - target->data[3] = cpu_to_le64(s2_cfg->vttbr & STRTAB_STE_3_S2TTB_MASK); + target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s2_cfg.vttbr & + STRTAB_STE_3_S2TTB_MASK); } static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, @@ -2267,7 +2280,6 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, int vmid; struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; - typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr; /* Reserve VMID 0 for stage-2 bypass STEs */ vmid = ida_alloc_range(&smmu->vmid_map, 1, (1 << smmu->vmid_bits) - 1, @@ -2275,16 +2287,7 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, if (vmid < 0) return vmid; - vtcr = &pgtbl_cfg->arm_lpae_s2_cfg.vtcr; cfg->vmid = (u16)vmid; - cfg->vttbr = pgtbl_cfg->arm_lpae_s2_cfg.vttbr; - cfg->vtcr = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2IR0, vtcr->irgn) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2OR0, vtcr->orgn) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2SH0, vtcr->sh) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2TG, vtcr->tg) | - FIELD_PREP(STRTAB_STE_2_VTCR_S2PS, vtcr->ps); return 0; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 65fb388d51734..eb669121f1954 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -609,8 +609,6 @@ struct arm_smmu_ctx_desc_cfg { struct arm_smmu_s2_cfg { u16 vmid; - u64 vttbr; - u64 vtcr; }; struct arm_smmu_strtab_cfg { From 0ca8fca6c649123f1cdcafbe2e3995bab0a1bfbb Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:17 -0400 Subject: [PATCH 103/352] iommu/arm-smmu-v3: Compute the STE only once for each master Currently arm_smmu_install_ste_for_dev() iterates over every SID and computes from scratch an identical STE. Every SID should have the same STE contents. Turn this inside out so that the STE is supplied by the caller and arm_smmu_install_ste_for_dev() simply installs it to every SID. This is possible now that the STE generation does not inform what sequence should be used to program it. This allows splitting the STE calculation up according to the call site, which following patches will make use of, and removes the confusing NULL domain special case that only supported arm_smmu_detach_dev(). Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 65547275d76965c3106fbcd4a9244242eb88224c) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 57 ++++++++------------- 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index bb81c881b722a..a67491d476a21 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1553,35 +1553,6 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, STRTAB_STE_3_S2TTB_MASK); } -static void arm_smmu_write_strtab_ent(struct arm_smmu_master *master, u32 sid, - struct arm_smmu_ste *dst) -{ - struct arm_smmu_domain *smmu_domain = master->domain; - struct arm_smmu_ste target = {}; - - if (!smmu_domain) { - if (disable_bypass) - arm_smmu_make_abort_ste(&target); - else - arm_smmu_make_bypass_ste(&target); - arm_smmu_write_ste(master, sid, dst, &target); - return; - } - - switch (smmu_domain->stage) { - case ARM_SMMU_DOMAIN_S1: - arm_smmu_make_cdtable_ste(&target, master); - break; - case ARM_SMMU_DOMAIN_S2: - arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); - break; - case ARM_SMMU_DOMAIN_BYPASS: - arm_smmu_make_bypass_ste(&target); - break; - } - arm_smmu_write_ste(master, sid, dst, &target); -} - /* * This can safely directly manipulate the STE memory without a sync sequence * because the STE table has not been installed in the SMMU yet. 
@@ -2378,7 +2349,8 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) } } -static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) +static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, + const struct arm_smmu_ste *target) { int i, j; struct arm_smmu_device *smmu = master->smmu; @@ -2395,7 +2367,7 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master) if (j < i) continue; - arm_smmu_write_strtab_ent(master, sid, step); + arm_smmu_write_ste(master, sid, step, target); } } @@ -2502,6 +2474,7 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static void arm_smmu_detach_dev(struct arm_smmu_master *master) { unsigned long flags; + struct arm_smmu_ste target; struct arm_smmu_domain *smmu_domain = master->domain; if (!smmu_domain) @@ -2515,7 +2488,11 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) master->domain = NULL; master->ats_enabled = false; - arm_smmu_install_ste_for_dev(master); + if (disable_bypass) + arm_smmu_make_abort_ste(&target); + else + arm_smmu_make_bypass_ste(&target); + arm_smmu_install_ste_for_dev(master, &target); /* * Clearing the CD entry isn't strictly required to detach the domain * since the table is uninstalled anyway, but it helps avoid confusion @@ -2530,6 +2507,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret = 0; unsigned long flags; + struct arm_smmu_ste target; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); @@ -2591,7 +2569,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) list_add(&master->domain_head, &smmu_domain->devices); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { + switch (smmu_domain->stage) { + case ARM_SMMU_DOMAIN_S1: if (!master->cd_table.cdtab) { ret = arm_smmu_alloc_cd_tables(master); if (ret) { @@ -2605,9 +2584,17 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) master->domain = NULL; goto out_list_del; } - } - arm_smmu_install_ste_for_dev(master); + arm_smmu_make_cdtable_ste(&target, master); + break; + case ARM_SMMU_DOMAIN_S2: + arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); + break; + case ARM_SMMU_DOMAIN_BYPASS: + arm_smmu_make_bypass_ste(&target); + break; + } + arm_smmu_install_ste_for_dev(master, &target); arm_smmu_enable_ats(master); goto out_unlock; From 8627c37b0132e0b64424effc3b065033c17278a1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:18 -0400 Subject: [PATCH 104/352] iommu/arm-smmu-v3: Do not change the STE twice during arm_smmu_attach_dev() This was needed because the STE code required the STE to be in ABORT/BYPASS in order to program a cdtable or S2 STE. Now that the STE code can automatically handle all transitions, we can remove this step from the attach_dev flow. A few small bugs exist because of this: 1) If the core code does BLOCKED -> UNMANAGED with disable_bypass=false then there will be a moment where the STE points at BYPASS. Since this can be done by VFIO/IOMMUFD it is a small security race. 2) If the core code does IDENTITY -> DMA then any IOMMU_RESV_DIRECT regions will temporarily become BLOCKED. We'd like drivers to work in a way that allows IOMMU_RESV_DIRECT to be continuously functional during these transitions.
Make arm_smmu_release_device() put the STE back to the correct ABORT/BYPASS setting. Fix a bug where an IOMMU_RESV_DIRECT was ignored on this path. As noted before, the reordering of the linked list/STE/CD changes is OK against concurrent arm_smmu_share_asid() because of the arm_smmu_asid_lock. Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 8c73c32c83ce7f3c31864cb044abd3daefacd996) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index a67491d476a21..65e4d47c6c33d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2474,7 +2474,6 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static void arm_smmu_detach_dev(struct arm_smmu_master *master) { unsigned long flags; - struct arm_smmu_ste target; struct arm_smmu_domain *smmu_domain = master->domain; if (!smmu_domain) @@ -2488,11 +2487,6 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) master->domain = NULL; master->ats_enabled = false; - if (disable_bypass) - arm_smmu_make_abort_ste(&target); - else - arm_smmu_make_bypass_ste(&target); - arm_smmu_install_ste_for_dev(master, &target); /* * Clearing the CD entry isn't strictly required to detach the domain * since the table is uninstalled anyway, but it helps avoid confusion @@ -2840,9 +2834,18 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) static void arm_smmu_release_device(struct device *dev) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_ste target; if (WARN_ON(arm_smmu_master_sva_enabled(master))) iopf_queue_remove_device(master->smmu->evtq.iopf, dev); + + /* Put the STE back to what arm_smmu_init_strtab() sets */ + if (disable_bypass && !dev->iommu->require_direct) + arm_smmu_make_abort_ste(&target); + else + arm_smmu_make_bypass_ste(&target); + arm_smmu_install_ste_for_dev(master, &target); + arm_smmu_detach_dev(master); arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); From 6c61624e24ed16d1857de92c7b15e0452b153629 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:19 -0400 Subject: [PATCH 105/352] iommu/arm-smmu-v3: Put writing the context descriptor in the right order Get closer to the IOMMU API ideal that changes between domains can be hitless. The ordering for the CD table entry is not entirely clean from this perspective. When switching away from an STE with a CD table programmed in it, we should write the new STE first, then clear any old data in the CD entry. If we are programming a CD table for the first time to an STE then the CD entry should be programmed before the STE is loaded. If we are replacing a CD table entry when the STE already points at the CD entry then we just need to do the make/break sequence. Lift this code out of arm_smmu_detach_dev() so it can all be sequenced properly. The only other caller is arm_smmu_release_device() and it is going to free the cdtable anyhow, so it doesn't matter what is in it.
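For intuition, the ordering contract described above can be sketched as a toy, self-contained C program; the type and function names here are invented for illustration and are not the kernel API:

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for the real tables; fields are illustrative only. */
struct toy_cd  { bool valid; };                    /* context descriptor */
struct toy_ste { bool valid; struct toy_cd *cd; }; /* stream table entry */

/* First-time install: make the CD entry valid *before* the STE loads it. */
static void install_first_time(struct toy_ste *ste, struct toy_cd *cd)
{
	cd->valid = true;   /* step 1: CD entry becomes valid              */
	ste->cd = cd;       /* step 2: only now may the STE reference it   */
	ste->valid = true;
}

/* Switching away: retarget the STE first, then scrub the stale CD entry. */
static void switch_away(struct toy_ste *ste, struct toy_cd *old_cd)
{
	ste->cd = NULL;        /* step 1: STE no longer references the CD  */
	old_cd->valid = false; /* step 2: clearing old data is now safe    */
}

int main(void)
{
	struct toy_cd cd = { .valid = false };
	struct toy_ste ste = { .valid = false, .cd = NULL };

	install_first_time(&ste, &cd);
	printf("attached: ste.valid=%d cd.valid=%d\n", ste.valid, cd.valid);
	switch_away(&ste, &cd);
	printf("detached: ste.valid=%d cd.valid=%d\n", ste.valid, cd.valid);
	return 0;
}

The point of the asymmetry is that the STE must never reference a CD entry that is not yet, or no longer, valid.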
Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit d2e053d73247b68144c7f44d002ebf56acaf2d48) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 29 ++++++++++++++------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 65e4d47c6c33d..16063f60ee51c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2487,14 +2487,6 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) master->domain = NULL; master->ats_enabled = false; - /* - * Clearing the CD entry isn't strictly required to detach the domain - * since the table is uninstalled anyway, but it helps avoid confusion - * in the call to arm_smmu_write_ctx_desc on the next attach (which - * expects the entry to be empty). - */ - if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1 && master->cd_table.cdtab) - arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) @@ -2571,6 +2563,17 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) master->domain = NULL; goto out_list_del; } + } else { + /* + * arm_smmu_write_ctx_desc() relies on the entry being + * invalid to work, clear any existing entry. + */ + ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, + NULL); + if (ret) { + master->domain = NULL; + goto out_list_del; + } } ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, &smmu_domain->cd); @@ -2580,15 +2583,23 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } arm_smmu_make_cdtable_ste(&target, master); + arm_smmu_install_ste_for_dev(master, &target); break; case ARM_SMMU_DOMAIN_S2: arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); + arm_smmu_install_ste_for_dev(master, &target); + if (master->cd_table.cdtab) + arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, + NULL); break; case ARM_SMMU_DOMAIN_BYPASS: arm_smmu_make_bypass_ste(&target); + arm_smmu_install_ste_for_dev(master, &target); + if (master->cd_table.cdtab) + arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, + NULL); break; } - arm_smmu_install_ste_for_dev(master, &target); arm_smmu_enable_ats(master); goto out_unlock; From de4ae82b5030e630745a144fb3fcff16d51c8ce2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:20 -0400 Subject: [PATCH 106/352] iommu/arm-smmu-v3: Pass smmu_domain to arm_enable/disable_ats() The caller already has the domain, just pass it in. A following patch will remove master->domain. Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/9-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit d550ddc5b789f258cb5ce3bfe74af6d5383589b5) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 16063f60ee51c..64f9a6927f52e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2386,12 +2386,12 @@ static bool arm_smmu_ats_supported(struct arm_smmu_master *master) return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)); } -static void arm_smmu_enable_ats(struct arm_smmu_master *master) +static void arm_smmu_enable_ats(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain) { size_t stu; struct pci_dev *pdev; struct arm_smmu_device *smmu = master->smmu; - struct arm_smmu_domain *smmu_domain = master->domain; /* Don't enable ATS at the endpoint if it's not enabled in the STE */ if (!master->ats_enabled) @@ -2407,10 +2407,9 @@ static void arm_smmu_enable_ats(struct arm_smmu_master *master) dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); } -static void arm_smmu_disable_ats(struct arm_smmu_master *master) +static void arm_smmu_disable_ats(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain) { - struct arm_smmu_domain *smmu_domain = master->domain; - if (!master->ats_enabled) return; @@ -2479,7 +2478,7 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) if (!smmu_domain) return; - arm_smmu_disable_ats(master); + arm_smmu_disable_ats(master, smmu_domain); spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_del(&master->domain_head); @@ -2601,7 +2600,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) break; } - arm_smmu_enable_ats(master); + arm_smmu_enable_ats(master, smmu_domain); goto out_unlock; out_list_del: From c48005364e7639185edf7cb657f9f38da5357811 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:21 -0400 Subject: [PATCH 107/352] iommu/arm-smmu-v3: Remove arm_smmu_master->domain Introducing global statics which are of type struct iommu_domain, not struct arm_smmu_domain makes it difficult to retain arm_smmu_master->domain, as it can no longer point to an IDENTITY or BLOCKED domain. The only place that uses the value is arm_smmu_detach_dev(). Change things to work like other drivers and call iommu_get_domain_for_dev() to obtain the current domain. The master->domain is subtly protecting the master->domain_head against being unused as only PAGING domains will set master->domain and only paging domains use the master->domain_head. To make it simple keep the master->domain_head initialized so that the list_del() logic just does nothing for attached non-PAGING domains. Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/10-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 1b50017d39f650d78a0066734d6fe05920a8c9e8) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 26 ++++++++------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 10 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 64f9a6927f52e..bc01ad9e4cc52 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2472,19 +2472,20 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static void arm_smmu_detach_dev(struct arm_smmu_master *master) { + struct iommu_domain *domain = iommu_get_domain_for_dev(master->dev); + struct arm_smmu_domain *smmu_domain; unsigned long flags; - struct arm_smmu_domain *smmu_domain = master->domain; - if (!smmu_domain) + if (!domain) return; + smmu_domain = to_smmu_domain(domain); arm_smmu_disable_ats(master, smmu_domain); spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_del(&master->domain_head); + list_del_init(&master->domain_head); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - master->domain = NULL; master->ats_enabled = false; } @@ -2538,8 +2539,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_detach_dev(master); - master->domain = smmu_domain; - /* * The SMMU does not support enabling ATS with bypass. When the STE is * in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests and @@ -2558,10 +2557,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) case ARM_SMMU_DOMAIN_S1: if (!master->cd_table.cdtab) { ret = arm_smmu_alloc_cd_tables(master); - if (ret) { - master->domain = NULL; + if (ret) goto out_list_del; - } } else { /* * arm_smmu_write_ctx_desc() relies on the entry being @@ -2569,17 +2566,13 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) */ ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); - if (ret) { - master->domain = NULL; + if (ret) goto out_list_del; - } } ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, &smmu_domain->cd); - if (ret) { - master->domain = NULL; + if (ret) goto out_list_del; - } arm_smmu_make_cdtable_ste(&target, master); arm_smmu_install_ste_for_dev(master, &target); @@ -2605,7 +2598,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) out_list_del: spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_del(&master->domain_head); + list_del_init(&master->domain_head); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); out_unlock: @@ -2806,6 +2799,7 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) master->dev = dev; master->smmu = smmu; INIT_LIST_HEAD(&master->bonds); + INIT_LIST_HEAD(&master->domain_head); dev_iommu_priv_set(dev, master); ret = arm_smmu_insert_master(smmu, master); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index eb669121f1954..6b63ea7dae72d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -695,7 +695,6 @@ struct arm_smmu_stream { struct arm_smmu_master { struct arm_smmu_device *smmu; struct device *dev; - struct arm_smmu_domain *domain; struct list_head domain_head; struct arm_smmu_stream *streams; /* Locked by the iommu core using the group mutex */ From 35b0ab57dc55f654796aed5af46574b10a0a51af Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:22 -0400 Subject: [PATCH 108/352] iommu/arm-smmu-v3: Check 
that the RID domain is S1 in SVA The SVA code only works if the RID domain is an S1 domain and has already installed the cdtable. Originally the check for this was in arm_smmu_sva_bind() but when the op was removed the test didn't get copied over to the new arm_smmu_sva_set_dev_pasid(). Without the test, wrong usage will usually hit a WARN_ON() in arm_smmu_write_ctx_desc() due to a missing ctx table. However, the next patches will change things so that an IDENTITY domain is not a struct arm_smmu_domain and this will get into memory corruption if the struct is wrongly cast. Fail in arm_smmu_sva_set_dev_pasid() if the STE does not have an S1, which is a proxy for the STE having a pointer to the CD table. Write it in a way that will be compatible with the next patches. Fixes: 386fa64fd52b ("arm-smmu-v3/sva: Add SVA domain support") Reported-by: Shameerali Kolothum Thodi Closes: https://lore.kernel.org/linux-iommu/2a828e481416405fb3a4cceb9e075a59@huawei.com/ Tested-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/11-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit ae91f6552c301e5e8569667e9d5440d5f75a90c4) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 874d1f977d90c..2cd433a9c8a0f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -363,7 +363,13 @@ static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, struct arm_smmu_bond *bond; struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_domain *smmu_domain; + + if (!(domain->type & __IOMMU_DOMAIN_PAGING)) + return -ENODEV; + smmu_domain = to_smmu_domain(domain); + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + return -ENODEV; if (!master || !master->sva_enabled) return -ENODEV; From 0e39932a4a1155d4571605b9be636f5025325c73 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:23 -0400 Subject: [PATCH 109/352] iommu/arm-smmu-v3: Add a global static IDENTITY domain Move to the new static global for identity domains. Move all the logic out of arm_smmu_attach_dev into an identity-only function. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/12-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 12dacfb5b938cdd90ced0109165eee9cb27061d9) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 82 +++++++++++++++------ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 58 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index bc01ad9e4cc52..94eb4bceb2e71 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2165,8 +2165,7 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) return arm_smmu_sva_domain_alloc(); if (type != IOMMU_DOMAIN_UNMANAGED && - type != IOMMU_DOMAIN_DMA && - type != IOMMU_DOMAIN_IDENTITY) + type != IOMMU_DOMAIN_DMA) return NULL; /* @@ -2274,11 +2273,6 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; - if (domain->type == IOMMU_DOMAIN_IDENTITY) { - smmu_domain->stage = ARM_SMMU_DOMAIN_BYPASS; - return 0; - } - /* Restrict the stage to what we can actually support */ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) smmu_domain->stage = ARM_SMMU_DOMAIN_S2; @@ -2476,7 +2470,7 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) struct arm_smmu_domain *smmu_domain; unsigned long flags; - if (!domain) + if (!domain || !(domain->type & __IOMMU_DOMAIN_PAGING)) return; smmu_domain = to_smmu_domain(domain); @@ -2539,15 +2533,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_detach_dev(master); - /* - * The SMMU does not support enabling ATS with bypass. When the STE is - * in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests and - * Translated transactions are denied as though ATS is disabled for the - * stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and - * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry). - */ - if (smmu_domain->stage != ARM_SMMU_DOMAIN_BYPASS) - master->ats_enabled = arm_smmu_ats_supported(master); + master->ats_enabled = arm_smmu_ats_supported(master); spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_add(&master->domain_head, &smmu_domain->devices); @@ -2584,13 +2570,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); break; - case ARM_SMMU_DOMAIN_BYPASS: - arm_smmu_make_bypass_ste(&target); - arm_smmu_install_ste_for_dev(master, &target); - if (master->cd_table.cdtab) - arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, - NULL); - break; } arm_smmu_enable_ats(master, smmu_domain); @@ -2606,6 +2585,60 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return ret; } +static int arm_smmu_attach_dev_ste(struct device *dev, + struct arm_smmu_ste *ste) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + + if (arm_smmu_master_sva_enabled(master)) + return -EBUSY; + + /* + * Do not allow any ASID to be changed while are working on the STE, + * otherwise we could miss invalidations. + */ + mutex_lock(&arm_smmu_asid_lock); + + /* + * The SMMU does not support enabling ATS with bypass/abort. When the + * STE is in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests + * and Translated transactions are denied as though ATS is disabled for + * the stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and + * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry). 
+ */ + arm_smmu_detach_dev(master); + + arm_smmu_install_ste_for_dev(master, ste); + mutex_unlock(&arm_smmu_asid_lock); + + /* + * This has to be done after removing the master from the + * arm_smmu_domain->devices to avoid races updating the same context + * descriptor from arm_smmu_share_asid(). + */ + if (master->cd_table.cdtab) + arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); + return 0; +} + +static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_ste ste; + + arm_smmu_make_bypass_ste(&ste); + return arm_smmu_attach_dev_ste(dev, &ste); +} + +static const struct iommu_domain_ops arm_smmu_identity_ops = { + .attach_dev = arm_smmu_attach_dev_identity, +}; + +static struct iommu_domain arm_smmu_identity_domain = { + .type = IOMMU_DOMAIN_IDENTITY, + .ops = &arm_smmu_identity_ops, +}; + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -2995,6 +3028,7 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) } static struct iommu_ops arm_smmu_ops = { + .identity_domain = &arm_smmu_identity_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 6b63ea7dae72d..23baf117e7e4b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -712,7 +712,6 @@ struct arm_smmu_master { enum arm_smmu_domain_stage { ARM_SMMU_DOMAIN_S1 = 0, ARM_SMMU_DOMAIN_S2, - ARM_SMMU_DOMAIN_BYPASS, }; struct arm_smmu_domain { From 8dd001fa4a2a27e7ad3f95c5b4db353ea4497ef9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:24 -0400 Subject: [PATCH 110/352] iommu/arm-smmu-v3: Add a global static BLOCKED domain Using the same design as the IDENTITY domain install an STRTAB_STE_0_CFG_ABORT STE. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/13-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 352bd64cd8288c5c6808735d52a75809dfef8635) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 94eb4bceb2e71..2799aaaa2a5a3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2639,6 +2639,24 @@ static struct iommu_domain arm_smmu_identity_domain = { .ops = &arm_smmu_identity_ops, }; +static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_ste ste; + + arm_smmu_make_abort_ste(&ste); + return arm_smmu_attach_dev_ste(dev, &ste); +} + +static const struct iommu_domain_ops arm_smmu_blocked_ops = { + .attach_dev = arm_smmu_attach_dev_blocked, +}; + +static struct iommu_domain arm_smmu_blocked_domain = { + .type = IOMMU_DOMAIN_BLOCKED, + .ops = &arm_smmu_blocked_ops, +}; + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -3029,6 +3047,7 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, + .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, .probe_device = arm_smmu_probe_device, From 363fdecb88d307e829f4a907414ea30cb9db8fbd Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:25 -0400 Subject: [PATCH 111/352] iommu/arm-smmu-v3: Use the identity/blocked domain during release Consolidate some more code by having release call arm_smmu_attach_dev_identity/blocked() instead of open coding this. Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/14-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit d36464f40f29c984984168ea89e08629bdba41df) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2799aaaa2a5a3..4c74e6a87fa2f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2889,19 +2889,16 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) static void arm_smmu_release_device(struct device *dev) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct arm_smmu_ste target; if (WARN_ON(arm_smmu_master_sva_enabled(master))) iopf_queue_remove_device(master->smmu->evtq.iopf, dev); /* Put the STE back to what arm_smmu_init_strtab() sets */ if (disable_bypass && !dev->iommu->require_direct) - arm_smmu_make_abort_ste(&target); + arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); else - arm_smmu_make_bypass_ste(&target); - arm_smmu_install_ste_for_dev(master, &target); + arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev); - arm_smmu_detach_dev(master); arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); if (master->cd_table.cdtab) From 01fb420a98b4e80a8b848eab246e1bf8f6fdad10 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:26 -0400 Subject: [PATCH 112/352] iommu/arm-smmu-v3: Pass arm_smmu_domain and arm_smmu_device to finalize Instead of putting container_of() casts in the internals, use the proper type in this call chain. This makes it easier to check that the two global static domains are not leaking into call chains they should not. Passing the smmu spares the only caller from having to set and unset it in the error path. Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/15-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit d8cd200609cf6a404cda73794f0c8c4fd74c568c) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 35 +++++++++++---------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 4c74e6a87fa2f..ef5258ad997f2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -88,6 +88,9 @@ static struct arm_smmu_option_prop arm_smmu_options[] = { { 0, NULL}, }; +static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_device *smmu); + static void parse_driver_options(struct arm_smmu_device *smmu) { int i = 0; @@ -2207,12 +2210,12 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) kfree(smmu_domain); } -static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain, +static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain, struct io_pgtable_cfg *pgtbl_cfg) { int ret; u32 asid; - struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = &pgtbl_cfg->arm_lpae_s1_cfg.tcr; @@ -2244,11 +2247,11 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_domain *smmu_domain, return ret; } -static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, +static int arm_smmu_domain_finalise_s2(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain, struct io_pgtable_cfg *pgtbl_cfg) { int vmid; - struct arm_smmu_device *smmu = smmu_domain->smmu; struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; /* Reserve VMID 0 for stage-2 bypass STEs */ @@ -2261,17 +2264,17 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_domain *smmu_domain, return 0; } -static int arm_smmu_domain_finalise(struct iommu_domain *domain) +static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_device *smmu) { int ret; unsigned long ias, oas; enum io_pgtable_fmt fmt; struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; - int (*finalise_stage_fn)(struct arm_smmu_domain *, - struct io_pgtable_cfg *); - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_device *smmu = smmu_domain->smmu; + int (*finalise_stage_fn)(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain, + struct io_pgtable_cfg *pgtbl_cfg); /* Restrict the stage to what we can actually support */ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) @@ -2310,17 +2313,18 @@ static int arm_smmu_domain_finalise(struct iommu_domain *domain) if (!pgtbl_ops) return -ENOMEM; - domain->pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; - domain->geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; - domain->geometry.force_aperture = true; + smmu_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; + smmu_domain->domain.geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; + smmu_domain->domain.geometry.force_aperture = true; - ret = finalise_stage_fn(smmu_domain, &pgtbl_cfg); + ret = finalise_stage_fn(smmu, smmu_domain, &pgtbl_cfg); if (ret < 0) { free_io_pgtable_ops(pgtbl_ops); return ret; } smmu_domain->pgtbl_ops = pgtbl_ops; + smmu_domain->smmu = smmu; return 0; } @@ -2512,10 +2516,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) mutex_lock(&smmu_domain->init_mutex); if (!smmu_domain->smmu) { - smmu_domain->smmu = smmu; - ret = arm_smmu_domain_finalise(domain); - if (ret) - smmu_domain->smmu = NULL; + ret = 
arm_smmu_domain_finalise(smmu_domain, smmu); } else if (smmu_domain->smmu != smmu) ret = -EINVAL; From 874013b2516dc76b4f7917d99b0b787bfc2ebae3 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Feb 2024 13:07:27 -0400 Subject: [PATCH 113/352] iommu/arm-smmu-v3: Convert to domain_alloc_paging() Now that the BLOCKED and IDENTITY behaviors are managed with their own domains, change to the domain_alloc_paging() op. For now SVA remains using the old interface; eventually it will get its own op that can pass in the device and mm_struct, which will let us have a sane lifetime for the mmu_notifier. Call arm_smmu_domain_finalise() early if dev is available. Tested-by: Shameer Kolothum Tested-by: Nicolin Chen Tested-by: Moritz Fischer Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/16-v6-96275f25c39d+2d4-smmuv3_newapi_p1_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 327e10b47ae99f76ac53f0b8b73a0539f390d2d2) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 22 ++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ef5258ad997f2..e5d097b622fba 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2162,14 +2162,15 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) { - struct arm_smmu_domain *smmu_domain; if (type == IOMMU_DOMAIN_SVA) return arm_smmu_sva_domain_alloc(); + return ERR_PTR(-EOPNOTSUPP); +} - if (type != IOMMU_DOMAIN_UNMANAGED && - type != IOMMU_DOMAIN_DMA) - return NULL; +static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) +{ + struct arm_smmu_domain *smmu_domain; /* * Allocate the domain and initialise some of its data structures. @@ -2178,13 +2179,23 @@ static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) */ smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL); if (!smmu_domain) - return NULL; + return ERR_PTR(-ENOMEM); mutex_init(&smmu_domain->init_mutex); INIT_LIST_HEAD(&smmu_domain->devices); spin_lock_init(&smmu_domain->devices_lock); INIT_LIST_HEAD(&smmu_domain->mmu_notifiers); + if (dev) { + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + int ret; + + ret = arm_smmu_domain_finalise(smmu_domain, master->smmu); + if (ret) { + kfree(smmu_domain); + return ERR_PTR(ret); + } + } return &smmu_domain->domain; } @@ -3048,6 +3059,7 @@ static struct iommu_ops arm_smmu_ops = { .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, .domain_alloc = arm_smmu_domain_alloc, + .domain_alloc_paging = arm_smmu_domain_alloc_paging, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .device_group = arm_smmu_device_group, From 0c5fcaf30a6a47d94358a0bf0084355144667052 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 15:40:24 +0100 Subject: [PATCH 114/352] iommu: constify pointer to bus_type Make pointer to bus_type a pointer to const for code safety. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20240216144027.185959-1-krzysztof.kozlowski@linaro.org Signed-off-by: Joerg Roedel (cherry picked from commit e70e9ecd7cb349d00736c594b8e9bada0a762238) Signed-off-by: Matthew R.
Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommu-priv.h | 5 +++-- drivers/iommu/iommu.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index 2024a23133486..5f731d994803c 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -21,10 +21,11 @@ int iommu_group_replace_domain(struct iommu_group *group, struct iommu_domain *new_domain); int iommu_device_register_bus(struct iommu_device *iommu, - const struct iommu_ops *ops, struct bus_type *bus, + const struct iommu_ops *ops, + const struct bus_type *bus, struct notifier_block *nb); void iommu_device_unregister_bus(struct iommu_device *iommu, - struct bus_type *bus, + const struct bus_type *bus, struct notifier_block *nb); #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9b94c973670b2..15d97aabf11e3 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(iommu_device_unregister); #if IS_ENABLED(CONFIG_IOMMUFD_TEST) void iommu_device_unregister_bus(struct iommu_device *iommu, - struct bus_type *bus, + const struct bus_type *bus, struct notifier_block *nb) { bus_unregister_notifier(bus, nb); @@ -303,7 +303,8 @@ EXPORT_SYMBOL_GPL(iommu_device_unregister_bus); * some memory to hold a notifier_block. */ int iommu_device_register_bus(struct iommu_device *iommu, - const struct iommu_ops *ops, struct bus_type *bus, + const struct iommu_ops *ops, + const struct bus_type *bus, struct notifier_block *nb) { int err; From 4b39f0a3d731256619e691796b16f48ab2e95841 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 15:40:25 +0100 Subject: [PATCH 115/352] iommu: constify of_phandle_args in xlate The xlate callbacks are supposed to translate of_phandle_args to proper provider without modifying the of_phandle_args. Make the argument pointer to const for code safety and readability. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240216144027.185959-2-krzysztof.kozlowski@linaro.org Signed-off-by: Joerg Roedel (cherry picked from commit b42a905b6aad40c092cf17f4b295a4c389bc7206) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/apple-dart.c | 3 ++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 ++- drivers/iommu/arm/arm-smmu/arm-smmu.c | 3 ++- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 3 ++- drivers/iommu/exynos-iommu.c | 2 +- drivers/iommu/iommu.c | 2 +- drivers/iommu/ipmmu-vmsa.c | 4 ++-- drivers/iommu/msm_iommu.c | 4 ++-- drivers/iommu/mtk_iommu.c | 3 ++- drivers/iommu/mtk_iommu_v1.c | 3 ++- drivers/iommu/rockchip-iommu.c | 2 +- drivers/iommu/sprd-iommu.c | 3 ++- drivers/iommu/sun50i-iommu.c | 2 +- drivers/iommu/tegra-smmu.c | 4 ++-- drivers/iommu/virtio-iommu.c | 3 ++- include/linux/iommu.h | 4 ++-- 16 files changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/apple-dart.c b/drivers/iommu/apple-dart.c index ef3ee95706dac..eb1e62cd499a5 100644 --- a/drivers/iommu/apple-dart.c +++ b/drivers/iommu/apple-dart.c @@ -779,7 +779,8 @@ static void apple_dart_domain_free(struct iommu_domain *domain) kfree(dart_domain); } -static int apple_dart_of_xlate(struct device *dev, struct of_phandle_args *args) +static int apple_dart_of_xlate(struct device *dev, + const struct of_phandle_args *args) { struct apple_dart_master_cfg *cfg = dev_iommu_priv_get(dev); struct platform_device *iommu_pdev = of_find_device_by_node(args->np); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e5d097b622fba..38540ec665a2c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2950,7 +2950,8 @@ static int arm_smmu_enable_nesting(struct iommu_domain *domain) return ret; } -static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args) +static int arm_smmu_of_xlate(struct device *dev, + const struct of_phandle_args *args) { return iommu_fwspec_add_ids(dev, args->args, 1); } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 6317aaf7b3ab1..c572d877b0e10 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1546,7 +1546,8 @@ static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain, return ret; } -static int arm_smmu_of_xlate(struct device *dev, struct of_phandle_args *args) +static int arm_smmu_of_xlate(struct device *dev, + const struct of_phandle_args *args) { u32 mask, fwid = 0; diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index 17a1c163fef66..e079bb7a993e2 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -546,7 +546,8 @@ static struct iommu_device *qcom_iommu_probe_device(struct device *dev) return &qcom_iommu->iommu; } -static int qcom_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) +static int qcom_iommu_of_xlate(struct device *dev, + const struct of_phandle_args *args) { struct qcom_iommu_dev *qcom_iommu; struct platform_device *iommu_pdev; diff --git a/drivers/iommu/exynos-iommu.c b/drivers/iommu/exynos-iommu.c index 2c6e9094f1e97..d98c9161948a2 100644 --- a/drivers/iommu/exynos-iommu.c +++ b/drivers/iommu/exynos-iommu.c @@ -1431,7 +1431,7 @@ static void exynos_iommu_release_device(struct device *dev) } static int exynos_iommu_of_xlate(struct device *dev, - struct of_phandle_args *spec) + const struct of_phandle_args *spec) { struct platform_device *sysmmu = of_find_device_by_node(spec->np); struct exynos_iommu_owner *owner = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 15d97aabf11e3..cba919e3f02a5 100644 --- 
a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2855,7 +2855,7 @@ void iommu_fwspec_free(struct device *dev) } EXPORT_SYMBOL_GPL(iommu_fwspec_free); -int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids) +int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); int i, new_num; diff --git a/drivers/iommu/ipmmu-vmsa.c b/drivers/iommu/ipmmu-vmsa.c index 90d3f03242db8..b657cc09605f4 100644 --- a/drivers/iommu/ipmmu-vmsa.c +++ b/drivers/iommu/ipmmu-vmsa.c @@ -709,7 +709,7 @@ static phys_addr_t ipmmu_iova_to_phys(struct iommu_domain *io_domain, } static int ipmmu_init_platform_device(struct device *dev, - struct of_phandle_args *args) + const struct of_phandle_args *args) { struct platform_device *ipmmu_pdev; @@ -773,7 +773,7 @@ static bool ipmmu_device_is_allowed(struct device *dev) } static int ipmmu_of_xlate(struct device *dev, - struct of_phandle_args *spec) + const struct of_phandle_args *spec) { if (!ipmmu_device_is_allowed(dev)) return -ENODEV; diff --git a/drivers/iommu/msm_iommu.c b/drivers/iommu/msm_iommu.c index f86af9815d6f9..989e0869d8055 100644 --- a/drivers/iommu/msm_iommu.c +++ b/drivers/iommu/msm_iommu.c @@ -598,7 +598,7 @@ static void print_ctx_regs(void __iomem *base, int ctx) static int insert_iommu_master(struct device *dev, struct msm_iommu_dev **iommu, - struct of_phandle_args *spec) + const struct of_phandle_args *spec) { struct msm_iommu_ctx_dev *master = dev_iommu_priv_get(dev); int sid; @@ -626,7 +626,7 @@ static int insert_iommu_master(struct device *dev, } static int qcom_iommu_of_xlate(struct device *dev, - struct of_phandle_args *spec) + const struct of_phandle_args *spec) { struct msm_iommu_dev *iommu = NULL, *iter; unsigned long flags; diff --git a/drivers/iommu/mtk_iommu.c b/drivers/iommu/mtk_iommu.c index 51d0eba8cbdf3..358e8ee9506c0 100644 --- a/drivers/iommu/mtk_iommu.c +++ b/drivers/iommu/mtk_iommu.c @@ -957,7 +957,8 @@ static struct iommu_group *mtk_iommu_device_group(struct device *dev) return group; } -static int mtk_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) +static int mtk_iommu_of_xlate(struct device *dev, + const struct of_phandle_args *args) { struct platform_device *m4updev; diff --git a/drivers/iommu/mtk_iommu_v1.c b/drivers/iommu/mtk_iommu_v1.c index 32cc8341d3726..0ddcd153b568d 100644 --- a/drivers/iommu/mtk_iommu_v1.c +++ b/drivers/iommu/mtk_iommu_v1.c @@ -398,7 +398,8 @@ static const struct iommu_ops mtk_iommu_v1_ops; * MTK generation one iommu HW only support one iommu domain, and all the client * sharing the same iova address space. 
*/ -static int mtk_iommu_v1_create_mapping(struct device *dev, struct of_phandle_args *args) +static int mtk_iommu_v1_create_mapping(struct device *dev, + const struct of_phandle_args *args) { struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct mtk_iommu_v1_data *data; diff --git a/drivers/iommu/rockchip-iommu.c b/drivers/iommu/rockchip-iommu.c index 2685861c0a126..da79d9f4cf637 100644 --- a/drivers/iommu/rockchip-iommu.c +++ b/drivers/iommu/rockchip-iommu.c @@ -1140,7 +1140,7 @@ static void rk_iommu_release_device(struct device *dev) } static int rk_iommu_of_xlate(struct device *dev, - struct of_phandle_args *args) + const struct of_phandle_args *args) { struct platform_device *iommu_dev; struct rk_iommudata *data; diff --git a/drivers/iommu/sprd-iommu.c b/drivers/iommu/sprd-iommu.c index 537359f109979..ba53571a82390 100644 --- a/drivers/iommu/sprd-iommu.c +++ b/drivers/iommu/sprd-iommu.c @@ -390,7 +390,8 @@ static struct iommu_device *sprd_iommu_probe_device(struct device *dev) return &sdev->iommu; } -static int sprd_iommu_of_xlate(struct device *dev, struct of_phandle_args *args) +static int sprd_iommu_of_xlate(struct device *dev, + const struct of_phandle_args *args) { struct platform_device *pdev; diff --git a/drivers/iommu/sun50i-iommu.c b/drivers/iommu/sun50i-iommu.c index 41484a5a399bb..decd52cba998a 100644 --- a/drivers/iommu/sun50i-iommu.c +++ b/drivers/iommu/sun50i-iommu.c @@ -819,7 +819,7 @@ static struct iommu_device *sun50i_iommu_probe_device(struct device *dev) } static int sun50i_iommu_of_xlate(struct device *dev, - struct of_phandle_args *args) + const struct of_phandle_args *args) { struct platform_device *iommu_pdev = of_find_device_by_node(args->np); unsigned id = args->args[0]; diff --git a/drivers/iommu/tegra-smmu.c b/drivers/iommu/tegra-smmu.c index 310871728ab4b..14e525bd0d9bb 100644 --- a/drivers/iommu/tegra-smmu.c +++ b/drivers/iommu/tegra-smmu.c @@ -830,7 +830,7 @@ static struct tegra_smmu *tegra_smmu_find(struct device_node *np) } static int tegra_smmu_configure(struct tegra_smmu *smmu, struct device *dev, - struct of_phandle_args *args) + const struct of_phandle_args *args) { const struct iommu_ops *ops = smmu->iommu.ops; int err; @@ -959,7 +959,7 @@ static struct iommu_group *tegra_smmu_device_group(struct device *dev) } static int tegra_smmu_of_xlate(struct device *dev, - struct of_phandle_args *args) + const struct of_phandle_args *args) { struct platform_device *iommu_pdev = of_find_device_by_node(args->np); struct tegra_mc *mc = platform_get_drvdata(iommu_pdev); diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c index 34db37fd9675c..04048f64a2c0f 100644 --- a/drivers/iommu/virtio-iommu.c +++ b/drivers/iommu/virtio-iommu.c @@ -1051,7 +1051,8 @@ static struct iommu_group *viommu_device_group(struct device *dev) return generic_device_group(dev); } -static int viommu_of_xlate(struct device *dev, struct of_phandle_args *args) +static int viommu_of_xlate(struct device *dev, + const struct of_phandle_args *args) { return iommu_fwspec_add_ids(dev, args->args, 1); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 36375e7651f48..3495f6a634138 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -567,7 +567,7 @@ struct iommu_ops { /* Request/Free a list of reserved regions for a device */ void (*get_resv_regions)(struct device *dev, struct list_head *list); - int (*of_xlate)(struct device *dev, struct of_phandle_args *args); + int (*of_xlate)(struct device *dev, const struct of_phandle_args *args); bool 
(*is_attach_deferred)(struct device *dev); /* Per device IOMMU features */ @@ -989,7 +989,7 @@ struct iommu_mm_data { int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops); void iommu_fwspec_free(struct device *dev); -int iommu_fwspec_add_ids(struct device *dev, u32 *ids, int num_ids); +int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) From ecd2660badc2a5123da7f0f975171f0b9be0ca9a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 15:40:26 +0100 Subject: [PATCH 116/352] iommu: constify fwnode in iommu_ops_from_fwnode() Make pointer to fwnode_handle a pointer to const for code safety. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240216144027.185959-3-krzysztof.kozlowski@linaro.org Signed-off-by: Joerg Roedel (cherry picked from commit 5896e6e39b86c1d820b3ccf5caea9aef40c2eacd) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommu.c | 2 +- include/linux/iommu.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index cba919e3f02a5..659a77f7bb833 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2804,7 +2804,7 @@ bool iommu_default_passthrough(void) } EXPORT_SYMBOL_GPL(iommu_default_passthrough); -const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) { const struct iommu_ops *ops = NULL; struct iommu_device *iommu; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3495f6a634138..6e1563c943d6c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -990,7 +990,7 @@ int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, const struct iommu_ops *ops); void iommu_fwspec_free(struct device *dev); int iommu_fwspec_add_ids(struct device *dev, const u32 *ids, int num_ids); -const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode); +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode); static inline struct iommu_fwspec *dev_iommu_fwspec_get(struct device *dev) { @@ -1313,7 +1313,7 @@ static inline int iommu_fwspec_add_ids(struct device *dev, u32 *ids, } static inline -const struct iommu_ops *iommu_ops_from_fwnode(struct fwnode_handle *fwnode) +const struct iommu_ops *iommu_ops_from_fwnode(const struct fwnode_handle *fwnode) { return NULL; } From cda186fd3f430942e908c978e65c3a24547d267e Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 16 Feb 2024 15:40:27 +0100 Subject: [PATCH 117/352] iommu: re-use local fwnode variable in iommu_ops_from_fwnode() iommu_ops_from_fwnode() stores &iommu_spec->np->fwnode in local variable, so use it to simplify the code (iommu_spec is not changed between these dereferences). Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240216144027.185959-4-krzysztof.kozlowski@linaro.org Signed-off-by: Joerg Roedel (cherry picked from commit b5a1f7513a2fcb1b9e646128ff0cb0dbba5ed6c1) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/of_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 719652b608407..3afe0b48a48db 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -29,7 +29,7 @@ static int of_iommu_xlate(struct device *dev, !of_device_is_available(iommu_spec->np)) return -ENODEV; - ret = iommu_fwspec_init(dev, &iommu_spec->np->fwnode, ops); + ret = iommu_fwspec_init(dev, fwnode, ops); if (ret) return ret; /* From 8e7d3d48678fb7f63b91822168b3616b0c87dcfb Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 29 Feb 2024 19:39:34 +0000 Subject: [PATCH 118/352] vfio/nvgrace-gpu: Convey kvm to map device memory region as noncached The NVIDIA Grace Hopper GPUs have device memory that is supposed to be used as regular RAM. It is accessible through the CPU-GPU chip-to-chip cache-coherent interconnect and is present in the system physical address space. The device memory is split into two regions - termed usemem and resmem - in the system physical address space, with each region mapped and exposed to the VM as a separate fake device BAR [1]. Owing to a hardware defect for the Multi-Instance GPU (MIG) feature [2], there is a requirement - as a workaround - for the resmem BAR to display uncached memory characteristics. Based on [3], on systems with FWB enabled such as Grace Hopper, the requisite properties (uncached, unaligned access) can be achieved through a VM mapping (S1) of NORMAL_NC and host mapping (S2) of MT_S2_FWB_NORMAL_NC. KVM currently maps the MMIO region in S2 as MT_S2_FWB_DEVICE_nGnRE by default. The fake device BARs thus display DEVICE_nGnRE behavior in the VM. The following table summarizes the behavior for the various S1 and S2 mapping combinations for systems with FWB enabled [3].
S1 | S2 | Result
NORMAL_NC | NORMAL_NC | NORMAL_NC
NORMAL_NC | DEVICE_nGnRE | DEVICE_nGnRE
Recently a change was added that modifies this default behavior and makes KVM map MMIO as MT_S2_FWB_NORMAL_NC when a VMA flag VM_ALLOW_ANY_UNCACHED is set [4]. Setting S2 as MT_S2_FWB_NORMAL_NC provides the desired behavior (uncached, unaligned access) for resmem. To use the VM_ALLOW_ANY_UNCACHED flag, the platform must guarantee that no action taken on the MMIO mapping can trigger an uncontained failure. The Grace Hopper satisfies this requirement. So set the VM_ALLOW_ANY_UNCACHED flag in the VMA. Applied over next-20240227. base-commit: 22ba90670a51 Link: https://lore.kernel.org/all/20240220115055.23546-4-ankita@nvidia.com/ [1] Link: https://www.nvidia.com/en-in/technologies/multi-instance-gpu/ [2] Link: https://developer.arm.com/documentation/ddi0487/latest/ section D8.5.5 [3] Link: https://lore.kernel.org/all/20240224150546.368-1-ankita@nvidia.com/ [4] Cc: Alex Williamson Cc: Kevin Tian Cc: Jason Gunthorpe Cc: Vikram Sethi Cc: Zhi Wang Signed-off-by: Ankit Agrawal Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240229193934.2417-1-ankita@nvidia.com Signed-off-by: Alex Williamson (cherry picked from commit 81617c17bf58f008a57da74b97e60a0bf8e971fd) Signed-off-by: Matthew R.
Ochs --- drivers/vfio/pci/nvgrace-gpu/main.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 25814006352d0..a7fd018aa5483 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -160,8 +160,17 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, * The carved out region of the device memory needs the NORMAL_NC * property. Communicate as such to the hypervisor. */ - if (index == RESMEM_REGION_INDEX) + if (index == RESMEM_REGION_INDEX) { + /* + * The nvgrace-gpu module has no issues with uncontained + * failures on NORMAL_NC accesses. VM_ALLOW_ANY_UNCACHED is + * set to communicate to the KVM to S2 map as NORMAL_NC. + * This opens up guest usage of NORMAL_NC for this mapping. + */ + vm_flags_set(vma, VM_ALLOW_ANY_UNCACHED); + vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot); + } /* * Perform a PFN map to the memory and back the device BAR by the From 7ae1386a71424141599608e7325a7456bf3bbd08 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 29 Feb 2024 15:35:40 -0700 Subject: [PATCH 119/352] Revert "vfio/type1: Unpin zero pages" This reverts commit 873aefb376bbc0ed1dd2381ea1d6ec88106fdbd4. This was a heinous workaround and it turns out it's been fixed in mm twice since it was introduced. Most recently, commit c8070b787519 ("mm: Don't pin ZERO_PAGE in pin_user_pages()") would have prevented running up the zeropage refcount, but even before that commit 84209e87c696 ("mm/gup: reliable R/O long-term pinning in COW mappings") keeps the vfio use case from pinning the zeropage at all, instead replacing it with exclusive anonymous pages. Remove this now useless overhead. Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240229223544.257207-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson (cherry picked from commit 5b992412776cdf2ec88b5b5138112e6b36e47995) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/vfio/vfio_iommu_type1.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index b2854d7939ce0..b5c15fe8f9fcf 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -567,18 +567,6 @@ static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr, ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM, pages, NULL); if (ret > 0) { - int i; - - /* - * The zero page is always resident, we don't need to pin it - * and it falls into our invalid/reserved test so we don't - * unpin in put_pfn(). Unpin all zero pages in the batch here. - */ - for (i = 0 ; i < ret; i++) { - if (unlikely(is_zero_pfn(page_to_pfn(pages[i])))) - unpin_user_page(pages[i]); - } - *pfn = page_to_pfn(pages[0]); goto done; } From 26583d3c550de7de8880a868cecea5c878ddd4de Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 4 Mar 2024 12:05:42 +0000 Subject: [PATCH 120/352] iommu/dma: Document min_align_mask assumption iommu-dma does not explicitly reference min_align_mask since we already assume that will be less than or equal to any typical IOVA granule. We wouldn't realistically expect to see the case where it is larger, and that would be non-trivial to support; however, for the sake of reasoning (particularly around the interaction with SWIOTLB), let's clearly enforce the assumption.
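For intuition, the arithmetic behind the new check can be modelled in a few lines of userspace C; iova_mask() and iova_align() are re-implemented here for illustration and the constants are invented, so this is not the kernel implementation:

#include <stdint.h>
#include <stdio.h>

/* Mask of offset bits within one IOVA granule. */
static uint64_t iova_mask(uint64_t granule) { return granule - 1; }

/* Round a size up to a whole number of granules. */
static uint64_t iova_align(uint64_t granule, uint64_t size)
{
	return (size + iova_mask(granule)) & ~iova_mask(granule);
}

int main(void)
{
	uint64_t granule = 4096;            /* typical IOVA granule        */
	uint64_t min_align_mask = 0x7ff;    /* device wants 2 KiB alignment */

	/* The assumption the patch enforces: the mask fits in a granule. */
	if (min_align_mask > iova_mask(granule))
		printf("unsupported alignment constraint -> DMA_MAPPING_ERROR\n");
	else
		printf("constraint holds; 5000+8 bytes take %llu bytes of IOVA\n",
		       (unsigned long long)iova_align(granule, 5000 + 8));
	return 0;
}

When min_align_mask fits within the granule, granule-aligned IOVA allocation automatically preserves the finer alignment; a larger mask would need support in the IOVA allocator, hence the warn-and-fail.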
Signed-off-by: Robin Murphy Reviewed-by: Michael Kelley Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/dbb4d2d8e5d1691ac9a6c67e9758904e6c447ba5.1709553942.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel (cherry picked from commit e2addba4930558a6f37a2e4b2cb6f726638a6dce) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/dma-iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 639efa0c40721..e4cb26f6a9434 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -859,6 +859,11 @@ static dma_addr_t __iommu_dma_map(struct device *dev, phys_addr_t phys, iommu_deferred_attach(dev, domain)) return DMA_MAPPING_ERROR; + /* If anyone ever wants this we'd need support in the IOVA allocator */ + if (dev_WARN_ONCE(dev, dma_get_min_align_mask(dev) > iova_mask(iovad), + "Unsupported alignment constraint\n")) + return DMA_MAPPING_ERROR; + size = iova_align(iovad, size + iova_off); iova = iommu_dma_alloc_iova(domain, size, dma_mask, dev); From e9a3919549e49a17a1d535394f75e0734133c564 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 4 Mar 2024 15:50:08 -0400 Subject: [PATCH 121/352] iommu/arm-smmu-v3: Add cpu_to_le64() around STRTAB_STE_0_V STRTAB_STE_0_V is a CPU value; it needs conversion for sparse to be clean. The missing annotation was a mistake introduced by splitting the ops out from the STE writer. Fixes: 7da51af9125c ("iommu/arm-smmu-v3: Make STE programming independent of the callers") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202403011441.5WqGrYjp-lkp@intel.com/ Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-98b23ebb0c84+9f-smmu_cputole_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 0493e739ccc60a3e0870847f1a12d6d79b86a1fc) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 38540ec665a2c..c66dbd0d6b5c1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1139,7 +1139,8 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, * requires a breaking update, zero the V bit, write all qwords * but 0, then set qword 0 */ - unused_update.data[0] = entry->data[0] & (~STRTAB_STE_0_V); + unused_update.data[0] = entry->data[0] & + cpu_to_le64(~STRTAB_STE_0_V); entry_set(smmu, sid, entry, &unused_update, 0, 1); entry_set(smmu, sid, entry, target, 1, num_entry_qwords - 1); entry_set(smmu, sid, entry, target, 0, 1); From 19cf06d89c83faf9849be5fa4776403292dc4ca1 Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Sat, 23 Mar 2024 13:46:58 +0000 Subject: [PATCH 122/352] iommu/arm-smmu-v3: Fix access for STE.SHCFG STE attributes (NSCFG, PRIVCFG, INSTCFG) use value 0 for "Use Incoming"; for some reason SHCFG doesn't follow that, and it is defined as "0b01". Currently the driver sets SHCFG to Use Incoming for stage-2 and bypass domains. However, according to the User Manual (ARM IHI 0070 F.b): When SMMU_IDR1.ATTR_TYPES_OVR == 0, this field is RES0 and the incoming Shareability attribute is used.
This patch adds a condition for writing SHCFG to Use Incoming, to be compliant with the architecture, and defines ATTR_TYPES_OVR as a new feature discovered from IDR1. This also required propagating the SMMU pointer through some function arguments. There is no need to add a similar condition to the newly introduced function arm_smmu_get_ste_used(), as the values of the STE are the same before and after any transition, so this will not trigger any change. (We already do the same for the VMID.) Although this is a misconfiguration by the driver, it has been there for a long time, so probably no HW running Linux is affected by it. Reported-by: Will Deacon Closes: https://lore.kernel.org/all/20240215134952.GA690@willie-the-truck/ Signed-off-by: Mostafa Saleh Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240323134658.464743-1-smostafa@google.com Signed-off-by: Will Deacon (cherry picked from commit ec9098d6bffea6e82d63640134c123a3d96e0781) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 35 ++++++++++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 ++ 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c66dbd0d6b5c1..7161e76edcaa5 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1454,14 +1454,17 @@ static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT)); } -static void arm_smmu_make_bypass_ste(struct arm_smmu_ste *target) +static void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, + struct arm_smmu_ste *target) { memset(target, 0, sizeof(*target)); target->data[0] = cpu_to_le64( STRTAB_STE_0_V | FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_BYPASS)); - target->data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); + + if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) + target->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); } static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, @@ -1524,6 +1527,7 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, typeof(&pgtbl_cfg->arm_lpae_s2_cfg.vtcr) vtcr = &pgtbl_cfg->arm_lpae_s2_cfg.vtcr; u64 vtcr_val; + struct arm_smmu_device *smmu = master->smmu; memset(target, 0, sizeof(*target)); target->data[0] = cpu_to_le64( @@ -1532,9 +1536,11 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, target->data[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_EATS, - master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0) | - FIELD_PREP(STRTAB_STE_1_SHCFG, - STRTAB_STE_1_SHCFG_INCOMING)); + master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + + if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) + target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, + STRTAB_STE_1_SHCFG_INCOMING)); vtcr_val = FIELD_PREP(STRTAB_STE_2_VTCR_S2T0SZ, vtcr->tsz) | FIELD_PREP(STRTAB_STE_2_VTCR_S2SL0, vtcr->sl) | @@ -1561,7 +1567,8 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, * This can safely directly manipulate the STE memory without a sync sequence * because the STE table has not been installed in the SMMU yet.
*/ -static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, +static void arm_smmu_init_initial_stes(struct arm_smmu_device *smmu, + struct arm_smmu_ste *strtab, unsigned int nent) { unsigned int i; @@ -1570,7 +1577,7 @@ static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, if (disable_bypass) arm_smmu_make_abort_ste(strtab); else - arm_smmu_make_bypass_ste(strtab); + arm_smmu_make_bypass_ste(smmu, strtab); strtab++; } } @@ -1598,7 +1605,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return -ENOMEM; } - arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); + arm_smmu_init_initial_stes(smmu, desc->l2ptr, 1 << STRTAB_SPLIT); arm_smmu_write_strtab_l1_desc(strtab, desc); return 0; } @@ -2638,8 +2645,9 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, struct device *dev) { struct arm_smmu_ste ste; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); - arm_smmu_make_bypass_ste(&ste); + arm_smmu_make_bypass_ste(master->smmu, &ste); return arm_smmu_attach_dev_ste(dev, &ste); } @@ -3265,7 +3273,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); cfg->strtab_base_cfg = reg; - arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents); + arm_smmu_init_initial_stes(smmu, strtab, cfg->num_l1_ents); return 0; } @@ -3777,6 +3785,9 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) return -ENXIO; } + if (reg & IDR1_ATTR_TYPES_OVR) + smmu->features |= ARM_SMMU_FEAT_ATTR_TYPES_OVR; + /* Queue sizes, capped to ensure natural alignment */ smmu->cmdq.q.llq.max_n_shift = min_t(u32, CMDQ_MAX_SZ_SHIFT, FIELD_GET(IDR1_CMDQS, reg)); @@ -3992,7 +4003,7 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) * STE table is not programmed to HW, see * arm_smmu_initial_bypass_stes() */ - arm_smmu_make_bypass_ste( + arm_smmu_make_bypass_ste(smmu, arm_smmu_get_step_for_sid(smmu, rmr->sids[i])); } } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 23baf117e7e4b..2a19bb63e5c6d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -44,6 +44,7 @@ #define IDR1_TABLES_PRESET (1 << 30) #define IDR1_QUEUES_PRESET (1 << 29) #define IDR1_REL (1 << 28) +#define IDR1_ATTR_TYPES_OVR (1 << 27) #define IDR1_CMDQS GENMASK(25, 21) #define IDR1_EVTQS GENMASK(20, 16) #define IDR1_PRIQS GENMASK(15, 11) @@ -647,6 +648,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_SVA (1 << 17) #define ARM_SMMU_FEAT_E2H (1 << 18) #define ARM_SMMU_FEAT_NESTING (1 << 19) +#define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) From e270fc6e40a10f812ee4acfed204d4764762519a Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Tue, 26 Mar 2024 20:45:48 -0700 Subject: [PATCH 123/352] swiotlb: fix swiotlb_bounce() to do partial sync's correctly In current code, swiotlb_bounce() may do partial sync's correctly in some circumstances, but may incorrectly fail in other circumstances. 
The failure cases require both of these to be true: 1) swiotlb_align_offset() returns a non-zero "offset" value 2) the tlb_addr of the partial sync area points into the first "offset" bytes of the _second_ or subsequent swiotlb slot allocated for the mapping Code added in commit 868c9ddc182b ("swiotlb: add overflow checks to swiotlb_bounce") attempts to WARN on the invalid case where tlb_addr points into the first "offset" bytes of the _first_ allocated slot. But there's no way for swiotlb_bounce() to distinguish the first slot from the second and subsequent slots, so the WARN can be triggered incorrectly when #2 above is true. Related, current code calculates an adjustment to the orig_addr stored in the swiotlb slot. The adjustment compensates for the difference in the tlb_addr used for the partial sync vs. the tlb_addr for the full mapping. The adjustment is stored in the local variable tlb_offset. But when #1 and #2 above are true, it's valid for this adjustment to be negative. In such case the arithmetic to adjust orig_addr produces the wrong result due to tlb_offset being declared as unsigned. Fix these problems by removing the over-constraining validations added in 868c9ddc182b. Change the declaration of tlb_offset to be signed instead of unsigned so the adjustment arithmetic works correctly. Tested with a test-only hack to how swiotlb_tbl_map_single() calls swiotlb_bounce(). Instead of calling swiotlb_bounce() just once for the entire mapped area, do a loop with each iteration doing only a 128 byte partial sync until the entire mapped area is sync'ed. Then with swiotlb=force on the kernel boot line, run a variety of raw disk writes followed by read and verification of all bytes of the written data. The storage device has DMA min_align_mask set, and the writes are done with a variety of original buffer memory address alignments and overall buffer sizes. For many of the combinations, current code triggers the WARN statements, or the data verification fails. With the fixes, no WARNs occur and all verifications pass. Fixes: 5f89468e2f06 ("swiotlb: manipulate orig_addr when tlb_addr has offset") Fixes: 868c9ddc182b ("swiotlb: add overflow checks to swiotlb_bounce") Signed-off-by: Michael Kelley Dominique Martinet Signed-off-by: Christoph Hellwig (cherry picked from commit e8068f2d756d57a5206fa3180ade365a8c12ed85) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- kernel/dma/swiotlb.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b77a661989990..3386a7a4f834e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -863,27 +863,23 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start; - unsigned int tlb_offset, orig_addr_offset; + int tlb_offset; if (orig_addr == INVALID_PHYS_ADDR) return; - tlb_offset = tlb_addr & (IO_TLB_SIZE - 1); - orig_addr_offset = swiotlb_align_offset(dev, 0, orig_addr); - if (tlb_offset < orig_addr_offset) { - dev_WARN_ONCE(dev, 1, - "Access before mapping start detected. orig offset %u, requested offset %u.\n", - orig_addr_offset, tlb_offset); - return; - } - - tlb_offset -= orig_addr_offset; - if (tlb_offset > alloc_size) { - dev_WARN_ONCE(dev, 1, - "Buffer overflow detected. Allocation size: %zu. 
Mapping size: %zu+%u.\n", - alloc_size, size, tlb_offset); - return; - } + /* + * It's valid for tlb_offset to be negative. This can happen when the + * "offset" returned by swiotlb_align_offset() is non-zero, and the + * tlb_addr is pointing within the first "offset" bytes of the second + * or subsequent slots of the allocated swiotlb area. While it's not + * valid for tlb_addr to be pointing within the first "offset" bytes + * of the first slot, there's no way to check for such an error since + * this function can't distinguish the first slot from the second and + * subsequent slots. + */ + tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - + swiotlb_align_offset(dev, 0, orig_addr); orig_addr += tlb_offset; alloc_size -= tlb_offset; From 0289ed012a62e42c56a146d0764a00b78327d6f6 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 5 Apr 2024 17:52:07 +0100 Subject: [PATCH 124/352] iommu/arm-smmu-v3: Retire disable_bypass parameter The disable_bypass parameter has been mostly meaningless for a long time since the introduction of default domains. Its original intent is now fulfilled by the controls users have over the default domain type, and its remaining effect in the brief window between Stream Table initialisation and default domain creation hardly seems worth the complication. Furthermore, thanks to 2-level Stream Tables, disabling disable_bypass (there's another reason not to like it right there) has never guaranteed that any particular StreamID *will* bypass anyway - any device which might actually care about that wants RMRs - so there's not really much lost by taking away that option (which has already been non-default for nearing 6 years now). As part of this, also remove the weird behaviour where we "successfully" probe and register a non-functional SMMU if the DT "#iommu-cells" property is wrong. I have no memory of what possessed me to think that was a good idea at the time, and by now I suspect it's likely to break things worse than simply failing probe would. Signed-off-by: Robin Murphy Reviewed-by: Mostafa Saleh Link: https://lore.kernel.org/r/ea3ac4cd595a81b5511729601b2f7d4668178438.1712335927.git.robin.murphy@arm.com Signed-off-by: Will Deacon (cherry picked from commit 734554fdfce6731b22f0777ec3f1e4a853354883) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 46 ++++++--------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 7161e76edcaa5..db5d2428081a4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -30,11 +30,6 @@ #include "arm-smmu-v3.h" #include "../../dma-iommu.h" -static bool disable_bypass = true; -module_param(disable_bypass, bool, 0444); -MODULE_PARM_DESC(disable_bypass, - "Disable bypass streams such that incoming transactions from devices that are not attached to an iommu domain will report an abort back to the device and will not be allowed to pass through the SMMU."); - static bool disable_msipolling; module_param(disable_msipolling, bool, 0444); MODULE_PARM_DESC(disable_msipolling, @@ -1567,17 +1562,13 @@ static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, * This can safely directly manipulate the STE memory without a sync sequence * because the STE table has not been installed in the SMMU yet. 
*/ -static void arm_smmu_init_initial_stes(struct arm_smmu_device *smmu, - struct arm_smmu_ste *strtab, +static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, unsigned int nent) { unsigned int i; for (i = 0; i < nent; ++i) { - if (disable_bypass) - arm_smmu_make_abort_ste(strtab); - else - arm_smmu_make_bypass_ste(smmu, strtab); + arm_smmu_make_abort_ste(strtab); strtab++; } } @@ -1605,7 +1596,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return -ENOMEM; } - arm_smmu_init_initial_stes(smmu, desc->l2ptr, 1 << STRTAB_SPLIT); + arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); arm_smmu_write_strtab_l1_desc(strtab, desc); return 0; } @@ -2915,10 +2906,10 @@ static void arm_smmu_release_device(struct device *dev) iopf_queue_remove_device(master->smmu->evtq.iopf, dev); /* Put the STE back to what arm_smmu_init_strtab() sets */ - if (disable_bypass && !dev->iommu->require_direct) - arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); - else + if (dev->iommu->require_direct) arm_smmu_attach_dev_identity(&arm_smmu_identity_domain, dev); + else + arm_smmu_attach_dev_blocked(&arm_smmu_blocked_domain, dev); arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); @@ -3273,7 +3264,7 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); cfg->strtab_base_cfg = reg; - arm_smmu_init_initial_stes(smmu, strtab, cfg->num_l1_ents); + arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents); return 0; } @@ -3502,7 +3493,7 @@ static int arm_smmu_device_disable(struct arm_smmu_device *smmu) return ret; } -static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) +static int arm_smmu_device_reset(struct arm_smmu_device *smmu) { int ret; u32 reg, enables; @@ -3512,7 +3503,6 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) reg = readl_relaxed(smmu->base + ARM_SMMU_CR0); if (reg & CR0_SMMUEN) { dev_warn(smmu->dev, "SMMU currently enabled! 
Resetting...\n"); - WARN_ON(is_kdump_kernel() && !disable_bypass); arm_smmu_update_gbpa(smmu, GBPA_ABORT, 0); } @@ -3619,14 +3609,8 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu, bool bypass) if (is_kdump_kernel()) enables &= ~(CR0_EVTQEN | CR0_PRIQEN); - /* Enable the SMMU interface, or ensure bypass */ - if (!bypass || disable_bypass) { - enables |= CR0_SMMUEN; - } else { - ret = arm_smmu_update_gbpa(smmu, 0, GBPA_ABORT); - if (ret) - return ret; - } + /* Enable the SMMU interface */ + enables |= CR0_SMMUEN; ret = arm_smmu_write_reg_sync(smmu, enables, ARM_SMMU_CR0, ARM_SMMU_CR0ACK); if (ret) { @@ -4018,7 +4002,6 @@ static int arm_smmu_device_probe(struct platform_device *pdev) resource_size_t ioaddr; struct arm_smmu_device *smmu; struct device *dev = &pdev->dev; - bool bypass; smmu = devm_kzalloc(dev, sizeof(*smmu), GFP_KERNEL); if (!smmu) @@ -4029,12 +4012,9 @@ static int arm_smmu_device_probe(struct platform_device *pdev) ret = arm_smmu_device_dt_probe(pdev, smmu); } else { ret = arm_smmu_device_acpi_probe(pdev, smmu); - if (ret == -ENODEV) - return ret; } - - /* Set bypass mode according to firmware probing result */ - bypass = !!ret; + if (ret) + return ret; /* Base address */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); @@ -4098,7 +4078,7 @@ static int arm_smmu_device_probe(struct platform_device *pdev) arm_smmu_rmr_install_bypass_ste(smmu); /* Reset the device */ - ret = arm_smmu_device_reset(smmu, bypass); + ret = arm_smmu_device_reset(smmu); if (ret) return ret; From 528abd42d29b31ed26ec04d8f1424fd7adb72210 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 27 Mar 2024 15:07:49 -0300 Subject: [PATCH 125/352] iommu/arm-smmu-v3: Do not allow a SVA domain to be set on the wrong PASID The SVA code is wired to assume that the SVA is programmed onto the mm->pasid. The current core code always does this, so it is fine. Add a check for clarity. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v6-228e7adf25eb+4155-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit fdc69d39e77f88264ee6e8174ff9aaf0953aecd9) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 2cd433a9c8a0f..41b44baef15e8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -569,6 +569,9 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, int ret = 0; struct mm_struct *mm = domain->mm; + if (mm_get_enqcmd_pasid(mm) != id) + return -EINVAL; + mutex_lock(&sva_lock); ret = __arm_smmu_sva_bind(dev, id, mm); mutex_unlock(&sva_lock); From 2b70290f943180a2a0e6cbbe34da5e97958ea435 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 27 Mar 2024 15:07:50 -0300 Subject: [PATCH 126/352] iommu/arm-smmu-v3: Do not ATC invalidate the entire domain At this point we know which master we are going to change the PCI config on, this is the only device we need to invalidate. Switch arm_smmu_atc_inv_domain() for arm_smmu_atc_inv_master(). 
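The per-master path amounts to roughly the following (a simplified sketch based on this series; the helper names and fields are taken from the driver, but exact signatures may differ slightly from the tree this is applied to):

	struct arm_smmu_cmdq_ent cmd;
	int i;

	/* An ATC invalidation for SSID IOMMU_NO_PASID with iova/size of 0
	 * is interpreted by the endpoint as "flush the entire ATC". */
	arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd);
	for (i = 0; i < master->num_streams; i++) {
		cmd.atc.sid = master->streams[i].id;
		arm_smmu_cmdq_issue_cmd(master->smmu, &cmd);
	}

Only the device whose PCI configuration is being changed gets invalidated, instead of issuing CMD_ATC_INV for every master attached to the domain.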
Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Moritz Fischer Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v6-228e7adf25eb+4155-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 86e5ca098dd9f8c5b80a6205395aea0535018837) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index db5d2428081a4..8b2703163c43d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2410,7 +2410,10 @@ static void arm_smmu_enable_ats(struct arm_smmu_master *master, pdev = to_pci_dev(master->dev); atomic_inc(&smmu_domain->nr_ats_masters); - arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, 0, 0); + /* + * ATC invalidation of PASID 0 causes the entire ATC to be flushed. + */ + arm_smmu_atc_inv_master(master); if (pci_enable_ats(pdev, stu)) dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); } From 4fe81b270c78cb6ca24cd89de7f259f52614bc93 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 27 Mar 2024 15:07:51 -0300 Subject: [PATCH 127/352] iommu/arm-smmu-v3: Add a type for the CD entry Instead of passing a naked __le16 * around to represent a CD table entry wrap it in a "struct arm_smmu_cd" with an array of the correct size. This makes it much clearer which functions will comprise the "CD API". Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Moritz Fischer Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v6-228e7adf25eb+4155-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit e8e4398d53f98be7ac48e0bda9ea6e26df24136d) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 20 +++++++++++--------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 ++++++- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8b2703163c43d..1099cd5751f2a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1209,7 +1209,8 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst, WRITE_ONCE(*dst, cpu_to_le64(val)); } -static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) +static struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, + u32 ssid) { __le64 *l1ptr; unsigned int idx; @@ -1218,7 +1219,8 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_LINEAR) - return cd_table->cdtab + ssid * CTXDESC_CD_DWORDS; + return (struct arm_smmu_cd *)(cd_table->cdtab + + ssid * CTXDESC_CD_DWORDS); idx = ssid >> CTXDESC_SPLIT; l1_desc = &cd_table->l1_desc[idx]; @@ -1232,7 +1234,7 @@ static __le64 *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) arm_smmu_sync_cd(master, ssid, false); } idx = ssid & (CTXDESC_L2_ENTRIES - 1); - return l1_desc->l2ptr + idx * CTXDESC_CD_DWORDS; + return &l1_desc->l2ptr[idx]; } int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, @@ -1251,7 +1253,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, */ u64 val; bool cd_live; - __le64 *cdptr; + struct arm_smmu_cd *cdptr; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; @@ -1262,7 +1264,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, if (!cdptr) return -ENOMEM; - val = le64_to_cpu(cdptr[0]); + val = le64_to_cpu(cdptr->data[0]); cd_live = !!(val & CTXDESC_CD_0_V); if (!cd) { /* (5) */ @@ -1279,9 +1281,9 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, * this substream's traffic */ } else { /* (1) and (2) */ - cdptr[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); - cdptr[2] = 0; - cdptr[3] = cpu_to_le64(cd->mair); + cdptr->data[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); + cdptr->data[2] = 0; + cdptr->data[3] = cpu_to_le64(cd->mair); /* * STE may be live, and the SMMU might read dwords of this CD in any @@ -1313,7 +1315,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, * field within an aligned 64-bit span of a structure can be altered * without first making the structure invalid. 
*/ - WRITE_ONCE(cdptr[0], cpu_to_le64(val)); + WRITE_ONCE(cdptr->data[0], cpu_to_le64(val)); arm_smmu_sync_cd(master, ssid, true); return 0; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 2a19bb63e5c6d..4b767e0eeeb68 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -283,6 +283,11 @@ struct arm_smmu_ste { #define CTXDESC_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 12) #define CTXDESC_CD_DWORDS 8 + +struct arm_smmu_cd { + __le64 data[CTXDESC_CD_DWORDS]; +}; + #define CTXDESC_CD_0_TCR_T0SZ GENMASK_ULL(5, 0) #define CTXDESC_CD_0_TCR_TG0 GENMASK_ULL(7, 6) #define CTXDESC_CD_0_TCR_IRGN0 GENMASK_ULL(9, 8) @@ -592,7 +597,7 @@ struct arm_smmu_ctx_desc { }; struct arm_smmu_l1_ctx_desc { - __le64 *l2ptr; + struct arm_smmu_cd *l2ptr; dma_addr_t l2ptr_dma; }; From 79028b2e6cfc4b4497a322316ac0c34d4276e574 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 28 Mar 2024 05:29:58 -0700 Subject: [PATCH 128/352] iommu: Pass domain to remove_dev_pasid() op Existing remove_dev_pasid() callbacks of the underlying iommu drivers get the attached domain from the group->pasid_array. However, the domain stored in group->pasid_array is not always correct in all scenarios. A wrong domain may result in failure in remove_dev_pasid() callback. To avoid such problems, it is more reliable to pass the domain to the remove_dev_pasid() op. Suggested-by: Jason Gunthorpe Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240328122958.83332-3-yi.l.liu@intel.com Signed-off-by: Joerg Roedel (cherry picked from commit d2f85a263883b679f87ed8f911746105658e9c47) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 ++------- drivers/iommu/intel/iommu.c | 11 +++-------- drivers/iommu/iommu.c | 9 +++++---- include/linux/iommu.h | 3 ++- 4 files changed, 12 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 1099cd5751f2a..3683cd25c4628 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3049,14 +3049,9 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } -static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) { - struct iommu_domain *domain; - - domain = iommu_get_domain_for_dev_pasid(dev, pasid, IOMMU_DOMAIN_SVA); - if (WARN_ON(IS_ERR(domain)) || !domain) - return; - arm_smmu_sva_remove_dev_pasid(domain, dev, pasid); } diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 605cb77ff4ed5..ae58299c5a9dc 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -4679,19 +4679,15 @@ static int intel_iommu_iotlb_sync_map(struct iommu_domain *domain, return 0; } -static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) +static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) { struct device_domain_info *info = dev_iommu_priv_get(dev); + struct dmar_domain *dmar_domain = to_dmar_domain(domain); struct dev_pasid_info *curr, *dev_pasid = NULL; struct intel_iommu *iommu = info->iommu; - struct dmar_domain *dmar_domain; - struct iommu_domain *domain; unsigned long flags; - domain = iommu_get_domain_for_dev_pasid(dev, pasid, 0); - if (WARN_ON_ONCE(!domain)) - goto out_tear_down; - /* * The SVA implementation needs to handle its own stuffs like the mm * notification. 
Before consolidating that code into iommu core, let @@ -4702,7 +4698,6 @@ static void intel_iommu_remove_dev_pasid(struct device *dev, ioasid_t pasid) goto out_tear_down; } - dmar_domain = to_dmar_domain(domain); spin_lock_irqsave(&dmar_domain->lock, flags); list_for_each_entry(curr, &dmar_domain->dev_pasids, link_domain) { if (curr->dev == dev && curr->pasid == pasid) { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 659a77f7bb833..3183b0ed4cdb9 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3335,20 +3335,21 @@ static int __iommu_set_group_pasid(struct iommu_domain *domain, if (device == last_gdev) break; - ops->remove_dev_pasid(device->dev, pasid); + ops->remove_dev_pasid(device->dev, pasid, domain); } return ret; } static void __iommu_remove_group_pasid(struct iommu_group *group, - ioasid_t pasid) + ioasid_t pasid, + struct iommu_domain *domain) { struct group_device *device; const struct iommu_ops *ops; for_each_group_device(group, device) { ops = dev_iommu_ops(device->dev); - ops->remove_dev_pasid(device->dev, pasid); + ops->remove_dev_pasid(device->dev, pasid, domain); } } @@ -3418,7 +3419,7 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, struct iommu_group *group = dev->iommu_group; mutex_lock(&group->mutex); - __iommu_remove_group_pasid(group, pasid); + __iommu_remove_group_pasid(group, pasid, domain); WARN_ON(xa_erase(&group->pasid_array, pasid) != domain); mutex_unlock(&group->mutex); } diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 6e1563c943d6c..3efd51336ca1d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -578,7 +578,8 @@ struct iommu_ops { struct iommu_page_response *msg); int (*def_domain_type)(struct device *dev); - void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid); + void (*remove_dev_pasid)(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain); const struct iommu_domain_ops *default_domain_ops; unsigned long pgsize_bitmap; From cfc2a8d72fa54c486d2d866f508a6546b0e3b4db Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:13 +0000 Subject: [PATCH 129/352] iommu/dma: use iommu_put_pages_list() to releae freelist Free the IOMMU page tables via iommu_put_pages_list(). The page tables were allocated via iommu_alloc_* functions in architecture specific places, but are released in dma-iommu if the freelist is gathered during map/unmap operations into iommu_iotlb_gather data structure. Currently, only iommu/intel that does that. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Link: https://lore.kernel.org/r/20240413002522.1101315-3-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel (cherry picked from commit 95b18ef9c69157ded5ece1136377cf8123b597f0) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/dma-iommu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e4cb26f6a9434..16a7c4a4f3dba 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -32,6 +32,7 @@ #include #include "dma-iommu.h" +#include "iommu-pages.h" struct iommu_dma_msi_page { struct list_head list; @@ -156,7 +157,7 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq if (fq->entries[idx].counter >= counter) break; - put_pages_list(&fq->entries[idx].freelist); + iommu_put_pages_list(&fq->entries[idx].freelist); free_iova_fast(&cookie->iovad, fq->entries[idx].iova_pfn, fq->entries[idx].pages); @@ -254,7 +255,7 @@ static void iommu_dma_free_fq_single(struct iova_fq *fq) int idx; fq_ring_for_each(idx, fq) - put_pages_list(&fq->entries[idx].freelist); + iommu_put_pages_list(&fq->entries[idx].freelist); vfree(fq); } @@ -267,7 +268,7 @@ static void iommu_dma_free_fq_percpu(struct iova_fq __percpu *percpu_fq) struct iova_fq *fq = per_cpu_ptr(percpu_fq, cpu); fq_ring_for_each(idx, fq) - put_pages_list(&fq->entries[idx].freelist); + iommu_put_pages_list(&fq->entries[idx].freelist); } free_percpu(percpu_fq); From 099c35e16a633adc9d3f26c3bc9cd8052af35dd1 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:12 +0000 Subject: [PATCH 130/352] iommu/vt-d: add wrapper functions for page allocations In order to improve observability and accountability of IOMMU layer, we must account the number of pages that are allocated by functions that are calling directly into buddy allocator. This is achieved by first wrapping the allocation related functions into a separate inline functions in new file: drivers/iommu/iommu-pages.h Convert all page allocation calls under iommu/intel to use these new functions. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-2-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel (cherry picked from commit 06c375053cefc3a2f383d200596abe5ab3fb35f9) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/intel/dmar.c | 16 +-- drivers/iommu/intel/iommu.c | 47 +++------ drivers/iommu/intel/iommu.h | 2 - drivers/iommu/intel/irq_remapping.c | 16 +-- drivers/iommu/intel/pasid.c | 18 ++-- drivers/iommu/intel/svm.c | 11 +- drivers/iommu/iommu-pages.h | 154 ++++++++++++++++++++++++++++ 7 files changed, 201 insertions(+), 63 deletions(-) create mode 100644 drivers/iommu/iommu-pages.h diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c index 36d7427b12026..87ad996e5257f 100644 --- a/drivers/iommu/intel/dmar.c +++ b/drivers/iommu/intel/dmar.c @@ -32,6 +32,7 @@ #include "iommu.h" #include "../irq_remapping.h" +#include "../iommu-pages.h" #include "perf.h" #include "trace.h" #include "perfmon.h" @@ -1187,7 +1188,7 @@ static void free_iommu(struct intel_iommu *iommu) } if (iommu->qi) { - free_page((unsigned long)iommu->qi->desc); + iommu_free_page(iommu->qi->desc); kfree(iommu->qi->desc_status); kfree(iommu->qi); } @@ -1755,7 +1756,8 @@ static void __dmar_enable_qi(struct intel_iommu *iommu) int dmar_enable_qi(struct intel_iommu *iommu) { struct q_inval *qi; - struct page *desc_page; + void *desc; + int order; if (!ecap_qis(iommu->ecap)) return -ENOENT; @@ -1776,19 +1778,19 @@ int dmar_enable_qi(struct intel_iommu *iommu) * Need two pages to accommodate 256 descriptors of 256 bits each * if the remapping hardware supports scalable mode translation. */ - desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO, - !!ecap_smts(iommu->ecap)); - if (!desc_page) { + order = ecap_smts(iommu->ecap) ? 1 : 0; + desc = iommu_alloc_pages_node(iommu->node, GFP_ATOMIC, order); + if (!desc) { kfree(qi); iommu->qi = NULL; return -ENOMEM; } - qi->desc = page_address(desc_page); + qi->desc = desc; qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC); if (!qi->desc_status) { - free_page((unsigned long) qi->desc); + iommu_free_page(qi->desc); kfree(qi); iommu->qi = NULL; return -ENOMEM; diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index ae58299c5a9dc..6155877984c46 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -27,6 +27,7 @@ #include "iommu.h" #include "../dma-iommu.h" #include "../irq_remapping.h" +#include "../iommu-pages.h" #include "pasid.h" #include "cap_audit.h" #include "perfmon.h" @@ -308,22 +309,6 @@ static int __init intel_iommu_setup(char *str) } __setup("intel_iommu=", intel_iommu_setup); -void *alloc_pgtable_page(int node, gfp_t gfp) -{ - struct page *page; - void *vaddr = NULL; - - page = alloc_pages_node(node, gfp | __GFP_ZERO, 0); - if (page) - vaddr = page_address(page); - return vaddr; -} - -void free_pgtable_page(void *vaddr) -{ - free_page((unsigned long)vaddr); -} - static int domain_type_is_si(struct dmar_domain *domain) { return domain->domain.type == IOMMU_DOMAIN_IDENTITY; @@ -555,7 +540,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus, if (!alloc) return NULL; - context = alloc_pgtable_page(iommu->node, GFP_ATOMIC); + context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); if (!context) return NULL; @@ -729,17 +714,17 @@ static void free_context_table(struct intel_iommu *iommu) for (i = 0; i < ROOT_ENTRY_NR; i++) { context = iommu_context_addr(iommu, i, 0, 0); if (context) - free_pgtable_page(context); + iommu_free_page(context); if (!sm_supported(iommu)) continue; context = iommu_context_addr(iommu, i, 0x80, 0); if (context) - free_pgtable_page(context); + iommu_free_page(context); } - free_pgtable_page(iommu->root_entry); + 
iommu_free_page(iommu->root_entry); iommu->root_entry = NULL; } @@ -877,7 +862,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, if (!dma_pte_present(pte)) { uint64_t pteval; - tmp_page = alloc_pgtable_page(domain->nid, gfp); + tmp_page = iommu_alloc_page_node(domain->nid, gfp); if (!tmp_page) return NULL; @@ -889,7 +874,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, if (cmpxchg64(&pte->val, 0ULL, pteval)) /* Someone else set it while we were thinking; use theirs. */ - free_pgtable_page(tmp_page); + iommu_free_page(tmp_page); else domain_flush_cache(domain, pte, sizeof(*pte)); } @@ -1002,7 +987,7 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level, last_pfn < level_pfn + level_size(level) - 1)) { dma_clear_pte(pte); domain_flush_cache(domain, pte, sizeof(*pte)); - free_pgtable_page(level_pte); + iommu_free_page(level_pte); } next: pfn += level_size(level); @@ -1026,7 +1011,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, /* free pgd */ if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) { - free_pgtable_page(domain->pgd); + iommu_free_page(domain->pgd); domain->pgd = NULL; } } @@ -1128,7 +1113,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu) { struct root_entry *root; - root = alloc_pgtable_page(iommu->node, GFP_ATOMIC); + root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC); if (!root) { pr_err("Allocating root entry for %s failed\n", iommu->name); @@ -1851,7 +1836,7 @@ static void domain_exit(struct dmar_domain *domain) LIST_HEAD(freelist); domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist); - put_pages_list(&freelist); + iommu_put_pages_list(&freelist); } if (WARN_ON(!list_empty(&domain->devices))) @@ -2591,7 +2576,7 @@ static int copy_context_table(struct intel_iommu *iommu, if (!old_ce) goto out; - new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL); + new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL); if (!new_ce) goto out_unmap; @@ -3527,7 +3512,7 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb, start_vpfn, mhp->nr_pages, list_empty(&freelist), 0); rcu_read_unlock(); - put_pages_list(&freelist); + iommu_put_pages_list(&freelist); } break; } @@ -3934,7 +3919,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) domain->max_addr = 0; /* always allocate the top pgd */ - domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC); + domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC); if (!domain->pgd) return -ENOMEM; domain_flush_cache(domain, domain->pgd, PAGE_SIZE); @@ -4088,7 +4073,7 @@ int prepare_domain_attach_device(struct iommu_domain *domain, pte = dmar_domain->pgd; if (dma_pte_present(pte)) { dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte)); - free_pgtable_page(pte); + iommu_free_page(pte); } dmar_domain->agaw--; } @@ -4238,7 +4223,7 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain, if (dmar_domain->nested_parent) parent_domain_flush(dmar_domain, start_pfn, nrpages, list_empty(&gather->freelist)); - put_pages_list(&gather->freelist); + iommu_put_pages_list(&gather->freelist); } static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h index 404d2476a8774..8d081d8c6f41d 100644 --- a/drivers/iommu/intel/iommu.h +++ b/drivers/iommu/intel/iommu.h @@ -1085,8 +1085,6 @@ void domain_update_iommu_cap(struct dmar_domain *domain); int dmar_ir_support(void); -void *alloc_pgtable_page(int node, gfp_t gfp); 
-void free_pgtable_page(void *vaddr); void iommu_flush_write_buffer(struct intel_iommu *iommu); struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent, const struct iommu_user_data *user_data); diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c index 566297bc87ddb..39cd9626eb8d0 100644 --- a/drivers/iommu/intel/irq_remapping.c +++ b/drivers/iommu/intel/irq_remapping.c @@ -22,6 +22,7 @@ #include "iommu.h" #include "../irq_remapping.h" +#include "../iommu-pages.h" #include "cap_audit.h" enum irq_mode { @@ -527,7 +528,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) struct ir_table *ir_table; struct fwnode_handle *fn; unsigned long *bitmap; - struct page *pages; + void *ir_table_base; if (iommu->ir_table) return 0; @@ -536,9 +537,9 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) if (!ir_table) return -ENOMEM; - pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, - INTR_REMAP_PAGE_ORDER); - if (!pages) { + ir_table_base = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, + INTR_REMAP_PAGE_ORDER); + if (!ir_table_base) { pr_err("IR%d: failed to allocate pages of order %d\n", iommu->seq_id, INTR_REMAP_PAGE_ORDER); goto out_free_table; @@ -573,7 +574,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) else iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops; - ir_table->base = page_address(pages); + ir_table->base = ir_table_base; ir_table->bitmap = bitmap; iommu->ir_table = ir_table; @@ -622,7 +623,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu) out_free_bitmap: bitmap_free(bitmap); out_free_pages: - __free_pages(pages, INTR_REMAP_PAGE_ORDER); + iommu_free_pages(ir_table_base, INTR_REMAP_PAGE_ORDER); out_free_table: kfree(ir_table); @@ -643,8 +644,7 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu) irq_domain_free_fwnode(fn); iommu->ir_domain = NULL; } - free_pages((unsigned long)iommu->ir_table->base, - INTR_REMAP_PAGE_ORDER); + iommu_free_pages(iommu->ir_table->base, INTR_REMAP_PAGE_ORDER); bitmap_free(iommu->ir_table->bitmap); kfree(iommu->ir_table); iommu->ir_table = NULL; diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c index a51e895d9a178..6ef582bfaea50 100644 --- a/drivers/iommu/intel/pasid.c +++ b/drivers/iommu/intel/pasid.c @@ -20,6 +20,7 @@ #include "iommu.h" #include "pasid.h" +#include "../iommu-pages.h" /* * Intel IOMMU system wide PASID name space: @@ -38,7 +39,7 @@ int intel_pasid_alloc_table(struct device *dev) { struct device_domain_info *info; struct pasid_table *pasid_table; - struct page *pages; + struct pasid_dir_entry *dir; u32 max_pasid = 0; int order, size; @@ -59,14 +60,13 @@ int intel_pasid_alloc_table(struct device *dev) size = max_pasid >> (PASID_PDE_SHIFT - 3); order = size ? 
get_order(size) : 0; - pages = alloc_pages_node(info->iommu->node, - GFP_KERNEL | __GFP_ZERO, order); - if (!pages) { + dir = iommu_alloc_pages_node(info->iommu->node, GFP_KERNEL, order); + if (!dir) { kfree(pasid_table); return -ENOMEM; } - pasid_table->table = page_address(pages); + pasid_table->table = dir; pasid_table->order = order; pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3); info->pasid_table = pasid_table; @@ -97,10 +97,10 @@ void intel_pasid_free_table(struct device *dev) max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT; for (i = 0; i < max_pde; i++) { table = get_pasid_table_from_pde(&dir[i]); - free_pgtable_page(table); + iommu_free_page(table); } - free_pages((unsigned long)pasid_table->table, pasid_table->order); + iommu_free_pages(pasid_table->table, pasid_table->order); kfree(pasid_table); } @@ -146,7 +146,7 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) retry: entries = get_pasid_table_from_pde(&dir[dir_index]); if (!entries) { - entries = alloc_pgtable_page(info->iommu->node, GFP_ATOMIC); + entries = iommu_alloc_page_node(info->iommu->node, GFP_ATOMIC); if (!entries) return NULL; @@ -158,7 +158,7 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid) */ if (cmpxchg64(&dir[dir_index].val, 0ULL, (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) { - free_pgtable_page(entries); + iommu_free_page(entries); goto retry; } if (!ecap_coherent(info->iommu->ecap)) { diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index ee58f0d962272..e42f50284f361 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -22,6 +22,7 @@ #include "iommu.h" #include "pasid.h" #include "perf.h" +#include "../iommu-pages.h" #include "trace.h" static irqreturn_t prq_event_thread(int irq, void *d); @@ -63,16 +64,14 @@ svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev) int intel_svm_enable_prq(struct intel_iommu *iommu) { struct iopf_queue *iopfq; - struct page *pages; int irq, ret; - pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO, PRQ_ORDER); - if (!pages) { + iommu->prq = iommu_alloc_pages_node(iommu->node, GFP_KERNEL, PRQ_ORDER); + if (!iommu->prq) { pr_warn("IOMMU: %s: Failed to allocate page request queue\n", iommu->name); return -ENOMEM; } - iommu->prq = page_address(pages); irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu); if (irq <= 0) { @@ -117,7 +116,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu) dmar_free_hwirq(irq); iommu->pr_irq = 0; free_prq: - free_pages((unsigned long)iommu->prq, PRQ_ORDER); + iommu_free_pages(iommu->prq, PRQ_ORDER); iommu->prq = NULL; return ret; @@ -140,7 +139,7 @@ int intel_svm_finish_prq(struct intel_iommu *iommu) iommu->iopf_queue = NULL; } - free_pages((unsigned long)iommu->prq, PRQ_ORDER); + iommu_free_pages(iommu->prq, PRQ_ORDER); iommu->prq = NULL; return 0; diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h new file mode 100644 index 0000000000000..5a222d0ad25cc --- /dev/null +++ b/drivers/iommu/iommu-pages.h @@ -0,0 +1,154 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2024, Google LLC. + * Pasha Tatashin + */ + +#ifndef __IOMMU_PAGES_H +#define __IOMMU_PAGES_H + +#include +#include +#include + +/* + * All page allocations that should be reported to as "iommu-pagetables" to + * userspace must use one of the functions below. This includes allocations of + * page-tables and other per-iommu_domain configuration structures. 
+ * + * This is necessary for the proper accounting as IOMMU state can be rather + * large, i.e. multiple gigabytes in size. + */ + +/** + * __iommu_alloc_pages - allocate a zeroed page of a given order. + * @gfp: buddy allocator flags + * @order: page order + * + * returns the head struct page of the allocated page. + */ +static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order) +{ + struct page *page; + + page = alloc_pages(gfp | __GFP_ZERO, order); + if (unlikely(!page)) + return NULL; + + return page; +} + +/** + * __iommu_free_pages - free page of a given order + * @page: head struct page of the page + * @order: page order + */ +static inline void __iommu_free_pages(struct page *page, int order) +{ + if (!page) + return; + + __free_pages(page, order); +} + +/** + * iommu_alloc_pages_node - allocate a zeroed page of a given order from + * specific NUMA node. + * @nid: memory NUMA node id + * @gfp: buddy allocator flags + * @order: page order + * + * returns the virtual address of the allocated page + */ +static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order) +{ + struct page *page = alloc_pages_node(nid, gfp | __GFP_ZERO, order); + + if (unlikely(!page)) + return NULL; + + return page_address(page); +} + +/** + * iommu_alloc_pages - allocate a zeroed page of a given order + * @gfp: buddy allocator flags + * @order: page order + * + * returns the virtual address of the allocated page + */ +static inline void *iommu_alloc_pages(gfp_t gfp, int order) +{ + struct page *page = __iommu_alloc_pages(gfp, order); + + if (unlikely(!page)) + return NULL; + + return page_address(page); +} + +/** + * iommu_alloc_page_node - allocate a zeroed page at specific NUMA node. + * @nid: memory NUMA node id + * @gfp: buddy allocator flags + * + * returns the virtual address of the allocated page + */ +static inline void *iommu_alloc_page_node(int nid, gfp_t gfp) +{ + return iommu_alloc_pages_node(nid, gfp, 0); +} + +/** + * iommu_alloc_page - allocate a zeroed page + * @gfp: buddy allocator flags + * + * returns the virtual address of the allocated page + */ +static inline void *iommu_alloc_page(gfp_t gfp) +{ + return iommu_alloc_pages(gfp, 0); +} + +/** + * iommu_free_pages - free page of a given order + * @virt: virtual address of the page to be freed. + * @order: page order + */ +static inline void iommu_free_pages(void *virt, int order) +{ + if (!virt) + return; + + __iommu_free_pages(virt_to_page(virt), order); +} + +/** + * iommu_free_page - free page + * @virt: virtual address of the page to be freed. + */ +static inline void iommu_free_page(void *virt) +{ + iommu_free_pages(virt, 0); +} + +/** + * iommu_put_pages_list - free a list of pages. + * @page: the head of the lru list to be freed. + * + * There are no locking requirement for these pages, as they are going to be + * put on a free list as soon as refcount reaches 0. Pages are put on this LRU + * list once they are removed from the IOMMU page tables. However, they can + * still be access through debugfs. 
+ */ +static inline void iommu_put_pages_list(struct list_head *page) +{ + while (!list_empty(page)) { + struct page *p = list_entry(page->prev, struct page, lru); + + list_del(&p->lru); + put_page(p); + } +} + +#endif /* __IOMMU_PAGES_H */ From bb21953c9dcea4c96f2014b236ec40a7a3aad99a Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:15 +0000 Subject: [PATCH 131/352] iommu/io-pgtable-arm: use page allocation function provided by iommu-pages.h Convert iommu/io-pgtable-arm.c to use the new page allocation functions provided in iommu-pages.h. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-5-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel (cherry picked from commit 9a3dd4c1ee7a183229163320eb38f15b17342f78) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/io-pgtable-arm.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index f7828a7aad410..3d23b924cec16 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -21,6 +21,7 @@ #include #include "io-pgtable-arm.h" +#include "iommu-pages.h" #define ARM_LPAE_MAX_ADDR_BITS 52 #define ARM_LPAE_S2_MAX_CONCAT_PAGES 16 @@ -198,14 +199,10 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, VM_BUG_ON((gfp & __GFP_HIGHMEM)); - if (cfg->alloc) { + if (cfg->alloc) pages = cfg->alloc(cookie, size, gfp); - } else { - struct page *p; - - p = alloc_pages_node(dev_to_node(dev), gfp | __GFP_ZERO, order); - pages = p ? page_address(p) : NULL; - } + else + pages = iommu_alloc_pages_node(dev_to_node(dev), gfp, order); if (!pages) return NULL; @@ -233,7 +230,7 @@ static void *__arm_lpae_alloc_pages(size_t size, gfp_t gfp, if (cfg->free) cfg->free(cookie, pages, size); else - free_pages((unsigned long)pages, order); + iommu_free_pages(pages, order); return NULL; } @@ -249,7 +246,7 @@ static void __arm_lpae_free_pages(void *pages, size_t size, if (cfg->free) cfg->free(cookie, pages, size); else - free_pages((unsigned long)pages, get_order(size)); + iommu_free_pages(pages, get_order(size)); } static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries, From 7ec56eff515c07ceb930a54a80d9c177616ab084 Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:21 +0000 Subject: [PATCH 132/352] iommu: observability of the IOMMU allocations Add NR_IOMMU_PAGES into node_stat_item that counts number of pages that are allocated by the IOMMU subsystem. The allocations can be view per-node via: /sys/devices/system/node/nodeN/vmstat. For example: $ grep iommu /sys/devices/system/node/node*/vmstat /sys/devices/system/node/node0/vmstat:nr_iommu_pages 106025 /sys/devices/system/node/node1/vmstat:nr_iommu_pages 3464 The value is in page-count, therefore, in the above example the iommu allocations amount to ~428M. Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-11-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel (cherry picked from commit bd3520a93a84cd8c3897283e5891a9106fcf5acc) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iommu-pages.h | 30 ++++++++++++++++++++++++++++++ include/linux/mmzone.h | 3 +++ mm/vmstat.c | 3 +++ 3 files changed, 36 insertions(+) diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h index 5a222d0ad25cc..1264b0f6b6c3c 100644 --- a/drivers/iommu/iommu-pages.h +++ b/drivers/iommu/iommu-pages.h @@ -20,6 +20,30 @@ * large, i.e. multiple gigabytes in size. */ +/** + * __iommu_alloc_account - account for newly allocated page. + * @page: head struct page of the page. + * @order: order of the page + */ +static inline void __iommu_alloc_account(struct page *page, int order) +{ + const long pgcnt = 1l << order; + + mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, pgcnt); +} + +/** + * __iommu_free_account - account a page that is about to be freed. + * @page: head struct page of the page. + * @order: order of the page + */ +static inline void __iommu_free_account(struct page *page, int order) +{ + const long pgcnt = 1l << order; + + mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, -pgcnt); +} + /** * __iommu_alloc_pages - allocate a zeroed page of a given order. * @gfp: buddy allocator flags @@ -35,6 +59,8 @@ static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order) if (unlikely(!page)) return NULL; + __iommu_alloc_account(page, order); + return page; } @@ -48,6 +74,7 @@ static inline void __iommu_free_pages(struct page *page, int order) if (!page) return; + __iommu_free_account(page, order); __free_pages(page, order); } @@ -67,6 +94,8 @@ static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order) if (unlikely(!page)) return NULL; + __iommu_alloc_account(page, order); + return page_address(page); } @@ -147,6 +176,7 @@ static inline void iommu_put_pages_list(struct list_head *page) struct page *p = list_entry(page->prev, struct page, lru); list_del(&p->lru); + __iommu_free_account(p, 0); put_page(p); } } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a497f189d9881..bb6bc504915a6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -203,6 +203,9 @@ enum node_stat_item { #endif NR_PAGETABLE, /* used for pagetables */ NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */ +#ifdef CONFIG_IOMMU_SUPPORT + NR_IOMMU_PAGES, /* # of pages allocated by IOMMU */ +#endif #ifdef CONFIG_SWAP NR_SWAPCACHE, #endif diff --git a/mm/vmstat.c b/mm/vmstat.c index db79935e4a543..8507c497218b8 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1242,6 +1242,9 @@ const char * const vmstat_text[] = { #endif "nr_page_table_pages", "nr_sec_page_table_pages", +#ifdef CONFIG_IOMMU_SUPPORT + "nr_iommu_pages", +#endif #ifdef CONFIG_SWAP "nr_swapcached", #endif From 9369ef4edfb29db829d1ad4d9e90f59a23259a4b Mon Sep 17 00:00:00 2001 From: Pasha Tatashin Date: Sat, 13 Apr 2024 00:25:22 +0000 Subject: [PATCH 133/352] iommu: account IOMMU allocated memory In order to be able to limit the amount of memory that is allocated by the IOMMU subsystem, the memory must be accounted. Account IOMMU memory as part of the secondary pagetables, as was discussed at LPC. The value of SecPageTables now contains memory allocations by IOMMU and KVM. There is a difference between GFP_ACCOUNT and what NR_IOMMU_PAGES shows. GFP_ACCOUNT is set only where it makes sense to charge to user processes, i.e. IOMMU Page Tables, but there is more IOMMU shared data that should not really be charged to a specific process.
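As an illustrative sketch of that distinction (names from iommu-pages.h above; nid and PRQ_ORDER stand in for whatever a real caller passes):

	/* Per-domain page-table memory: charge it to the allocating task. */
	void *pgd = iommu_alloc_page_node(nid, GFP_KERNEL_ACCOUNT);

	/* Shared IOMMU state (e.g. a page request queue): count, don't charge. */
	void *prq = iommu_alloc_pages_node(nid, GFP_KERNEL, PRQ_ORDER);

Both allocations are reflected in NR_IOMMU_PAGES and SecPageTables; only the first is also charged to the caller's memory cgroup via __GFP_ACCOUNT.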
Signed-off-by: Pasha Tatashin Acked-by: David Rientjes Tested-by: Bagas Sanjaya Link: https://lore.kernel.org/r/20240413002522.1101315-12-pasha.tatashin@soleen.com Signed-off-by: Joerg Roedel (cherry picked from commit 212c5c078d83d780cf2873ca931df135771e8bb7) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- Documentation/admin-guide/cgroup-v2.rst | 2 +- Documentation/filesystems/proc.rst | 4 ++-- drivers/iommu/iommu-pages.h | 2 ++ include/linux/mmzone.h | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 17e6e95651564..15f80fea8df76 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1432,7 +1432,7 @@ PAGE_SIZE multiple when read back. sec_pagetables Amount of memory allocated for secondary page tables, this currently includes KVM mmu allocations on x86 - and arm64. + and arm64 and IOMMU page tables. percpu (npn) Amount of memory used for storing per-cpu kernel diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 104c6d047d9b5..604b2dccdc5a9 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1110,8 +1110,8 @@ KernelStack PageTables Memory consumed by userspace page tables SecPageTables - Memory consumed by secondary page tables, this currently - currently includes KVM mmu allocations on x86 and arm64. + Memory consumed by secondary page tables, this currently includes + KVM mmu and IOMMU allocations on x86 and arm64. NFS_Unstable Always zero. Previous counted pages which had been written to the server, but has not been committed to stable storage. diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h index 1264b0f6b6c3c..82ebf00330811 100644 --- a/drivers/iommu/iommu-pages.h +++ b/drivers/iommu/iommu-pages.h @@ -30,6 +30,7 @@ static inline void __iommu_alloc_account(struct page *page, int order) const long pgcnt = 1l << order; mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, pgcnt); + mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, pgcnt); } /** @@ -42,6 +43,7 @@ static inline void __iommu_free_account(struct page *page, int order) const long pgcnt = 1l << order; mod_node_page_state(page_pgdat(page), NR_IOMMU_PAGES, -pgcnt); + mod_lruvec_page_state(page, NR_SECONDARY_PAGETABLE, -pgcnt); } /** diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index bb6bc504915a6..a18edcf12d53c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -202,7 +202,7 @@ enum node_stat_item { NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_PAGETABLE, /* used for pagetables */ - NR_SECONDARY_PAGETABLE, /* secondary pagetables, e.g. KVM pagetables */ + NR_SECONDARY_PAGETABLE, /* secondary pagetables, KVM & IOMMU */ #ifdef CONFIG_IOMMU_SUPPORT NR_IOMMU_PAGES, /* # of pages allocated by IOMMU */ #endif From 4003b3cdd0089d91d73cd456bb1bf22f618e5657 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Apr 2024 14:12:11 -0300 Subject: [PATCH 134/352] iommu/arm-smmu: Convert to domain_alloc_paging() Now that the BLOCKED and IDENTITY behaviors are managed with their own domains, change to the domain_alloc_paging() op. The check for using_legacy_binding is now redundant: arm_smmu_def_domain_type() always returns IOMMU_DOMAIN_IDENTITY for this mode, so the core code will never attempt to create a DMA domain in the first place.
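To make the shape of the conversion concrete, here is a minimal sketch of the op for a hypothetical driver (the foo_* names are invented for illustration and appear nowhere in this series); it shows the same move the arm-smmu hunk below makes: the op receives the device rather than a domain type, so the old type checks simply disappear:

#include <linux/iommu.h>
#include <linux/slab.h>

/* Hypothetical driver-private domain; page-table state would live here. */
struct foo_domain {
	struct iommu_domain domain;
};

static struct iommu_domain *foo_domain_alloc_paging(struct device *dev)
{
	struct foo_domain *fd;

	/*
	 * No type argument and no type checks: the core only calls this
	 * op for UNMANAGED/DMA/DMA_FQ (paging) domains.
	 */
	fd = kzalloc(sizeof(*fd), GFP_KERNEL);
	if (!fd)
		return NULL;
	return &fd->domain;
}

static struct iommu_ops foo_iommu_ops = {
	.domain_alloc_paging	= foo_domain_alloc_paging,
	/* ... remaining ops ... */
};

IDENTITY and BLOCKED requests never reach this op; they are served by the static .identity_domain and .blocked_domain entries visible in the arm_smmu_ops hunk below.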
Since commit a4fdd9762272 ("iommu: Use flush queue capability") the core code only passes in IDENTITY/BLOCKED/UNMANAGED/DMA domain types. It will not pass in IDENTITY or BLOCKED if the global statics exist, so the test for DMA is also redundant now too. Cc: Dmitry Baryshkov Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/0-v1-3632c65678e0+2f1-smmu_alloc_paging_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit d75d7dc26f29141fe31167c5414605d4087f0abb) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index c572d877b0e10..5935f44e1d1d6 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -859,14 +859,10 @@ static void arm_smmu_destroy_domain_context(struct arm_smmu_domain *smmu_domain) arm_smmu_rpm_put(smmu); } -static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) +static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) { struct arm_smmu_domain *smmu_domain; - if (type != IOMMU_DOMAIN_UNMANAGED) { - if (using_legacy_binding || type != IOMMU_DOMAIN_DMA) - return NULL; - } /* * Allocate the domain and initialise some of its data structures. * We can't really do anything meaningful until we've added a @@ -1596,7 +1592,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, - .domain_alloc = arm_smmu_domain_alloc, + .domain_alloc_paging = arm_smmu_domain_alloc_paging, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .probe_finalize = arm_smmu_probe_finalize, From cb26e55d27ee07f76f3a1c905477d27eb4abe962 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 17 Apr 2024 06:37:26 -0700 Subject: [PATCH 135/352] iommu/arm-smmu-qcom-debug: Add support for TBUs Operating the TBUs (Translation Buffer Units) from Linux on Qualcomm platforms can help with debugging context faults. To help with that, the TBUs can run ATOS (Address Translation Operations) to manually trigger address translation of IOVA to physical address in hardware and provide more details when a context fault happens. The driver will control the resources needed by the TBU to allow running the debug operations such as ATOS, check for outstanding transactions, do snapshot capture etc. Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20240417133731.2055383-3-quic_c_gdjako@quicinc.com Signed-off-by: Will Deacon (cherry picked from commit 414ecb030870a31262e5fd29fd372ee73b32a6be) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/Kconfig | 12 +- .../iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 353 ++++++++++++++++++ drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h | 2 + drivers/iommu/arm/arm-smmu/arm-smmu.h | 2 + 4 files changed, 365 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index e9f6a5cb3400f..d62534abb96c2 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -379,10 +379,14 @@ config ARM_SMMU_QCOM_DEBUG depends on ARM_SMMU_QCOM help Support for implementation specific debug features in ARM SMMU - hardware found in QTI platforms. 
- - Say Y here to enable debug for issues such as TLB sync timeouts - which requires implementation defined register dumps. + hardware found in QTI platforms. This include support for + the Translation Buffer Units (TBU) that can be used to obtain + additional information when debugging memory management issues + like context faults. + + Say Y here to enable debug for issues such as context faults + or TLB sync timeouts which requires implementation defined + register dumps. config ARM_SMMU_V3 tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support" diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c index bb89d49adf8d2..eff7ca94ec8d0 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -1,15 +1,66 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (c) 2022 Qualcomm Innovation Center, Inc. All rights reserved. + * Copyright (c) 2024 Qualcomm Innovation Center, Inc. All rights reserved. */ +#include #include +#include #include +#include +#include +#include +#include +#include #include +#include #include "arm-smmu.h" #include "arm-smmu-qcom.h" +#define TBU_DBG_TIMEOUT_US 100 +#define DEBUG_AXUSER_REG 0x30 +#define DEBUG_AXUSER_CDMID GENMASK_ULL(43, 36) +#define DEBUG_AXUSER_CDMID_VAL 0xff +#define DEBUG_PAR_REG 0x28 +#define DEBUG_PAR_FAULT_VAL BIT(0) +#define DEBUG_PAR_PA GENMASK_ULL(47, 12) +#define DEBUG_SID_HALT_REG 0x0 +#define DEBUG_SID_HALT_VAL BIT(16) +#define DEBUG_SID_HALT_SID GENMASK(9, 0) +#define DEBUG_SR_HALT_ACK_REG 0x20 +#define DEBUG_SR_HALT_ACK_VAL BIT(1) +#define DEBUG_SR_ECATS_RUNNING_VAL BIT(0) +#define DEBUG_TXN_AXCACHE GENMASK(5, 2) +#define DEBUG_TXN_AXPROT GENMASK(8, 6) +#define DEBUG_TXN_AXPROT_PRIV 0x1 +#define DEBUG_TXN_AXPROT_NSEC 0x2 +#define DEBUG_TXN_TRIGG_REG 0x18 +#define DEBUG_TXN_TRIGGER BIT(0) +#define DEBUG_VA_ADDR_REG 0x8 + +static LIST_HEAD(tbu_list); +static DEFINE_MUTEX(tbu_list_lock); +static DEFINE_SPINLOCK(atos_lock); + +struct qcom_tbu { + struct device *dev; + struct device_node *smmu_np; + u32 sid_range[2]; + struct list_head list; + struct clk *clk; + struct icc_path *path; + void __iomem *base; + spinlock_t halt_lock; /* multiple halt or resume can't execute concurrently */ + int halt_count; +}; + +static struct qcom_smmu *to_qcom_smmu(struct arm_smmu_device *smmu) +{ + return container_of(smmu, struct qcom_smmu, smmu); +} + void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) { int ret; @@ -49,3 +100,305 @@ void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) tbu_pwr_status, sync_inv_ack, sync_inv_progress); } } + +static struct qcom_tbu *qcom_find_tbu(struct qcom_smmu *qsmmu, u32 sid) +{ + struct qcom_tbu *tbu; + u32 start, end; + + guard(mutex)(&tbu_list_lock); + + if (list_empty(&tbu_list)) + return NULL; + + list_for_each_entry(tbu, &tbu_list, list) { + start = tbu->sid_range[0]; + end = start + tbu->sid_range[1]; + + if (qsmmu->smmu.dev->of_node == tbu->smmu_np && + start <= sid && sid < end) + return tbu; + } + dev_err(qsmmu->smmu.dev, "Unable to find TBU for sid 0x%x\n", sid); + + return NULL; +} + +static int qcom_tbu_halt(struct qcom_tbu *tbu, struct arm_smmu_domain *smmu_domain) +{ + struct arm_smmu_device *smmu = smmu_domain->smmu; + int ret = 0, idx = smmu_domain->cfg.cbndx; + u32 val, fsr, status; + + guard(spinlock_irqsave)(&tbu->halt_lock); + if (tbu->halt_count) { + tbu->halt_count++; + return ret; + } + + val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val |= 
DEBUG_SID_HALT_VAL; + writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + + fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + if ((fsr & ARM_SMMU_FSR_FAULT) && (fsr & ARM_SMMU_FSR_SS)) { + u32 sctlr_orig, sctlr; + + /* + * We are in a fault. Our request to halt the bus will not + * complete until transactions in front of us (such as the fault + * itself) have completed. Disable iommu faults and terminate + * any existing transactions. + */ + sctlr_orig = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_SCTLR); + sctlr = sctlr_orig & ~(ARM_SMMU_SCTLR_CFCFG | ARM_SMMU_SCTLR_CFIE); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, ARM_SMMU_RESUME_TERMINATE); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr_orig); + } + + if (readl_poll_timeout_atomic(tbu->base + DEBUG_SR_HALT_ACK_REG, status, + (status & DEBUG_SR_HALT_ACK_VAL), + 0, TBU_DBG_TIMEOUT_US)) { + dev_err(tbu->dev, "Timeout while trying to halt TBU!\n"); + ret = -ETIMEDOUT; + + val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val &= ~DEBUG_SID_HALT_VAL; + writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + + return ret; + } + + tbu->halt_count = 1; + + return ret; +} + +static void qcom_tbu_resume(struct qcom_tbu *tbu) +{ + u32 val; + + guard(spinlock_irqsave)(&tbu->halt_lock); + if (!tbu->halt_count) { + WARN(1, "%s: halt_count is 0", dev_name(tbu->dev)); + return; + } + + if (tbu->halt_count > 1) { + tbu->halt_count--; + return; + } + + val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val &= ~DEBUG_SID_HALT_VAL; + writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + + tbu->halt_count = 0; +} + +static phys_addr_t qcom_tbu_trigger_atos(struct arm_smmu_domain *smmu_domain, + struct qcom_tbu *tbu, dma_addr_t iova, u32 sid) +{ + bool atos_timedout = false; + phys_addr_t phys = 0; + ktime_t timeout; + u64 val; + + /* Set address and stream-id */ + val = readq_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val &= ~DEBUG_SID_HALT_SID; + val |= FIELD_PREP(DEBUG_SID_HALT_SID, sid); + writeq_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + writeq_relaxed(iova, tbu->base + DEBUG_VA_ADDR_REG); + val = FIELD_PREP(DEBUG_AXUSER_CDMID, DEBUG_AXUSER_CDMID_VAL); + writeq_relaxed(val, tbu->base + DEBUG_AXUSER_REG); + + /* Write-back read and write-allocate */ + val = FIELD_PREP(DEBUG_TXN_AXCACHE, 0xf); + + /* Non-secure access */ + val |= FIELD_PREP(DEBUG_TXN_AXPROT, DEBUG_TXN_AXPROT_NSEC); + + /* Privileged access */ + val |= FIELD_PREP(DEBUG_TXN_AXPROT, DEBUG_TXN_AXPROT_PRIV); + + val |= DEBUG_TXN_TRIGGER; + writeq_relaxed(val, tbu->base + DEBUG_TXN_TRIGG_REG); + + timeout = ktime_add_us(ktime_get(), TBU_DBG_TIMEOUT_US); + for (;;) { + val = readl_relaxed(tbu->base + DEBUG_SR_HALT_ACK_REG); + if (!(val & DEBUG_SR_ECATS_RUNNING_VAL)) + break; + val = readl_relaxed(tbu->base + DEBUG_PAR_REG); + if (val & DEBUG_PAR_FAULT_VAL) + break; + if (ktime_compare(ktime_get(), timeout) > 0) { + atos_timedout = true; + break; + } + } + + val = readq_relaxed(tbu->base + DEBUG_PAR_REG); + if (val & DEBUG_PAR_FAULT_VAL) + dev_err(tbu->dev, "ATOS generated a fault interrupt! 
PAR = %llx, SID=0x%x\n", + val, sid); + else if (atos_timedout) + dev_err_ratelimited(tbu->dev, "ATOS translation timed out!\n"); + else + phys = FIELD_GET(DEBUG_PAR_PA, val); + + /* Reset hardware */ + writeq_relaxed(0, tbu->base + DEBUG_TXN_TRIGG_REG); + writeq_relaxed(0, tbu->base + DEBUG_VA_ADDR_REG); + val = readl_relaxed(tbu->base + DEBUG_SID_HALT_REG); + val &= ~DEBUG_SID_HALT_SID; + writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); + + return phys; +} + +static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain, + dma_addr_t iova, u32 sid) +{ + struct arm_smmu_device *smmu = smmu_domain->smmu; + struct qcom_smmu *qsmmu = to_qcom_smmu(smmu); + int idx = smmu_domain->cfg.cbndx; + struct qcom_tbu *tbu; + u32 sctlr_orig, sctlr; + phys_addr_t phys = 0; + int attempt = 0; + int ret; + u64 fsr; + + tbu = qcom_find_tbu(qsmmu, sid); + if (!tbu) + return 0; + + ret = icc_set_bw(tbu->path, 0, UINT_MAX); + if (ret) + return ret; + + ret = clk_prepare_enable(tbu->clk); + if (ret) + goto disable_icc; + + ret = qcom_tbu_halt(tbu, smmu_domain); + if (ret) + goto disable_clk; + + /* + * ATOS/ECATS can trigger the fault interrupt, so disable it temporarily + * and check for an interrupt manually. + */ + sctlr_orig = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_SCTLR); + sctlr = sctlr_orig & ~(ARM_SMMU_SCTLR_CFCFG | ARM_SMMU_SCTLR_CFIE); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr); + + fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + if (fsr & ARM_SMMU_FSR_FAULT) { + /* Clear pending interrupts */ + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + + /* + * TBU halt takes care of resuming any stalled transcation. + * Kept it here for completeness sake. + */ + if (fsr & ARM_SMMU_FSR_SS) + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, + ARM_SMMU_RESUME_TERMINATE); + } + + /* Only one concurrent atos operation */ + scoped_guard(spinlock_irqsave, &atos_lock) { + /* + * If the translation fails, attempt the lookup more time." 
+ */ + do { + phys = qcom_tbu_trigger_atos(smmu_domain, tbu, iova, sid); + + fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + if (fsr & ARM_SMMU_FSR_FAULT) { + /* Clear pending interrupts */ + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + + if (fsr & ARM_SMMU_FSR_SS) + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, + ARM_SMMU_RESUME_TERMINATE); + } + } while (!phys && attempt++ < 2); + + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr_orig); + } + qcom_tbu_resume(tbu); + + /* Read to complete prior write transcations */ + readl_relaxed(tbu->base + DEBUG_SR_HALT_ACK_REG); + +disable_clk: + clk_disable_unprepare(tbu->clk); +disable_icc: + icc_set_bw(tbu->path, 0, 0); + + return phys; +} + +static int qcom_tbu_probe(struct platform_device *pdev) +{ + struct of_phandle_args args = { .args_count = 2 }; + struct device_node *np = pdev->dev.of_node; + struct device *dev = &pdev->dev; + struct qcom_tbu *tbu; + + tbu = devm_kzalloc(dev, sizeof(*tbu), GFP_KERNEL); + if (!tbu) + return -ENOMEM; + + tbu->dev = dev; + INIT_LIST_HEAD(&tbu->list); + spin_lock_init(&tbu->halt_lock); + + if (of_parse_phandle_with_args(np, "qcom,stream-id-range", "#iommu-cells", 0, &args)) { + dev_err(dev, "Cannot parse the 'qcom,stream-id-range' DT property\n"); + return -EINVAL; + } + + tbu->smmu_np = args.np; + tbu->sid_range[0] = args.args[0]; + tbu->sid_range[1] = args.args[1]; + of_node_put(args.np); + + tbu->base = devm_of_iomap(dev, np, 0, NULL); + if (IS_ERR(tbu->base)) + return PTR_ERR(tbu->base); + + tbu->clk = devm_clk_get_optional(dev, NULL); + if (IS_ERR(tbu->clk)) + return PTR_ERR(tbu->clk); + + tbu->path = devm_of_icc_get(dev, NULL); + if (IS_ERR(tbu->path)) + return PTR_ERR(tbu->path); + + guard(mutex)(&tbu_list_lock); + list_add_tail(&tbu->list, &tbu_list); + + return 0; +} + +static const struct of_device_id qcom_tbu_of_match[] = { + { .compatible = "qcom,sc7280-tbu" }, + { .compatible = "qcom,sdm845-tbu" }, + { } +}; + +static struct platform_driver qcom_tbu_driver = { + .driver = { + .name = "qcom_tbu", + .of_match_table = qcom_tbu_of_match, + }, + .probe = qcom_tbu_probe, +}; +builtin_platform_driver(qcom_tbu_driver); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h index 593910567b884..9bb3ae7d62da6 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h @@ -30,6 +30,8 @@ struct qcom_smmu_match_data { const struct arm_smmu_impl *adreno_impl; }; +irqreturn_t qcom_smmu_context_fault(int irq, void *dev); + #ifdef CONFIG_ARM_SMMU_QCOM_DEBUG void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu); #else diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 836ed6799a801..1670e95c4637e 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -136,6 +136,7 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_CBAR_VMID GENMASK(7, 0) #define ARM_SMMU_GR1_CBFRSYNRA(n) (0x400 + ((n) << 2)) +#define ARM_SMMU_CBFRSYNRA_SID GENMASK(15, 0) #define ARM_SMMU_GR1_CBA2R(n) (0x800 + ((n) << 2)) #define ARM_SMMU_CBA2R_VMID16 GENMASK(31, 16) @@ -238,6 +239,7 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_CB_ATSR 0x8f0 #define ARM_SMMU_ATSR_ACTIVE BIT(0) +#define ARM_SMMU_RESUME_TERMINATE BIT(0) /* Maximum number of context banks per SMMU */ #define ARM_SMMU_MAX_CBS 128 From 264ce6157bf8e7c04e0ac42a924f3fa569c9ad10 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 17 Apr 2024 06:37:27 -0700 Subject: [PATCH 136/352] 
iommu/arm-smmu: Allow using a threaded handler for context interrupts Threaded IRQ handlers run in a less critical context compared to normal IRQs, so they can perform more complex and time-consuming operations without causing significant delays in other parts of the kernel. During a context fault, the handler might need to do more processing and gather debug information from the TBUs. These operations may sleep, so add an option to use a threaded IRQ handler in these cases. Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20240417133731.2055383-4-quic_c_gdjako@quicinc.com Signed-off-by: Will Deacon (cherry picked from commit 960be6e10d4fcb18cfe863e0ceb3213a75eecb81) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 12 ++++++++++-- drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 + 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 5935f44e1d1d6..87c81f75cf844 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -806,8 +806,16 @@ static int arm_smmu_init_domain_context(struct arm_smmu_domain *smmu_domain, else context_fault = arm_smmu_context_fault; - ret = devm_request_irq(smmu->dev, irq, context_fault, IRQF_SHARED, - "arm-smmu-context-fault", smmu_domain); + if (smmu->impl && smmu->impl->context_fault_needs_threaded_irq) + ret = devm_request_threaded_irq(smmu->dev, irq, NULL, + context_fault, + IRQF_ONESHOT | IRQF_SHARED, + "arm-smmu-context-fault", + smmu_domain); + else + ret = devm_request_irq(smmu->dev, irq, context_fault, IRQF_SHARED, + "arm-smmu-context-fault", smmu_domain); + if (ret < 0) { dev_err(smmu->dev, "failed to request context IRQ %d (%u)\n", cfg->irptndx, irq); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 1670e95c4637e..4765c6945c344 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -438,6 +438,7 @@ struct arm_smmu_impl { int (*def_domain_type)(struct device *dev); irqreturn_t (*global_fault)(int irq, void *dev); irqreturn_t (*context_fault)(int irq, void *dev); + bool context_fault_needs_threaded_irq; int (*alloc_context_bank)(struct arm_smmu_domain *smmu_domain, struct arm_smmu_device *smmu, struct device *dev, int start); From 223359b46f57241a043d67d5e1b247cf80d224b1 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 17 Apr 2024 06:37:28 -0700 Subject: [PATCH 137/352] iommu/arm-smmu-qcom: Use a custom context fault handler for sdm845 The sdm845 platform now supports TBUs, so let's get additional debug info from the TBUs when a context fault occurs. Implement a custom context fault handler that does both software + hardware page table walks and TLB Invalidate All. Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20240417133731.2055383-5-quic_c_gdjako@quicinc.com Signed-off-by: Will Deacon (cherry picked from commit d374555ef993433f4d2e08b700dbd27788427d61) Signed-off-by: Matthew R.
Ochs --- .../iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 143 ++++++++++++++++++ drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 4 + 2 files changed, 147 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c index eff7ca94ec8d0..552199cbd9e25 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -345,6 +345,149 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain, return phys; } +static phys_addr_t qcom_smmu_iova_to_phys_hard(struct arm_smmu_domain *smmu_domain, dma_addr_t iova) +{ + struct arm_smmu_device *smmu = smmu_domain->smmu; + int idx = smmu_domain->cfg.cbndx; + u32 frsynra; + u16 sid; + + frsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); + sid = FIELD_GET(ARM_SMMU_CBFRSYNRA_SID, frsynra); + + return qcom_iova_to_phys(smmu_domain, iova, sid); +} + +static phys_addr_t qcom_smmu_verify_fault(struct arm_smmu_domain *smmu_domain, dma_addr_t iova, u32 fsr) +{ + struct io_pgtable *iop = io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops); + struct arm_smmu_device *smmu = smmu_domain->smmu; + phys_addr_t phys_post_tlbiall; + phys_addr_t phys; + + phys = qcom_smmu_iova_to_phys_hard(smmu_domain, iova); + io_pgtable_tlb_flush_all(iop); + phys_post_tlbiall = qcom_smmu_iova_to_phys_hard(smmu_domain, iova); + + if (phys != phys_post_tlbiall) { + dev_err(smmu->dev, + "ATOS results differed across TLBIALL... (before: %pa after: %pa)\n", + &phys, &phys_post_tlbiall); + } + + return (phys == 0 ? phys_post_tlbiall : phys); +} + +irqreturn_t qcom_smmu_context_fault(int irq, void *dev) +{ + struct arm_smmu_domain *smmu_domain = dev; + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + struct arm_smmu_device *smmu = smmu_domain->smmu; + u32 fsr, fsynr, cbfrsynra, resume = 0; + int idx = smmu_domain->cfg.cbndx; + phys_addr_t phys_soft; + unsigned long iova; + int ret, tmp; + + static DEFINE_RATELIMIT_STATE(_rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + if (!(fsr & ARM_SMMU_FSR_FAULT)) + return IRQ_NONE; + + fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); + iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); + cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); + + if (list_empty(&tbu_list)) { + ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, + fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + + if (ret == -ENOSYS) + dev_err_ratelimited(smmu->dev, + "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", + fsr, iova, fsynr, cbfrsynra, idx); + + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + return IRQ_HANDLED; + } + + phys_soft = ops->iova_to_phys(ops, iova); + + tmp = report_iommu_fault(&smmu_domain->domain, NULL, iova, + fsynr & ARM_SMMU_FSYNR0_WNR ? 
IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + if (!tmp || tmp == -EBUSY) { + dev_dbg(smmu->dev, + "Context fault handled by client: iova=0x%08lx, fsr=0x%x, fsynr=0x%x, cb=%d\n", + iova, fsr, fsynr, idx); + dev_dbg(smmu->dev, "soft iova-to-phys=%pa\n", &phys_soft); + ret = IRQ_HANDLED; + resume = ARM_SMMU_RESUME_TERMINATE; + } else { + phys_addr_t phys_atos = qcom_smmu_verify_fault(smmu_domain, iova, fsr); + + if (__ratelimit(&_rs)) { + dev_err(smmu->dev, + "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", + fsr, iova, fsynr, cbfrsynra, idx); + dev_err(smmu->dev, + "FSR = %08x [%s%s%s%s%s%s%s%s%s], SID=0x%x\n", + fsr, + (fsr & 0x02) ? "TF " : "", + (fsr & 0x04) ? "AFF " : "", + (fsr & 0x08) ? "PF " : "", + (fsr & 0x10) ? "EF " : "", + (fsr & 0x20) ? "TLBMCF " : "", + (fsr & 0x40) ? "TLBLKF " : "", + (fsr & 0x80) ? "MHF " : "", + (fsr & 0x40000000) ? "SS " : "", + (fsr & 0x80000000) ? "MULTI " : "", + cbfrsynra); + + dev_err(smmu->dev, + "soft iova-to-phys=%pa\n", &phys_soft); + if (!phys_soft) + dev_err(smmu->dev, + "SOFTWARE TABLE WALK FAILED! Looks like %s accessed an unmapped address!\n", + dev_name(smmu->dev)); + if (phys_atos) + dev_err(smmu->dev, "hard iova-to-phys (ATOS)=%pa\n", + &phys_atos); + else + dev_err(smmu->dev, "hard iova-to-phys (ATOS) failed\n"); + } + ret = IRQ_NONE; + resume = ARM_SMMU_RESUME_TERMINATE; + } + + /* + * If the client returns -EBUSY, do not clear FSR and do not RESUME + * if stalled. This is required to keep the IOMMU client stalled on + * the outstanding fault. This gives the client a chance to take any + * debug action and then terminate the stalled transaction. + * So, the sequence in case of stall on fault should be: + * 1) Do not clear FSR or write to RESUME here + * 2) Client takes any debug action + * 3) Client terminates the stalled transaction and resumes the IOMMU + * 4) Client clears FSR. The FSR should only be cleared after 3) and + * not before so that the fault remains outstanding. This ensures + * SCTLR.HUPCF has the desired effect if subsequent transactions also + * need to be terminated. + */ + if (tmp != -EBUSY) { + /* Clear the faulting FSR */ + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + + /* Retry or terminate any stalled transactions */ + if (fsr & ARM_SMMU_FSR_SS) + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, resume); + } + + return ret; +} + static int qcom_tbu_probe(struct platform_device *pdev) { struct of_phandle_args args = { .args_count = 2 }; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 5c7cfc51b57c0..a901230dbabd1 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -422,6 +422,10 @@ static const struct arm_smmu_impl sdm845_smmu_500_impl = { .reset = qcom_sdm845_smmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, .tlb_sync = qcom_smmu_tlb_sync, +#ifdef CONFIG_ARM_SMMU_QCOM_DEBUG + .context_fault = qcom_smmu_context_fault, + .context_fault_needs_threaded_irq = true, +#endif }; static const struct arm_smmu_impl qcom_adreno_smmu_v2_impl = { From 184f463a3de50b87c3014613743bf321b23b7292 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 17 Apr 2024 06:37:30 -0700 Subject: [PATCH 138/352] iommu/arm-smmu-qcom: Use the custom fault handler on more platforms The TBU support is now available, so let's allow it to be used on other platforms that have the Qualcomm SMMU-500 implementation with TBUs. 
This will allow the context fault handler to query the TBUs when a context fault occurs. Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20240417133731.2055383-7-quic_c_gdjako@quicinc.com Signed-off-by: Will Deacon (cherry picked from commit b8ca7ce709f8210c13eec022e87a12111db5d745) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index a901230dbabd1..25f034677f568 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -413,6 +413,10 @@ static const struct arm_smmu_impl qcom_smmu_500_impl = { .reset = arm_mmu500_reset, .write_s2cr = qcom_smmu_write_s2cr, .tlb_sync = qcom_smmu_tlb_sync, +#ifdef CONFIG_ARM_SMMU_QCOM_DEBUG + .context_fault = qcom_smmu_context_fault, + .context_fault_needs_threaded_irq = true, +#endif }; static const struct arm_smmu_impl sdm845_smmu_500_impl = { From 99ed722c65a2bd1d14f47629fceb5f08634b3241 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 18 Apr 2024 10:33:59 +0000 Subject: [PATCH 139/352] iommu: Add ops->domain_alloc_sva() Make a new op that receives the device and the mm_struct that the SVA domain should be created for. Unlike domain_alloc_paging() the dev argument is never NULL here. This allows drivers to fully initialize the SVA domain and allocate the mmu_notifier during allocation. It allows the notifier lifetime to follow the lifetime of the iommu_domain. Since we have only one call site, upgrade the new op to return ERR_PTR instead of NULL. Signed-off-by: Jason Gunthorpe [Removed smmu3 related changes - Vasant] Signed-off-by: Vasant Hegde Reviewed-by: Tina Zhang Link: https://lore.kernel.org/r/20240418103400.6229-15-vasant.hegde@amd.com Signed-off-by: Joerg Roedel (cherry picked from commit 80af5a45202422db957549a241e00bf4d4e0ce89) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommu-sva.c | 16 +++++++++++----- include/linux/iommu.h | 3 +++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 640acc804e8cd..18a35e798b729 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -108,8 +108,8 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm /* Allocate a new domain and set it on device pasid. */ domain = iommu_sva_domain_alloc(dev, mm); - if (!domain) { - ret = -ENOMEM; + if (IS_ERR(domain)) { + ret = PTR_ERR(domain); goto out_free_handle; } @@ -283,9 +283,15 @@ struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; - domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); - if (!domain) - return NULL; + if (ops->domain_alloc_sva) { + domain = ops->domain_alloc_sva(dev, mm); + if (IS_ERR(domain)) + return domain; + } else { + domain = ops->domain_alloc(IOMMU_DOMAIN_SVA); + if (!domain) + return ERR_PTR(-ENOMEM); + } domain->type = IOMMU_DOMAIN_SVA; mmgrab(mm); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 3efd51336ca1d..a44ff9e17b1b1 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -518,6 +518,7 @@ static inline int __iommu_copy_struct_from_user_array( * Upon failure, ERR_PTR must be returned. 
* @domain_alloc_paging: Allocate an iommu_domain that can be used for * UNMANAGED, DMA, and DMA_FQ domain types. + * @domain_alloc_sva: Allocate an iommu_domain for Shared Virtual Addressing. * @probe_device: Add device to iommu driver handling * @release_device: Remove device from iommu driver handling * @probe_finalize: Do final setup work after the device is added to an IOMMU @@ -558,6 +559,8 @@ struct iommu_ops { struct device *dev, u32 flags, struct iommu_domain *parent, const struct iommu_user_data *user_data); struct iommu_domain *(*domain_alloc_paging)(struct device *dev); + struct iommu_domain *(*domain_alloc_sva)(struct device *dev, + struct mm_struct *mm); struct iommu_device *(*probe_device)(struct device *dev); void (*release_device)(struct device *dev); From b83bd5e2d9d8edab54583b1f7ba79d95d0a6bf2e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 1 May 2024 15:27:29 +0100 Subject: [PATCH 140/352] iommu/arm-smmu-qcom: Don't build debug features as a kernel module The Qualcomm TBU debug support introduced by 414ecb030870 ("iommu/arm-smmu-qcom-debug: Add support for TBUs") provides its own driver initialisation function, which breaks the link when the core SMMU driver is built as a module: ld.lld: error: duplicate symbol: init_module >>> defined at arm-smmu.c >>> drivers/iommu/arm/arm-smmu/arm-smmu.o:(init_module) >>> defined at arm-smmu-qcom-debug.c >>> drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.o:(.init.text+0x4) Since we're late in the cycle, just make the debug features depend on a non-modular SMMU driver for now while the initialisation is reworked to hang off qcom_smmu_impl_init(). Signed-off-by: Will Deacon (cherry picked from commit 0928fc15f31553c7acb8117b0609799fc0f22fa5) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index d62534abb96c2..84921c49a05d3 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -376,7 +376,7 @@ config ARM_SMMU_QCOM config ARM_SMMU_QCOM_DEBUG bool "ARM SMMU QCOM implementation defined debug support" - depends on ARM_SMMU_QCOM + depends on ARM_SMMU_QCOM=y help Support for implementation specific debug features in ARM SMMU hardware found in QTI platforms. This include support for From a5e46f31eba61739f2abbad9d45cb7ec3ec85053 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Apr 2024 14:21:33 -0300 Subject: [PATCH 141/352] iommu/arm-smmu-v3: Add an ops indirection to the STE code Prepare to put the CD code into the same mechanism. Add an ops indirection around all the STE specific code and make the worker functions independent of the entry content being processed. get_used and sync ops are provided to hook the correct code. Signed-off-by: Michael Shavit Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v9-5040dc602008+177d7-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit de31c355541286aa4c938c982dfcafbf062fcb93) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 176 ++++++++++++-------- 1 file changed, 104 insertions(+), 72 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 3683cd25c4628..8b85b97cefcea 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -42,8 +42,19 @@ enum arm_smmu_msi_index { ARM_SMMU_MAX_MSIS, }; -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, - ioasid_t sid); +struct arm_smmu_entry_writer_ops; +struct arm_smmu_entry_writer { + const struct arm_smmu_entry_writer_ops *ops; + struct arm_smmu_master *master; +}; + +struct arm_smmu_entry_writer_ops { + void (*get_used)(const __le64 *entry, __le64 *used); + void (*sync)(struct arm_smmu_entry_writer *writer); +}; + +#define NUM_ENTRY_QWORDS 8 +static_assert(sizeof(struct arm_smmu_ste) == NUM_ENTRY_QWORDS * sizeof(u64)); static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = { [EVTQ_MSI_INDEX] = { @@ -972,43 +983,42 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) * would be nice if this was complete according to the spec, but minimally it * has to capture the bits this driver uses. */ -static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent, - struct arm_smmu_ste *used_bits) +static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) { - unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent->data[0])); + unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])); - used_bits->data[0] = cpu_to_le64(STRTAB_STE_0_V); - if (!(ent->data[0] & cpu_to_le64(STRTAB_STE_0_V))) + used_bits[0] = cpu_to_le64(STRTAB_STE_0_V); + if (!(ent[0] & cpu_to_le64(STRTAB_STE_0_V))) return; - used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_CFG); + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_CFG); /* S1 translates */ if (cfg & BIT(0)) { - used_bits->data[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT | - STRTAB_STE_0_S1CTXPTR_MASK | - STRTAB_STE_0_S1CDMAX); - used_bits->data[1] |= + used_bits[0] |= cpu_to_le64(STRTAB_STE_0_S1FMT | + STRTAB_STE_0_S1CTXPTR_MASK | + STRTAB_STE_0_S1CDMAX); + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW | STRTAB_STE_1_EATS); - used_bits->data[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); + used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); } /* S2 translates */ if (cfg & BIT(1)) { - used_bits->data[1] |= + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_EATS | STRTAB_STE_1_SHCFG); - used_bits->data[2] |= + used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID | STRTAB_STE_2_VTCR | STRTAB_STE_2_S2AA64 | STRTAB_STE_2_S2ENDI | STRTAB_STE_2_S2PTW | STRTAB_STE_2_S2R); - used_bits->data[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK); + used_bits[3] |= cpu_to_le64(STRTAB_STE_3_S2TTB_MASK); } if (cfg == STRTAB_STE_0_CFG_BYPASS) - used_bits->data[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); } /* @@ -1017,57 +1027,55 @@ static void arm_smmu_get_ste_used(const struct arm_smmu_ste *ent, * unused_update is an intermediate value of entry that has unused bits set to * their new values. 
*/ -static u8 arm_smmu_entry_qword_diff(const struct arm_smmu_ste *entry, - const struct arm_smmu_ste *target, - struct arm_smmu_ste *unused_update) +static u8 arm_smmu_entry_qword_diff(struct arm_smmu_entry_writer *writer, + const __le64 *entry, const __le64 *target, + __le64 *unused_update) { - struct arm_smmu_ste target_used = {}; - struct arm_smmu_ste cur_used = {}; + __le64 target_used[NUM_ENTRY_QWORDS] = {}; + __le64 cur_used[NUM_ENTRY_QWORDS] = {}; u8 used_qword_diff = 0; unsigned int i; - arm_smmu_get_ste_used(entry, &cur_used); - arm_smmu_get_ste_used(target, &target_used); + writer->ops->get_used(entry, cur_used); + writer->ops->get_used(target, target_used); - for (i = 0; i != ARRAY_SIZE(target_used.data); i++) { + for (i = 0; i != NUM_ENTRY_QWORDS; i++) { /* * Check that masks are up to date, the make functions are not * allowed to set a bit to 1 if the used function doesn't say it * is used. */ - WARN_ON_ONCE(target->data[i] & ~target_used.data[i]); + WARN_ON_ONCE(target[i] & ~target_used[i]); /* Bits can change because they are not currently being used */ - unused_update->data[i] = (entry->data[i] & cur_used.data[i]) | - (target->data[i] & ~cur_used.data[i]); + unused_update[i] = (entry[i] & cur_used[i]) | + (target[i] & ~cur_used[i]); /* * Each bit indicates that a used bit in a qword needs to be * changed after unused_update is applied. */ - if ((unused_update->data[i] & target_used.data[i]) != - target->data[i]) + if ((unused_update[i] & target_used[i]) != target[i]) used_qword_diff |= 1 << i; } return used_qword_diff; } -static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid, - struct arm_smmu_ste *entry, - const struct arm_smmu_ste *target, unsigned int start, +static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry, + const __le64 *target, unsigned int start, unsigned int len) { bool changed = false; unsigned int i; for (i = start; len != 0; len--, i++) { - if (entry->data[i] != target->data[i]) { - WRITE_ONCE(entry->data[i], target->data[i]); + if (entry[i] != target[i]) { + WRITE_ONCE(entry[i], target[i]); changed = true; } } if (changed) - arm_smmu_sync_ste_for_sid(smmu, sid); + writer->ops->sync(writer); return changed; } @@ -1097,24 +1105,21 @@ static bool entry_set(struct arm_smmu_device *smmu, ioasid_t sid, * V=0 process. This relies on the IGNORED behavior described in the * specification. */ -static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, - struct arm_smmu_ste *entry, - const struct arm_smmu_ste *target) +static void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, + __le64 *entry, const __le64 *target) { - unsigned int num_entry_qwords = ARRAY_SIZE(target->data); - struct arm_smmu_device *smmu = master->smmu; - struct arm_smmu_ste unused_update; + __le64 unused_update[NUM_ENTRY_QWORDS]; u8 used_qword_diff; used_qword_diff = - arm_smmu_entry_qword_diff(entry, target, &unused_update); + arm_smmu_entry_qword_diff(writer, entry, target, unused_update); if (hweight8(used_qword_diff) == 1) { /* * Only one qword needs its used bits to be changed. This is a - * hitless update, update all bits the current STE is ignoring - * to their new values, then update a single "critical qword" to - * change the STE and finally 0 out any bits that are now unused - * in the target configuration. 
+ * hitless update, update all bits the current STE/CD is + * ignoring to their new values, then update a single "critical + * qword" to change the STE/CD and finally 0 out any bits that + * are now unused in the target configuration. */ unsigned int critical_qword_index = ffs(used_qword_diff) - 1; @@ -1123,22 +1128,21 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, * writing it in the next step anyways. This can save a sync * when the only change is in that qword. */ - unused_update.data[critical_qword_index] = - entry->data[critical_qword_index]; - entry_set(smmu, sid, entry, &unused_update, 0, num_entry_qwords); - entry_set(smmu, sid, entry, target, critical_qword_index, 1); - entry_set(smmu, sid, entry, target, 0, num_entry_qwords); + unused_update[critical_qword_index] = + entry[critical_qword_index]; + entry_set(writer, entry, unused_update, 0, NUM_ENTRY_QWORDS); + entry_set(writer, entry, target, critical_qword_index, 1); + entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS); } else if (used_qword_diff) { /* * At least two qwords need their inuse bits to be changed. This * requires a breaking update, zero the V bit, write all qwords * but 0, then set qword 0 */ - unused_update.data[0] = entry->data[0] & - cpu_to_le64(~STRTAB_STE_0_V); - entry_set(smmu, sid, entry, &unused_update, 0, 1); - entry_set(smmu, sid, entry, target, 1, num_entry_qwords - 1); - entry_set(smmu, sid, entry, target, 0, 1); + unused_update[0] = 0; + entry_set(writer, entry, unused_update, 0, 1); + entry_set(writer, entry, target, 1, NUM_ENTRY_QWORDS - 1); + entry_set(writer, entry, target, 0, 1); } else { /* * No inuse bit changed. Sanity check that all unused bits are 0 @@ -1146,18 +1150,7 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, * compute_qword_diff(). 
*/ WARN_ON_ONCE( - entry_set(smmu, sid, entry, target, 0, num_entry_qwords)); - } - - /* It's likely that we'll want to use the new STE soon */ - if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) { - struct arm_smmu_cmdq_ent - prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG, - .prefetch = { - .sid = sid, - } }; - - arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); + entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS)); } } @@ -1430,17 +1423,56 @@ arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc) WRITE_ONCE(*dst, cpu_to_le64(val)); } -static void arm_smmu_sync_ste_for_sid(struct arm_smmu_device *smmu, u32 sid) +struct arm_smmu_ste_writer { + struct arm_smmu_entry_writer writer; + u32 sid; +}; + +static void arm_smmu_ste_writer_sync_entry(struct arm_smmu_entry_writer *writer) { + struct arm_smmu_ste_writer *ste_writer = + container_of(writer, struct arm_smmu_ste_writer, writer); struct arm_smmu_cmdq_ent cmd = { .opcode = CMDQ_OP_CFGI_STE, .cfgi = { - .sid = sid, + .sid = ste_writer->sid, .leaf = true, }, }; - arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); + arm_smmu_cmdq_issue_cmd_with_sync(writer->master->smmu, &cmd); +} + +static const struct arm_smmu_entry_writer_ops arm_smmu_ste_writer_ops = { + .sync = arm_smmu_ste_writer_sync_entry, + .get_used = arm_smmu_get_ste_used, +}; + +static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, + struct arm_smmu_ste *ste, + const struct arm_smmu_ste *target) +{ + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_ste_writer ste_writer = { + .writer = { + .ops = &arm_smmu_ste_writer_ops, + .master = master, + }, + .sid = sid, + }; + + arm_smmu_write_entry(&ste_writer.writer, ste->data, target->data); + + /* It's likely that we'll want to use the new STE soon */ + if (!(smmu->options & ARM_SMMU_OPT_SKIP_PREFETCH)) { + struct arm_smmu_cmdq_ent + prefetch_cmd = { .opcode = CMDQ_OP_PREFETCH_CFG, + .prefetch = { + .sid = sid, + } }; + + arm_smmu_cmdq_issue_cmd(smmu, &prefetch_cmd); + } } static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) From 98a053821ca1988aef4991a2467bb513becadf09 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Apr 2024 14:21:34 -0300 Subject: [PATCH 142/352] iommu/arm-smmu-v3: Make CD programming use arm_smmu_write_entry() CD table entries and STE's have the same essential programming sequence, just with different types. Use the new ops indirection to link CD programming to the common writer. In a few more patches all CD writers will call an appropriate make function and then directly call arm_smmu_write_cd_entry(). arm_smmu_write_ctx_desc() will be removed. Until then lightly tweak arm_smmu_write_ctx_desc() to also use the new programmer by using the same logic as right now to build the target CD on the stack, sanitizing it to meet the used rules, and then using the writer. Sanitizing is necessary because the writer expects that the currently programmed CD follows the used rules. Next patches add new make functions and new direct calls to arm_smmu_write_cd_entry() which will require this. Signed-off-by: Michael Shavit Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Moritz Fischer Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v9-5040dc602008+177d7-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 78a5fbe8395b365d58142ff9b7a6aeb556481a1f) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 89 ++++++++++++++++----- 1 file changed, 67 insertions(+), 22 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8b85b97cefcea..e7a896d55cc89 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -55,6 +55,7 @@ struct arm_smmu_entry_writer_ops { #define NUM_ENTRY_QWORDS 8 static_assert(sizeof(struct arm_smmu_ste) == NUM_ENTRY_QWORDS * sizeof(u64)); +static_assert(sizeof(struct arm_smmu_cd) == NUM_ENTRY_QWORDS * sizeof(u64)); static phys_addr_t arm_smmu_msi_cfg[ARM_SMMU_MAX_MSIS][3] = { [EVTQ_MSI_INDEX] = { @@ -1230,6 +1231,59 @@ static struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, return &l1_desc->l2ptr[idx]; } +struct arm_smmu_cd_writer { + struct arm_smmu_entry_writer writer; + unsigned int ssid; +}; + +static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) +{ + used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); + if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) + return; + memset(used_bits, 0xFF, sizeof(struct arm_smmu_cd)); + + /* + * If EPD0 is set by the make function it means + * T0SZ/TG0/IR0/OR0/SH0/TTB0 are IGNORED + */ + if (ent[0] & cpu_to_le64(CTXDESC_CD_0_TCR_EPD0)) { + used_bits[0] &= ~cpu_to_le64( + CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | + CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | + CTXDESC_CD_0_TCR_SH0); + used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); + } +} + +static void arm_smmu_cd_writer_sync_entry(struct arm_smmu_entry_writer *writer) +{ + struct arm_smmu_cd_writer *cd_writer = + container_of(writer, struct arm_smmu_cd_writer, writer); + + arm_smmu_sync_cd(writer->master, cd_writer->ssid, true); +} + +static const struct arm_smmu_entry_writer_ops arm_smmu_cd_writer_ops = { + .sync = arm_smmu_cd_writer_sync_entry, + .get_used = arm_smmu_get_cd_used, +}; + +static void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, + struct arm_smmu_cd *cdptr, + const struct arm_smmu_cd *target) +{ + struct arm_smmu_cd_writer cd_writer = { + .writer = { + .ops = &arm_smmu_cd_writer_ops, + .master = master, + }, + .ssid = ssid, + }; + + arm_smmu_write_entry(&cd_writer.writer, cdptr->data, target->data); +} + int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, struct arm_smmu_ctx_desc *cd) { @@ -1246,26 +1300,34 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, */ u64 val; bool cd_live; - struct arm_smmu_cd *cdptr; + struct arm_smmu_cd target; + struct arm_smmu_cd *cdptr = ⌖ + struct arm_smmu_cd *cd_table_entry; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; if (WARN_ON(ssid >= (1 << cd_table->s1cdmax))) return -E2BIG; - cdptr = arm_smmu_get_cd_ptr(master, ssid); - if (!cdptr) + cd_table_entry = arm_smmu_get_cd_ptr(master, ssid); + if (!cd_table_entry) return -ENOMEM; + target = *cd_table_entry; val = le64_to_cpu(cdptr->data[0]); cd_live = !!(val & CTXDESC_CD_0_V); if (!cd) { /* (5) */ + memset(cdptr, 0, sizeof(*cdptr)); val = 0; } else if (cd == &quiet_cd) { /* (4) */ + val &= ~(CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | + CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | + CTXDESC_CD_0_TCR_SH0); if (!(smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) val &= ~(CTXDESC_CD_0_S | CTXDESC_CD_0_R); val |= CTXDESC_CD_0_TCR_EPD0; + cdptr->data[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); } else if (cd_live) { /* (3) */ val &= ~CTXDESC_CD_0_ASID; val 
|= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid); @@ -1278,13 +1340,6 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, cdptr->data[2] = 0; cdptr->data[3] = cpu_to_le64(cd->mair); - /* - * STE may be live, and the SMMU might read dwords of this CD in any - * order. Ensure that it observes valid values before reading - * V=1. - */ - arm_smmu_sync_cd(master, ssid, true); - val = cd->tcr | #ifdef __BIG_ENDIAN CTXDESC_CD_0_ENDI | @@ -1298,18 +1353,8 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, if (cd_table->stall_enabled) val |= CTXDESC_CD_0_S; } - - /* - * The SMMU accesses 64-bit values atomically. See IHI0070Ca 3.21.3 - * "Configuration structures and configuration invalidation completion" - * - * The size of single-copy atomic reads made by the SMMU is - * IMPLEMENTATION DEFINED but must be at least 64 bits. Any single - * field within an aligned 64-bit span of a structure can be altered - * without first making the structure invalid. - */ - WRITE_ONCE(cdptr->data[0], cpu_to_le64(val)); - arm_smmu_sync_cd(master, ssid, true); + cdptr->data[0] = cpu_to_le64(val); + arm_smmu_write_cd_entry(master, ssid, cd_table_entry, &target); return 0; } From 0d6825ca72752f85b8d75f8c764bea02f30b0d96 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Apr 2024 14:21:35 -0300 Subject: [PATCH 143/352] iommu/arm-smmu-v3: Move the CD generation for S1 domains into a function Introduce arm_smmu_make_s1_cd() to build the CD from the paging S1 domain, and reorganize all the places programming S1 domain CD table entries to call it. Split arm_smmu_update_s1_domain_cd_entry() from arm_smmu_update_ctx_desc_devices() so that the S1 path has its own call chain separate from the unrelated SVA path. arm_smmu_update_s1_domain_cd_entry() only works on S1 domains attached to RIDs and refreshes all their CDs. Remove case (3) from arm_smmu_write_ctx_desc() as it is now handled by directly calling arm_smmu_write_cd_entry(). Remove the forced clear of the CD during S1 domain attach, arm_smmu_write_cd_entry() will do this automatically if necessary. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v9-5040dc602008+177d7-smmuv3_newapi_p2_jgg@nvidia.com [will: Drop unused arm_smmu_clean_cd_entry() function] Signed-off-by: Will Deacon (cherry picked from commit e9d1e4ff74b96cf180d04be38541a245c8c574c1) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 25 ++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 71 +++++++++++-------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 9 +++ 3 files changed, 76 insertions(+), 29 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 41b44baef15e8..d159f60480935 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -53,6 +53,29 @@ static void arm_smmu_update_ctx_desc_devices(struct arm_smmu_domain *smmu_domain spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); } +static void +arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain) +{ + struct arm_smmu_master *master; + struct arm_smmu_cd target_cd; + unsigned long flags; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master, &smmu_domain->devices, domain_head) { + struct arm_smmu_cd *cdptr; + + /* S1 domains only support RID attachment right now */ + cdptr = arm_smmu_get_cd_ptr(master, IOMMU_NO_PASID); + if (WARN_ON(!cdptr)) + continue; + + arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); + arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr, + &target_cd); + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); +} + /* * Check if the CPU ASID is available on the SMMU side. If a private context * descriptor is using it, try to replace it. @@ -96,7 +119,7 @@ arm_smmu_share_asid(struct mm_struct *mm, u16 asid) * be some overlap between use of both ASIDs, until we invalidate the * TLB. */ - arm_smmu_update_ctx_desc_devices(smmu_domain, IOMMU_NO_PASID, cd); + arm_smmu_update_s1_domain_cd_entry(smmu_domain); /* Invalidate TLB entries previously associated with that context */ arm_smmu_tlb_inv_asid(smmu, asid); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e7a896d55cc89..553675cd98fcc 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1203,8 +1203,8 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst, WRITE_ONCE(*dst, cpu_to_le64(val)); } -static struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, - u32 ssid) +struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, + u32 ssid) { __le64 *l1ptr; unsigned int idx; @@ -1269,9 +1269,9 @@ static const struct arm_smmu_entry_writer_ops arm_smmu_cd_writer_ops = { .get_used = arm_smmu_get_cd_used, }; -static void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, - struct arm_smmu_cd *cdptr, - const struct arm_smmu_cd *target) +void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, + struct arm_smmu_cd *cdptr, + const struct arm_smmu_cd *target) { struct arm_smmu_cd_writer cd_writer = { .writer = { @@ -1284,6 +1284,32 @@ static void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, arm_smmu_write_entry(&cd_writer.writer, cdptr->data, target->data); } +void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain) +{ + struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; + + memset(target, 0, sizeof(*target)); + + target->data[0] = cpu_to_le64( + cd->tcr | +#ifdef __BIG_ENDIAN + CTXDESC_CD_0_ENDI | +#endif + CTXDESC_CD_0_V | + CTXDESC_CD_0_AA64 | + (master->stall_enabled ? 
CTXDESC_CD_0_S : 0) | + CTXDESC_CD_0_R | + CTXDESC_CD_0_A | + CTXDESC_CD_0_ASET | + FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) + ); + + target->data[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); + target->data[3] = cpu_to_le64(cd->mair); +} + int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, struct arm_smmu_ctx_desc *cd) { @@ -1292,14 +1318,11 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, * * (1) Install primary CD, for normal DMA traffic (SSID = IOMMU_NO_PASID = 0). * (2) Install a secondary CD, for SID+SSID traffic. - * (3) Update ASID of a CD. Atomically write the first 64 bits of the - * CD, then invalidate the old entry and mappings. * (4) Quiesce the context without clearing the valid bit. Disable * translation, and ignore any translation fault. * (5) Remove a secondary CD. */ u64 val; - bool cd_live; struct arm_smmu_cd target; struct arm_smmu_cd *cdptr = ⌖ struct arm_smmu_cd *cd_table_entry; @@ -1315,7 +1338,6 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, target = *cd_table_entry; val = le64_to_cpu(cdptr->data[0]); - cd_live = !!(val & CTXDESC_CD_0_V); if (!cd) { /* (5) */ memset(cdptr, 0, sizeof(*cdptr)); @@ -1328,13 +1350,6 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, val &= ~(CTXDESC_CD_0_S | CTXDESC_CD_0_R); val |= CTXDESC_CD_0_TCR_EPD0; cdptr->data[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); - } else if (cd_live) { /* (3) */ - val &= ~CTXDESC_CD_0_ASID; - val |= FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid); - /* - * Until CD+TLB invalidation, both ASIDs may be used for tagging - * this substream's traffic - */ } else { /* (1) and (2) */ cdptr->data[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); cdptr->data[2] = 0; @@ -2633,29 +2648,29 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); switch (smmu_domain->stage) { - case ARM_SMMU_DOMAIN_S1: + case ARM_SMMU_DOMAIN_S1: { + struct arm_smmu_cd target_cd; + struct arm_smmu_cd *cdptr; + if (!master->cd_table.cdtab) { ret = arm_smmu_alloc_cd_tables(master); if (ret) goto out_list_del; - } else { - /* - * arm_smmu_write_ctx_desc() relies on the entry being - * invalid to work, clear any existing entry. 
- */ - ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, - NULL); - if (ret) - goto out_list_del; } - ret = arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, &smmu_domain->cd); - if (ret) + cdptr = arm_smmu_get_cd_ptr(master, IOMMU_NO_PASID); + if (!cdptr) { + ret = -ENOMEM; goto out_list_del; + } + arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); + arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr, + &target_cd); arm_smmu_make_cdtable_ste(&target, master); arm_smmu_install_ste_for_dev(master, &target); break; + } case ARM_SMMU_DOMAIN_S2: arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); arm_smmu_install_ste_for_dev(master, &target); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 4b767e0eeeb68..bb08f087ba39e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -751,6 +751,15 @@ extern struct xarray arm_smmu_asid_xa; extern struct mutex arm_smmu_asid_lock; extern struct arm_smmu_ctx_desc quiet_cd; +struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, + u32 ssid); +void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain); +void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, + struct arm_smmu_cd *cdptr, + const struct arm_smmu_cd *target); + int arm_smmu_write_ctx_desc(struct arm_smmu_master *smmu_master, int ssid, struct arm_smmu_ctx_desc *cd); void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); From c2f13108dc544e04e836df4a90da66a7384b52e5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Apr 2024 14:21:36 -0300 Subject: [PATCH 144/352] iommu/arm-smmu-v3: Consolidate clearing a CD table entry A cleared entry is all zeros. Make arm_smmu_clear_cd() do this sequence. If we are clearing an entry and for some reason it is not already allocated in the CD table, then something has gone wrong. Remove case (5) from arm_smmu_write_ctx_desc(). Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Moritz Fischer Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v9-5040dc602008+177d7-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit af8f0b83ea2bcc7cd365c32044f31bdadc07c351) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 26 ++++++++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 + 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index d159f60480935..7cf286f7a009f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -569,7 +569,7 @@ void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, mutex_lock(&sva_lock); - arm_smmu_write_ctx_desc(master, id, NULL); + arm_smmu_clear_cd(master, id); list_for_each_entry(t, &master->bonds, list) { if (t->mm == mm) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 553675cd98fcc..610909ff5824b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1310,6 +1310,19 @@ void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, target->data[3] = cpu_to_le64(cd->mair); } +void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid) +{ + struct arm_smmu_cd target = {}; + struct arm_smmu_cd *cdptr; + + if (!master->cd_table.cdtab) + return; + cdptr = arm_smmu_get_cd_ptr(master, ssid); + if (WARN_ON(!cdptr)) + return; + arm_smmu_write_cd_entry(master, ssid, cdptr, &target); +} + int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, struct arm_smmu_ctx_desc *cd) { @@ -1320,7 +1333,6 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, * (2) Install a secondary CD, for SID+SSID traffic. * (4) Quiesce the context without clearing the valid bit. Disable * translation, and ignore any translation fault. - * (5) Remove a secondary CD. */ u64 val; struct arm_smmu_cd target; @@ -1339,10 +1351,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, target = *cd_table_entry; val = le64_to_cpu(cdptr->data[0]); - if (!cd) { /* (5) */ - memset(cdptr, 0, sizeof(*cdptr)); - val = 0; - } else if (cd == &quiet_cd) { /* (4) */ + if (cd == &quiet_cd) { /* (4) */ val &= ~(CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | CTXDESC_CD_0_TCR_SH0); @@ -2674,9 +2683,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) case ARM_SMMU_DOMAIN_S2: arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); arm_smmu_install_ste_for_dev(master, &target); - if (master->cd_table.cdtab) - arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, - NULL); + arm_smmu_clear_cd(master, IOMMU_NO_PASID); break; } @@ -2724,8 +2731,7 @@ static int arm_smmu_attach_dev_ste(struct device *dev, * arm_smmu_domain->devices to avoid races updating the same context * descriptor from arm_smmu_share_asid(). 
*/ - if (master->cd_table.cdtab) - arm_smmu_write_ctx_desc(master, IOMMU_NO_PASID, NULL); + arm_smmu_clear_cd(master, IOMMU_NO_PASID); return 0; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index bb08f087ba39e..99fd6f24caa81 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -751,6 +751,7 @@ extern struct xarray arm_smmu_asid_xa; extern struct mutex arm_smmu_asid_lock; extern struct arm_smmu_ctx_desc quiet_cd; +void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid); struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid); void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, From 04233e0eeb78dcd5ea11dfbfc7b49964139157da Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Apr 2024 14:21:37 -0300 Subject: [PATCH 145/352] iommu/arm-smmu-v3: Make arm_smmu_alloc_cd_ptr() Only the attach callers can perform an allocation for the CD table entry; the other callers must not do so, as they do not have the correct locking and they cannot sleep. Split up the functions so this is clear. arm_smmu_get_cd_ptr() will return a pointer to a CD table entry without doing any kind of allocation. arm_smmu_alloc_cd_ptr() will allocate the table and any required leaf. A following patch will add lockdep assertions to arm_smmu_alloc_cd_ptr() once the restructuring is completed and arm_smmu_alloc_cd_ptr() is never called in the wrong context. Tested-by: Nicolin Chen Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v9-5040dc602008+177d7-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit b2f4c0fcf094dacd2d1fb96a6fd6598919501589) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 59 +++++++++++++-------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 +- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 610909ff5824b..a7b31cddb42a5 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -97,6 +97,7 @@ static struct arm_smmu_option_prop arm_smmu_options[] = { static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, struct arm_smmu_device *smmu); +static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); static void parse_driver_options(struct arm_smmu_device *smmu) { @@ -1206,29 +1207,51 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst, struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) { - __le64 *l1ptr; - unsigned int idx; struct arm_smmu_l1_ctx_desc *l1_desc; - struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; + if (!cd_table->cdtab) + return NULL; + if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_LINEAR) return (struct arm_smmu_cd *)(cd_table->cdtab + ssid * CTXDESC_CD_DWORDS); - idx = ssid >> CTXDESC_SPLIT; - l1_desc = &cd_table->l1_desc[idx]; - if (!l1_desc->l2ptr) { - if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc)) + l1_desc = &cd_table->l1_desc[ssid / CTXDESC_L2_ENTRIES]; + if (!l1_desc->l2ptr) + return NULL; + return &l1_desc->l2ptr[ssid % CTXDESC_L2_ENTRIES]; +} + +static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, + u32 ssid) +{ + struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; + struct arm_smmu_device *smmu = master->smmu; + + if (!cd_table->cdtab) { + if (arm_smmu_alloc_cd_tables(master)) return NULL; + } + + if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_64K_L2) { + unsigned int idx = ssid / CTXDESC_L2_ENTRIES; + struct arm_smmu_l1_ctx_desc *l1_desc; + + l1_desc = &cd_table->l1_desc[idx]; + if (!l1_desc->l2ptr) { + __le64 *l1ptr; - l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS; - arm_smmu_write_cd_l1_desc(l1ptr, l1_desc); - /* An invalid L1CD can be cached */ - arm_smmu_sync_cd(master, ssid, false); + if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc)) + return NULL; + + l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS; + arm_smmu_write_cd_l1_desc(l1ptr, l1_desc); + /* An invalid L1CD can be cached */ + arm_smmu_sync_cd(master, ssid, false); + } } - idx = ssid & (CTXDESC_L2_ENTRIES - 1); - return &l1_desc->l2ptr[idx]; + return arm_smmu_get_cd_ptr(master, ssid); } struct arm_smmu_cd_writer { @@ -1344,7 +1367,7 @@ int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, if (WARN_ON(ssid >= (1 << cd_table->s1cdmax))) return -E2BIG; - cd_table_entry = arm_smmu_get_cd_ptr(master, ssid); + cd_table_entry = arm_smmu_alloc_cd_ptr(master, ssid); if (!cd_table_entry) return -ENOMEM; @@ -2661,13 +2684,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct arm_smmu_cd target_cd; struct arm_smmu_cd *cdptr; - if (!master->cd_table.cdtab) { - ret = arm_smmu_alloc_cd_tables(master); - if (ret) - goto out_list_del; - } - - cdptr = arm_smmu_get_cd_ptr(master, IOMMU_NO_PASID); + cdptr = arm_smmu_alloc_cd_ptr(master, IOMMU_NO_PASID); if (!cdptr) { ret = -ENOMEM; goto out_list_del; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 99fd6f24caa81..c5c55d3e28186 100644 --- 
a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -275,8 +275,7 @@ struct arm_smmu_ste { * 2lvl: at most 1024 L1 entries, * 1024 lazy entries per table. */ -#define CTXDESC_SPLIT 10 -#define CTXDESC_L2_ENTRIES (1 << CTXDESC_SPLIT) +#define CTXDESC_L2_ENTRIES 1024 #define CTXDESC_L1_DESC_DWORDS 1 #define CTXDESC_L1_DESC_V (1UL << 0) From e30832553ac0c39722ec52622a6534499c5d8a14 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Apr 2024 14:21:38 -0300 Subject: [PATCH 146/352] iommu/arm-smmu-v3: Allocate the CD table entry in advance Avoid arm_smmu_attach_dev() having to undo the changes to the smmu_domain->devices list; acquire the cdptr earlier so we don't need to handle that error. Now there is a clear break in arm_smmu_attach_dev() where all the prep-work has been done non-disruptively and we commit to making the HW change, which cannot fail. This completes transforming arm_smmu_attach_dev() so that it does not disturb the HW if it fails. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v9-5040dc602008+177d7-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 13abe4faac4348da0cf1c4eeb2b1b39fcfdb4b8f) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 24 +++++++-------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index a7b31cddb42a5..5bf027ee9a97a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2635,6 +2635,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master; + struct arm_smmu_cd *cdptr; if (!fwspec) return -ENOENT; @@ -2663,6 +2664,12 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) if (ret) return ret; + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { + cdptr = arm_smmu_alloc_cd_ptr(master, IOMMU_NO_PASID); + if (!cdptr) + return -ENOMEM; + } + /* * Prevent arm_smmu_share_asid() from trying to change the ASID * of either the old or new domain while we are working on it. 
@@ -2682,13 +2689,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: { struct arm_smmu_cd target_cd; - struct arm_smmu_cd *cdptr; - - cdptr = arm_smmu_alloc_cd_ptr(master, IOMMU_NO_PASID); - if (!cdptr) { - ret = -ENOMEM; - goto out_list_del; - } arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr, @@ -2705,16 +2705,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) } arm_smmu_enable_ats(master, smmu_domain); - goto out_unlock; - -out_list_del: - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_del_init(&master->domain_head); - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - -out_unlock: mutex_unlock(&arm_smmu_asid_lock); - return ret; + return 0; } static int arm_smmu_attach_dev_ste(struct device *dev, From 0d61fb11152528ab7b7401d592d71b98702940b9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Apr 2024 14:21:39 -0300 Subject: [PATCH 147/352] iommu/arm-smmu-v3: Move the CD generation for SVA into a function Pull all the calculations for building the CD table entry for an mm_struct into arm_smmu_make_sva_cd(). Call it in the two places installing the SVA CD table entry. Open code the last caller of arm_smmu_update_ctx_desc_devices() and remove the function. Remove arm_smmu_write_ctx_desc() since all callers are gone. Add the locking assertions to arm_smmu_alloc_cd_ptr() since arm_smmu_update_ctx_desc_devices() was the last problematic caller. Remove quiet_cd since all users are gone; arm_smmu_make_sva_cd() creates the same value. The behavior of quiet_cd changes slightly: the old implementation edited the CD in place to set CTXDESC_CD_0_TCR_EPD0 assuming it was an SVA CD entry. This version generates a full CD entry with a 0 TTB0 and relies on arm_smmu_write_cd_entry() to install it hitlessly. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v9-5040dc602008+177d7-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 7b87c93c8b86d9d9b9567d83f0ca3d3046fdfc5a) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 155 +++++++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 77 +-------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 +- 3 files changed, 107 insertions(+), 132 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 7cf286f7a009f..8730a7043909e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -34,25 +34,6 @@ struct arm_smmu_bond { static DEFINE_MUTEX(sva_lock); -/* - * Write the CD to the CD tables for all masters that this domain is attached - * to. Note that this is only used to update existing CD entries in the target - * CD table, for which it's assumed that arm_smmu_write_ctx_desc can't fail. 
- */ -static void arm_smmu_update_ctx_desc_devices(struct arm_smmu_domain *smmu_domain, - int ssid, - struct arm_smmu_ctx_desc *cd) -{ - struct arm_smmu_master *master; - unsigned long flags; - - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { - arm_smmu_write_ctx_desc(master, ssid, cd); - } - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); -} - static void arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain) { @@ -128,11 +109,85 @@ arm_smmu_share_asid(struct mm_struct *mm, u16 asid) return NULL; } +static u64 page_size_to_cd(void) +{ + static_assert(PAGE_SIZE == SZ_4K || PAGE_SIZE == SZ_16K || + PAGE_SIZE == SZ_64K); + if (PAGE_SIZE == SZ_64K) + return ARM_LPAE_TCR_TG0_64K; + if (PAGE_SIZE == SZ_16K) + return ARM_LPAE_TCR_TG0_16K; + return ARM_LPAE_TCR_TG0_4K; +} + +static void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, + struct arm_smmu_master *master, + struct mm_struct *mm, u16 asid) +{ + u64 par; + + memset(target, 0, sizeof(*target)); + + par = cpuid_feature_extract_unsigned_field( + read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1), + ID_AA64MMFR0_EL1_PARANGE_SHIFT); + + target->data[0] = cpu_to_le64( + CTXDESC_CD_0_TCR_EPD1 | +#ifdef __BIG_ENDIAN + CTXDESC_CD_0_ENDI | +#endif + CTXDESC_CD_0_V | + FIELD_PREP(CTXDESC_CD_0_TCR_IPS, par) | + CTXDESC_CD_0_AA64 | + (master->stall_enabled ? CTXDESC_CD_0_S : 0) | + CTXDESC_CD_0_R | + CTXDESC_CD_0_A | + CTXDESC_CD_0_ASET | + FIELD_PREP(CTXDESC_CD_0_ASID, asid)); + + /* + * If no MM is passed then this creates a SVA entry that faults + * everything. arm_smmu_write_cd_entry() can hitlessly go between these + * two entries types since TTB0 is ignored by HW when EPD0 is set. + */ + if (mm) { + target->data[0] |= cpu_to_le64( + FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, + 64ULL - vabits_actual) | + FIELD_PREP(CTXDESC_CD_0_TCR_TG0, page_size_to_cd()) | + FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, + ARM_LPAE_TCR_RGN_WBWA) | + FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, + ARM_LPAE_TCR_RGN_WBWA) | + FIELD_PREP(CTXDESC_CD_0_TCR_SH0, ARM_LPAE_TCR_SH_IS)); + + target->data[1] = cpu_to_le64(virt_to_phys(mm->pgd) & + CTXDESC_CD_1_TTB0_MASK); + } else { + target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_EPD0); + + /* + * Disable stall and immediately generate an abort if stall + * disable is permitted. This speeds up cleanup for an unclean + * exit if the device is still doing a lot of DMA. 
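+ * Either way the CD itself stays valid (V=1), so no C_BAD_CD events are generated while the address space is torn down.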
+ */ + if (!(master->smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) + target->data[0] &= + cpu_to_le64(~(CTXDESC_CD_0_S | CTXDESC_CD_0_R)); + } + + /* + * MAIR value is pretty much constant and global, so we can just get it + * from the current CPU register + */ + target->data[3] = cpu_to_le64(read_sysreg(mair_el1)); +} + static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm) { u16 asid; int err = 0; - u64 tcr, par, reg; struct arm_smmu_ctx_desc *cd; struct arm_smmu_ctx_desc *ret = NULL; @@ -166,39 +221,6 @@ static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm) if (err) goto out_free_asid; - tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, 64ULL - vabits_actual) | - FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, ARM_LPAE_TCR_RGN_WBWA) | - FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, ARM_LPAE_TCR_RGN_WBWA) | - FIELD_PREP(CTXDESC_CD_0_TCR_SH0, ARM_LPAE_TCR_SH_IS) | - CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64; - - switch (PAGE_SIZE) { - case SZ_4K: - tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_TG0, ARM_LPAE_TCR_TG0_4K); - break; - case SZ_16K: - tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_TG0, ARM_LPAE_TCR_TG0_16K); - break; - case SZ_64K: - tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_TG0, ARM_LPAE_TCR_TG0_64K); - break; - default: - WARN_ON(1); - err = -EINVAL; - goto out_free_asid; - } - - reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); - par = cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR0_EL1_PARANGE_SHIFT); - tcr |= FIELD_PREP(CTXDESC_CD_0_TCR_IPS, par); - - cd->ttbr = virt_to_phys(mm->pgd); - cd->tcr = tcr; - /* - * MAIR value is pretty much constant and global, so we can just get it - * from the current CPU register - */ - cd->mair = read_sysreg(mair_el1); cd->asid = asid; cd->mm = mm; @@ -276,6 +298,8 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); struct arm_smmu_domain *smmu_domain = smmu_mn->domain; + struct arm_smmu_master *master; + unsigned long flags; mutex_lock(&sva_lock); if (smmu_mn->cleared) { @@ -287,8 +311,19 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events, * but disable translation. 
*/ - arm_smmu_update_ctx_desc_devices(smmu_domain, mm_get_enqcmd_pasid(mm), - &quiet_cd); + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master, &smmu_domain->devices, domain_head) { + struct arm_smmu_cd target; + struct arm_smmu_cd *cdptr; + + cdptr = arm_smmu_get_cd_ptr(master, mm_get_enqcmd_pasid(mm)); + if (WARN_ON(!cdptr)) + continue; + arm_smmu_make_sva_cd(&target, master, NULL, smmu_mn->cd->asid); + arm_smmu_write_cd_entry(master, mm_get_enqcmd_pasid(mm), cdptr, + &target); + } + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid); arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); @@ -383,6 +418,8 @@ static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, struct mm_struct *mm) { int ret; + struct arm_smmu_cd target; + struct arm_smmu_cd *cdptr; struct arm_smmu_bond *bond; struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct iommu_domain *domain = iommu_get_domain_for_dev(dev); @@ -409,9 +446,13 @@ static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, goto err_free_bond; } - ret = arm_smmu_write_ctx_desc(master, pasid, bond->smmu_mn->cd); - if (ret) + cdptr = arm_smmu_alloc_cd_ptr(master, mm_get_enqcmd_pasid(mm)); + if (!cdptr) { + ret = -ENOMEM; goto err_put_notifier; + } + arm_smmu_make_sva_cd(&target, master, mm, bond->smmu_mn->cd->asid); + arm_smmu_write_cd_entry(master, pasid, cdptr, &target); list_add(&bond->list, &master->bonds); return 0; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 5bf027ee9a97a..cffaeceec4113 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -83,12 +83,6 @@ struct arm_smmu_option_prop { DEFINE_XARRAY_ALLOC1(arm_smmu_asid_xa); DEFINE_MUTEX(arm_smmu_asid_lock); -/* - * Special value used by SVA when a process dies, to quiesce a CD without - * disabling it. - */ -struct arm_smmu_ctx_desc quiet_cd = { 0 }; - static struct arm_smmu_option_prop arm_smmu_options[] = { { ARM_SMMU_OPT_SKIP_PREFETCH, "hisilicon,broken-prefetch-cmd" }, { ARM_SMMU_OPT_PAGE0_REGS_ONLY, "cavium,cn9900-broken-page1-regspace"}, @@ -1200,7 +1194,7 @@ static void arm_smmu_write_cd_l1_desc(__le64 *dst, u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | CTXDESC_L1_DESC_V; - /* See comment in arm_smmu_write_ctx_desc() */ + /* The HW has 64 bit atomicity with stores to the L2 CD table */ WRITE_ONCE(*dst, cpu_to_le64(val)); } @@ -1223,12 +1217,15 @@ struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, return &l1_desc->l2ptr[ssid % CTXDESC_L2_ENTRIES]; } -static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, - u32 ssid) +struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, + u32 ssid) { struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; + might_sleep(); + iommu_group_mutex_assert(master->dev); + if (!cd_table->cdtab) { if (arm_smmu_alloc_cd_tables(master)) return NULL; @@ -1346,65 +1343,6 @@ void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid) arm_smmu_write_cd_entry(master, ssid, cdptr, &target); } -int arm_smmu_write_ctx_desc(struct arm_smmu_master *master, int ssid, - struct arm_smmu_ctx_desc *cd) -{ - /* - * This function handles the following cases: - * - * (1) Install primary CD, for normal DMA traffic (SSID = IOMMU_NO_PASID = 0). 
- * (2) Install a secondary CD, for SID+SSID traffic. - * (4) Quiesce the context without clearing the valid bit. Disable - * translation, and ignore any translation fault. - */ - u64 val; - struct arm_smmu_cd target; - struct arm_smmu_cd *cdptr = ⌖ - struct arm_smmu_cd *cd_table_entry; - struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - struct arm_smmu_device *smmu = master->smmu; - - if (WARN_ON(ssid >= (1 << cd_table->s1cdmax))) - return -E2BIG; - - cd_table_entry = arm_smmu_alloc_cd_ptr(master, ssid); - if (!cd_table_entry) - return -ENOMEM; - - target = *cd_table_entry; - val = le64_to_cpu(cdptr->data[0]); - - if (cd == &quiet_cd) { /* (4) */ - val &= ~(CTXDESC_CD_0_TCR_T0SZ | CTXDESC_CD_0_TCR_TG0 | - CTXDESC_CD_0_TCR_IRGN0 | CTXDESC_CD_0_TCR_ORGN0 | - CTXDESC_CD_0_TCR_SH0); - if (!(smmu->features & ARM_SMMU_FEAT_STALL_FORCE)) - val &= ~(CTXDESC_CD_0_S | CTXDESC_CD_0_R); - val |= CTXDESC_CD_0_TCR_EPD0; - cdptr->data[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); - } else { /* (1) and (2) */ - cdptr->data[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); - cdptr->data[2] = 0; - cdptr->data[3] = cpu_to_le64(cd->mair); - - val = cd->tcr | -#ifdef __BIG_ENDIAN - CTXDESC_CD_0_ENDI | -#endif - CTXDESC_CD_0_R | CTXDESC_CD_0_A | - (cd->mm ? 0 : CTXDESC_CD_0_ASET) | - CTXDESC_CD_0_AA64 | - FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) | - CTXDESC_CD_0_V; - - if (cd_table->stall_enabled) - val |= CTXDESC_CD_0_S; - } - cdptr->data[0] = cpu_to_le64(val); - arm_smmu_write_cd_entry(master, ssid, cd_table_entry, &target); - return 0; -} - static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) { int ret; @@ -1413,7 +1351,6 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - cd_table->stall_enabled = master->stall_enabled; cd_table->s1cdmax = master->ssid_bits; max_contexts = 1 << cd_table->s1cdmax; @@ -1511,7 +1448,7 @@ arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc) val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span); val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK; - /* See comment in arm_smmu_write_ctx_desc() */ + /* The HW has 64 bit atomicity with stores to the L2 STE table */ WRITE_ONCE(*dst, cpu_to_le64(val)); } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index c5c55d3e28186..5540609069fcd 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -608,8 +608,6 @@ struct arm_smmu_ctx_desc_cfg { u8 s1fmt; /* log2 of the maximum number of CDs supported by this table */ u8 s1cdmax; - /* Whether CD entries in this table have the stall bit set. 
*/ - u8 stall_enabled:1; }; struct arm_smmu_s2_cfg { @@ -748,11 +746,12 @@ static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) extern struct xarray arm_smmu_asid_xa; extern struct mutex arm_smmu_asid_lock; -extern struct arm_smmu_ctx_desc quiet_cd; void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid); struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid); +struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, + u32 ssid); void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain); @@ -760,8 +759,6 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, struct arm_smmu_cd *cdptr, const struct arm_smmu_cd *target); -int arm_smmu_write_ctx_desc(struct arm_smmu_master *smmu_master, int ssid, - struct arm_smmu_ctx_desc *cd); void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, size_t granule, bool leaf, From c865cf5b55a3c6a09ef2d43acdd975f097ad90e5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Apr 2024 14:21:40 -0300 Subject: [PATCH 148/352] iommu/arm-smmu-v3: Build the whole CD in arm_smmu_make_s1_cd() Half the code was living in arm_smmu_domain_finalise_s1(); just move it here and take the values directly from the pgtbl_ops instead of storing copies. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Mostafa Saleh Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v9-5040dc602008+177d7-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 04905c17f64890311e6b5a5065d8c220602712e5) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 47 ++++++++------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 -- 2 files changed, 18 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index cffaeceec4113..0599f05f7e00b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1309,15 +1309,25 @@ void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, struct arm_smmu_domain *smmu_domain) { struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; + const struct io_pgtable_cfg *pgtbl_cfg = + &io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops)->cfg; + typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = + &pgtbl_cfg->arm_lpae_s1_cfg.tcr; memset(target, 0, sizeof(*target)); target->data[0] = cpu_to_le64( - cd->tcr | + FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) | + FIELD_PREP(CTXDESC_CD_0_TCR_TG0, tcr->tg) | + FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, tcr->irgn) | + FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, tcr->orgn) | + FIELD_PREP(CTXDESC_CD_0_TCR_SH0, tcr->sh) | #ifdef __BIG_ENDIAN CTXDESC_CD_0_ENDI | #endif + CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_V | + FIELD_PREP(CTXDESC_CD_0_TCR_IPS, tcr->ips) | CTXDESC_CD_0_AA64 | (master->stall_enabled ? 
CTXDESC_CD_0_S : 0) | CTXDESC_CD_0_R | @@ -1325,9 +1335,9 @@ void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, CTXDESC_CD_0_ASET | FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) ); - - target->data[1] = cpu_to_le64(cd->ttbr & CTXDESC_CD_1_TTB0_MASK); - target->data[3] = cpu_to_le64(cd->mair); + target->data[1] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.ttbr & + CTXDESC_CD_1_TTB0_MASK); + target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.mair); } void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid) @@ -2284,13 +2294,11 @@ static void arm_smmu_domain_free(struct iommu_domain *domain) } static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu, - struct arm_smmu_domain *smmu_domain, - struct io_pgtable_cfg *pgtbl_cfg) + struct arm_smmu_domain *smmu_domain) { int ret; u32 asid; struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; - typeof(&pgtbl_cfg->arm_lpae_s1_cfg.tcr) tcr = &pgtbl_cfg->arm_lpae_s1_cfg.tcr; refcount_set(&cd->refs, 1); @@ -2298,31 +2306,13 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu, mutex_lock(&arm_smmu_asid_lock); ret = xa_alloc(&arm_smmu_asid_xa, &asid, cd, XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); - if (ret) - goto out_unlock; - cd->asid = (u16)asid; - cd->ttbr = pgtbl_cfg->arm_lpae_s1_cfg.ttbr; - cd->tcr = FIELD_PREP(CTXDESC_CD_0_TCR_T0SZ, tcr->tsz) | - FIELD_PREP(CTXDESC_CD_0_TCR_TG0, tcr->tg) | - FIELD_PREP(CTXDESC_CD_0_TCR_IRGN0, tcr->irgn) | - FIELD_PREP(CTXDESC_CD_0_TCR_ORGN0, tcr->orgn) | - FIELD_PREP(CTXDESC_CD_0_TCR_SH0, tcr->sh) | - FIELD_PREP(CTXDESC_CD_0_TCR_IPS, tcr->ips) | - CTXDESC_CD_0_TCR_EPD1 | CTXDESC_CD_0_AA64; - cd->mair = pgtbl_cfg->arm_lpae_s1_cfg.mair; - - mutex_unlock(&arm_smmu_asid_lock); - return 0; - -out_unlock: mutex_unlock(&arm_smmu_asid_lock); return ret; } static int arm_smmu_domain_finalise_s2(struct arm_smmu_device *smmu, - struct arm_smmu_domain *smmu_domain, - struct io_pgtable_cfg *pgtbl_cfg) + struct arm_smmu_domain *smmu_domain) { int vmid; struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; @@ -2346,8 +2336,7 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; int (*finalise_stage_fn)(struct arm_smmu_device *smmu, - struct arm_smmu_domain *smmu_domain, - struct io_pgtable_cfg *pgtbl_cfg); + struct arm_smmu_domain *smmu_domain); /* Restrict the stage to what we can actually support */ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) @@ -2390,7 +2379,7 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, smmu_domain->domain.geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; smmu_domain->domain.geometry.force_aperture = true; - ret = finalise_stage_fn(smmu, smmu_domain, &pgtbl_cfg); + ret = finalise_stage_fn(smmu, smmu_domain); if (ret < 0) { free_io_pgtable_ops(pgtbl_ops); return ret; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 5540609069fcd..392130b840d55 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -587,9 +587,6 @@ struct arm_smmu_strtab_l1_desc { struct arm_smmu_ctx_desc { u16 asid; - u64 ttbr; - u64 tcr; - u64 mair; refcount_t refs; struct mm_struct *mm; From f40cc875fcfe27b91d87ec12618190365a6a1be9 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 30 Apr 2024 14:21:41 -0300 Subject: [PATCH 149/352] iommu/arm-smmu-v3: Add unit tests for arm_smmu_write_entry Add tests for some of the more common STE update operations that 
we expect to see, as well as some artificial STE updates to test the edges of arm_smmu_write_entry. These also serve as a record of which common operations are expected to be hitless, and how many syncs they require. arm_smmu_write_entry implements a generic algorithm that updates an STE/CD to any other arbitrary STE/CD configuration. The update requires a sequence of write+sync operations with some invariants that must be held true after each sync. arm_smmu_write_entry lends itself well to unit-testing since the function's interaction with the STE/CD is already abstracted by input callbacks that we can hook to introspect into the sequence of operations. We can use these hooks to guarantee that invariants are held throughout the entire update operation. Link: https://lore.kernel.org/r/20240106083617.1173871-3-mshavit@google.com Tested-by: Nicolin Chen Signed-off-by: Michael Shavit Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/9-v9-5040dc602008+177d7-smmuv3_newapi_p2_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 56e1a4cc2588a7cb9664457a62fd7a77e005aa01) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/Kconfig | 13 +- drivers/iommu/arm/arm-smmu-v3/Makefile | 1 + .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 8 +- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 465 ++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 43 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 30 ++ 6 files changed, 533 insertions(+), 27 deletions(-) create mode 100644 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 84921c49a05d3..c503a384215c8 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -401,9 +401,9 @@ config ARM_SMMU_V3 Say Y here if your system includes an IOMMU device implementing the ARM SMMUv3 architecture. +if ARM_SMMU_V3 config ARM_SMMU_V3_SVA bool "Shared Virtual Addressing support for the ARM SMMUv3" - depends on ARM_SMMU_V3 select IOMMU_SVA select IOMMU_IOPF select MMU_NOTIFIER @@ -414,6 +414,17 @@ config ARM_SMMU_V3_SVA Say Y here if your system supports SVA extensions such as PCIe PASID and PRI. +config ARM_SMMU_V3_KUNIT_TEST + bool "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS + depends on KUNIT + depends on ARM_SMMU_V3_SVA + default KUNIT_ALL_TESTS + help + Enable this option to unit-test arm-smmu-v3 driver functions. + + If unsure, say N. 
+endif + config S390_IOMMU def_bool y if S390 && PCI depends on S390 && PCI diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index 54feb1ecccad8..0b97054b3929b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -2,4 +2,5 @@ obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o arm_smmu_v3-objs-y += arm-smmu-v3.o arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o +arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_KUNIT_TEST) += arm-smmu-v3-test.o arm_smmu_v3-objs := $(arm_smmu_v3-objs-y) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 8730a7043909e..34a977a0767d4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "arm-smmu-v3.h" #include "../../io-pgtable-arm.h" @@ -120,9 +121,10 @@ static u64 page_size_to_cd(void) return ARM_LPAE_TCR_TG0_4K; } -static void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, - struct arm_smmu_master *master, - struct mm_struct *mm, u16 asid) +VISIBLE_IF_KUNIT +void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, + struct arm_smmu_master *master, struct mm_struct *mm, + u16 asid) { u64 par; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c new file mode 100644 index 0000000000000..417804392ff08 --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -0,0 +1,465 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2024 Google LLC. + */ +#include +#include + +#include "arm-smmu-v3.h" + +struct arm_smmu_test_writer { + struct arm_smmu_entry_writer writer; + struct kunit *test; + const __le64 *init_entry; + const __le64 *target_entry; + __le64 *entry; + + bool invalid_entry_written; + unsigned int num_syncs; +}; + +#define NUM_ENTRY_QWORDS 8 +#define NUM_EXPECTED_SYNCS(x) x + +static struct arm_smmu_ste bypass_ste; +static struct arm_smmu_ste abort_ste; +static struct arm_smmu_device smmu = { + .features = ARM_SMMU_FEAT_STALLS | ARM_SMMU_FEAT_ATTR_TYPES_OVR +}; +static struct mm_struct sva_mm = { + .pgd = (void *)0xdaedbeefdeadbeefULL, +}; + +static bool arm_smmu_entry_differs_in_used_bits(const __le64 *entry, + const __le64 *used_bits, + const __le64 *target, + unsigned int length) +{ + bool differs = false; + unsigned int i; + + for (i = 0; i < length; i++) { + if ((entry[i] & used_bits[i]) != target[i]) + differs = true; + } + return differs; +} + +static void +arm_smmu_test_writer_record_syncs(struct arm_smmu_entry_writer *writer) +{ + struct arm_smmu_test_writer *test_writer = + container_of(writer, struct arm_smmu_test_writer, writer); + __le64 *entry_used_bits; + + entry_used_bits = kunit_kzalloc( + test_writer->test, sizeof(*entry_used_bits) * NUM_ENTRY_QWORDS, + GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test_writer->test, entry_used_bits); + + pr_debug("STE value is now set to: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, + test_writer->entry, + NUM_ENTRY_QWORDS * sizeof(*test_writer->entry), + false); + + test_writer->num_syncs += 1; + if (!test_writer->entry[0]) { + test_writer->invalid_entry_written = true; + } else { + /* + * At any stage in a hitless transition, the entry must be + * equivalent to either the initial entry or the target entry + * when only considering the bits used by the current + * configuration. 
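+ * In other words, qwords the current configuration ignores may hold any intermediate value; the used bits must always match one of the two endpoint entries.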
+ */ + writer->ops->get_used(test_writer->entry, entry_used_bits); + KUNIT_EXPECT_FALSE( + test_writer->test, + arm_smmu_entry_differs_in_used_bits( + test_writer->entry, entry_used_bits, + test_writer->init_entry, NUM_ENTRY_QWORDS) && + arm_smmu_entry_differs_in_used_bits( + test_writer->entry, entry_used_bits, + test_writer->target_entry, + NUM_ENTRY_QWORDS)); + } +} + +static void +arm_smmu_v3_test_debug_print_used_bits(struct arm_smmu_entry_writer *writer, + const __le64 *ste) +{ + __le64 used_bits[NUM_ENTRY_QWORDS] = {}; + + arm_smmu_get_ste_used(ste, used_bits); + pr_debug("STE used bits: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, used_bits, + sizeof(used_bits), false); +} + +static const struct arm_smmu_entry_writer_ops test_ste_ops = { + .sync = arm_smmu_test_writer_record_syncs, + .get_used = arm_smmu_get_ste_used, +}; + +static const struct arm_smmu_entry_writer_ops test_cd_ops = { + .sync = arm_smmu_test_writer_record_syncs, + .get_used = arm_smmu_get_cd_used, +}; + +static void arm_smmu_v3_test_ste_expect_transition( + struct kunit *test, const struct arm_smmu_ste *cur, + const struct arm_smmu_ste *target, unsigned int num_syncs_expected, + bool hitless) +{ + struct arm_smmu_ste cur_copy = *cur; + struct arm_smmu_test_writer test_writer = { + .writer = { + .ops = &test_ste_ops, + }, + .test = test, + .init_entry = cur->data, + .target_entry = target->data, + .entry = cur_copy.data, + .num_syncs = 0, + .invalid_entry_written = false, + + }; + + pr_debug("STE initial value: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, cur_copy.data, + sizeof(cur_copy), false); + arm_smmu_v3_test_debug_print_used_bits(&test_writer.writer, cur->data); + pr_debug("STE target value: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, target->data, + sizeof(cur_copy), false); + arm_smmu_v3_test_debug_print_used_bits(&test_writer.writer, + target->data); + + arm_smmu_write_entry(&test_writer.writer, cur_copy.data, target->data); + + KUNIT_EXPECT_EQ(test, test_writer.invalid_entry_written, !hitless); + KUNIT_EXPECT_EQ(test, test_writer.num_syncs, num_syncs_expected); + KUNIT_EXPECT_MEMEQ(test, target->data, cur_copy.data, sizeof(cur_copy)); +} + +static void arm_smmu_v3_test_ste_expect_hitless_transition( + struct kunit *test, const struct arm_smmu_ste *cur, + const struct arm_smmu_ste *target, unsigned int num_syncs_expected) +{ + arm_smmu_v3_test_ste_expect_transition(test, cur, target, + num_syncs_expected, true); +} + +static const dma_addr_t fake_cdtab_dma_addr = 0xF0F0F0F0F0F0; + +static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste, + const dma_addr_t dma_addr) +{ + struct arm_smmu_master master = { + .cd_table.cdtab_dma = dma_addr, + .cd_table.s1cdmax = 0xFF, + .cd_table.s1fmt = STRTAB_STE_0_S1FMT_64K_L2, + .smmu = &smmu, + }; + + arm_smmu_make_cdtable_ste(ste, &master); +} + +static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test) +{ + /* + * Bypass STEs has used bits in the first two Qwords, while abort STEs + * only have used bits in the first QWord. Transitioning from bypass to + * abort requires two syncs: the first to set the first qword and make + * the STE into an abort, the second to clean up the second qword. 
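+ * Both syncs are hitless: the STE is already a complete abort STE after the first sync, and the second sync only clears bits that the abort configuration ignores.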
+ */ + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &bypass_ste, &abort_ste, NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_write_ste_test_abort_to_bypass(struct kunit *test) +{ + /* + * Transitioning from abort to bypass also requires two syncs: the first + * to set the second qword data required by the bypass STE, and the + * second to set the first qword and switch to bypass. + */ + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &abort_ste, &bypass_ste, NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_write_ste_test_cdtable_to_abort(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &abort_ste, + NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_write_ste_test_abort_to_cdtable(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &abort_ste, &ste, + NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_write_ste_test_cdtable_to_bypass(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &bypass_ste, + NUM_EXPECTED_SYNCS(3)); +} + +static void arm_smmu_v3_write_ste_test_bypass_to_cdtable(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &bypass_ste, &ste, + NUM_EXPECTED_SYNCS(3)); +} + +static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste, + bool ats_enabled) +{ + struct arm_smmu_master master = { + .smmu = &smmu, + .ats_enabled = ats_enabled, + }; + struct io_pgtable io_pgtable = {}; + struct arm_smmu_domain smmu_domain = { + .pgtbl_ops = &io_pgtable.ops, + }; + + io_pgtable.cfg.arm_lpae_s2_cfg.vttbr = 0xdaedbeefdeadbeefULL; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.ps = 1; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.tg = 2; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.sh = 3; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.orgn = 1; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.irgn = 2; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.sl = 3; + io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.tsz = 4; + + arm_smmu_make_s2_domain_ste(ste, &master, &smmu_domain); +} + +static void arm_smmu_v3_write_ste_test_s2_to_abort(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &abort_ste, + NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_write_ste_test_abort_to_s2(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &abort_ste, &ste, + NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_write_ste_test_s2_to_bypass(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &bypass_ste, + NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_write_ste_test_bypass_to_s2(struct kunit *test) +{ + struct arm_smmu_ste ste; + + arm_smmu_test_make_s2_ste(&ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &bypass_ste, &ste, + NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_test_cd_expect_transition( + struct kunit *test, const struct arm_smmu_cd *cur, + const struct arm_smmu_cd *target, unsigned int num_syncs_expected, + bool 
hitless) +{ + struct arm_smmu_cd cur_copy = *cur; + struct arm_smmu_test_writer test_writer = { + .writer = { + .ops = &test_cd_ops, + }, + .test = test, + .init_entry = cur->data, + .target_entry = target->data, + .entry = cur_copy.data, + .num_syncs = 0, + .invalid_entry_written = false, + + }; + + pr_debug("CD initial value: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, cur_copy.data, + sizeof(cur_copy), false); + arm_smmu_v3_test_debug_print_used_bits(&test_writer.writer, cur->data); + pr_debug("CD target value: "); + print_hex_dump_debug(" ", DUMP_PREFIX_NONE, 16, 8, target->data, + sizeof(cur_copy), false); + arm_smmu_v3_test_debug_print_used_bits(&test_writer.writer, + target->data); + + arm_smmu_write_entry(&test_writer.writer, cur_copy.data, target->data); + + KUNIT_EXPECT_EQ(test, test_writer.invalid_entry_written, !hitless); + KUNIT_EXPECT_EQ(test, test_writer.num_syncs, num_syncs_expected); + KUNIT_EXPECT_MEMEQ(test, target->data, cur_copy.data, sizeof(cur_copy)); +} + +static void arm_smmu_v3_test_cd_expect_non_hitless_transition( + struct kunit *test, const struct arm_smmu_cd *cur, + const struct arm_smmu_cd *target, unsigned int num_syncs_expected) +{ + arm_smmu_v3_test_cd_expect_transition(test, cur, target, + num_syncs_expected, false); +} + +static void arm_smmu_v3_test_cd_expect_hitless_transition( + struct kunit *test, const struct arm_smmu_cd *cur, + const struct arm_smmu_cd *target, unsigned int num_syncs_expected) +{ + arm_smmu_v3_test_cd_expect_transition(test, cur, target, + num_syncs_expected, true); +} + +static void arm_smmu_test_make_s1_cd(struct arm_smmu_cd *cd, unsigned int asid) +{ + struct arm_smmu_master master = { + .smmu = &smmu, + }; + struct io_pgtable io_pgtable = {}; + struct arm_smmu_domain smmu_domain = { + .pgtbl_ops = &io_pgtable.ops, + .cd = { + .asid = asid, + }, + }; + + io_pgtable.cfg.arm_lpae_s1_cfg.ttbr = 0xdaedbeefdeadbeefULL; + io_pgtable.cfg.arm_lpae_s1_cfg.tcr.ips = 1; + io_pgtable.cfg.arm_lpae_s1_cfg.tcr.tg = 2; + io_pgtable.cfg.arm_lpae_s1_cfg.tcr.sh = 3; + io_pgtable.cfg.arm_lpae_s1_cfg.tcr.orgn = 1; + io_pgtable.cfg.arm_lpae_s1_cfg.tcr.irgn = 2; + io_pgtable.cfg.arm_lpae_s1_cfg.tcr.tsz = 4; + io_pgtable.cfg.arm_lpae_s1_cfg.mair = 0xabcdef012345678ULL; + + arm_smmu_make_s1_cd(cd, &master, &smmu_domain); +} + +static void arm_smmu_v3_write_cd_test_s1_clear(struct kunit *test) +{ + struct arm_smmu_cd cd = {}; + struct arm_smmu_cd cd_2; + + arm_smmu_test_make_s1_cd(&cd_2, 1997); + arm_smmu_v3_test_cd_expect_non_hitless_transition( + test, &cd, &cd_2, NUM_EXPECTED_SYNCS(2)); + arm_smmu_v3_test_cd_expect_non_hitless_transition( + test, &cd_2, &cd, NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_write_cd_test_s1_change_asid(struct kunit *test) +{ + struct arm_smmu_cd cd = {}; + struct arm_smmu_cd cd_2; + + arm_smmu_test_make_s1_cd(&cd, 778); + arm_smmu_test_make_s1_cd(&cd_2, 1997); + arm_smmu_v3_test_cd_expect_hitless_transition(test, &cd, &cd_2, + NUM_EXPECTED_SYNCS(1)); + arm_smmu_v3_test_cd_expect_hitless_transition(test, &cd_2, &cd, + NUM_EXPECTED_SYNCS(1)); +} + +static void arm_smmu_test_make_sva_cd(struct arm_smmu_cd *cd, unsigned int asid) +{ + struct arm_smmu_master master = { + .smmu = &smmu, + }; + + arm_smmu_make_sva_cd(cd, &master, &sva_mm, asid); +} + +static void arm_smmu_test_make_sva_release_cd(struct arm_smmu_cd *cd, + unsigned int asid) +{ + struct arm_smmu_master master = { + .smmu = &smmu, + }; + + arm_smmu_make_sva_cd(cd, &master, NULL, asid); +} + +static void 
arm_smmu_v3_write_cd_test_sva_clear(struct kunit *test) +{ + struct arm_smmu_cd cd = {}; + struct arm_smmu_cd cd_2; + + arm_smmu_test_make_sva_cd(&cd_2, 1997); + arm_smmu_v3_test_cd_expect_non_hitless_transition( + test, &cd, &cd_2, NUM_EXPECTED_SYNCS(2)); + arm_smmu_v3_test_cd_expect_non_hitless_transition( + test, &cd_2, &cd, NUM_EXPECTED_SYNCS(2)); +} + +static void arm_smmu_v3_write_cd_test_sva_release(struct kunit *test) +{ + struct arm_smmu_cd cd; + struct arm_smmu_cd cd_2; + + arm_smmu_test_make_sva_cd(&cd, 1997); + arm_smmu_test_make_sva_release_cd(&cd_2, 1997); + arm_smmu_v3_test_cd_expect_hitless_transition(test, &cd, &cd_2, + NUM_EXPECTED_SYNCS(2)); + arm_smmu_v3_test_cd_expect_hitless_transition(test, &cd_2, &cd, + NUM_EXPECTED_SYNCS(2)); +} + +static struct kunit_case arm_smmu_v3_test_cases[] = { + KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_abort), + KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_bypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_to_abort), + KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_cdtable), + KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_to_bypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_cdtable), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_abort), + KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_s2), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_bypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_s2), + KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_clear), + KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_change_asid), + KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_clear), + KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_release), + {}, +}; + +static int arm_smmu_v3_test_suite_init(struct kunit_suite *test) +{ + arm_smmu_make_bypass_ste(&smmu, &bypass_ste); + arm_smmu_make_abort_ste(&abort_ste); + return 0; +} + +static struct kunit_suite arm_smmu_v3_test_module = { + .name = "arm-smmu-v3-kunit-test", + .suite_init = arm_smmu_v3_test_suite_init, + .test_cases = arm_smmu_v3_test_cases, +}; +kunit_test_suites(&arm_smmu_v3_test_module); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0599f05f7e00b..c2810eab908ab 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "arm-smmu-v3.h" #include "../../dma-iommu.h" @@ -42,17 +43,6 @@ enum arm_smmu_msi_index { ARM_SMMU_MAX_MSIS, }; -struct arm_smmu_entry_writer_ops; -struct arm_smmu_entry_writer { - const struct arm_smmu_entry_writer_ops *ops; - struct arm_smmu_master *master; -}; - -struct arm_smmu_entry_writer_ops { - void (*get_used)(const __le64 *entry, __le64 *used); - void (*sync)(struct arm_smmu_entry_writer *writer); -}; - #define NUM_ENTRY_QWORDS 8 static_assert(sizeof(struct arm_smmu_ste) == NUM_ENTRY_QWORDS * sizeof(u64)); static_assert(sizeof(struct arm_smmu_cd) == NUM_ENTRY_QWORDS * sizeof(u64)); @@ -979,7 +969,8 @@ void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid) * would be nice if this was complete according to the spec, but minimally it * has to capture the bits this driver uses. */ -static void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) +VISIBLE_IF_KUNIT +void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) { unsigned int cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(ent[0])); @@ -1101,8 +1092,9 @@ static bool entry_set(struct arm_smmu_entry_writer *writer, __le64 *entry, * V=0 process. This relies on the IGNORED behavior described in the * specification. 
*/ -static void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, - __le64 *entry, const __le64 *target) +VISIBLE_IF_KUNIT +void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry, + const __le64 *target) { __le64 unused_update[NUM_ENTRY_QWORDS]; u8 used_qword_diff; @@ -1256,7 +1248,8 @@ struct arm_smmu_cd_writer { unsigned int ssid; }; -static void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) +VISIBLE_IF_KUNIT +void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) { used_bits[0] = cpu_to_le64(CTXDESC_CD_0_V); if (!(ent[0] & cpu_to_le64(CTXDESC_CD_0_V))) @@ -1514,7 +1507,8 @@ static void arm_smmu_write_ste(struct arm_smmu_master *master, u32 sid, } } -static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) +VISIBLE_IF_KUNIT +void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) { memset(target, 0, sizeof(*target)); target->data[0] = cpu_to_le64( @@ -1522,8 +1516,9 @@ static void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT)); } -static void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, - struct arm_smmu_ste *target) +VISIBLE_IF_KUNIT +void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, + struct arm_smmu_ste *target) { memset(target, 0, sizeof(*target)); target->data[0] = cpu_to_le64( @@ -1535,8 +1530,9 @@ static void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, STRTAB_STE_1_SHCFG_INCOMING)); } -static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master) +VISIBLE_IF_KUNIT +void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master) { struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; @@ -1585,9 +1581,10 @@ static void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, } } -static void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain) +VISIBLE_IF_KUNIT +void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain) { struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg; const struct io_pgtable_cfg *pgtbl_cfg = diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 392130b840d55..1242a086c9f94 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -736,6 +736,36 @@ struct arm_smmu_domain { struct list_head mmu_notifiers; }; +/* The following are exposed for testing purposes. 
*/ +struct arm_smmu_entry_writer_ops; +struct arm_smmu_entry_writer { + const struct arm_smmu_entry_writer_ops *ops; + struct arm_smmu_master *master; +}; + +struct arm_smmu_entry_writer_ops { + void (*get_used)(const __le64 *entry, __le64 *used); + void (*sync)(struct arm_smmu_entry_writer *writer); +}; + +#if IS_ENABLED(CONFIG_KUNIT) +void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits); +void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *cur, + const __le64 *target); +void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits); +void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); +void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, + struct arm_smmu_ste *target); +void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master); +void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, + struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain); +void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, + struct arm_smmu_master *master, struct mm_struct *mm, + u16 asid); +#endif + static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) { return container_of(dom, struct arm_smmu_domain, domain); From b68117f0a34a57eb1b7b6efe2f68f56f1e2a0fe3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:02 +0100 Subject: [PATCH 150/352] mm/memory-failure: convert shake_page() to shake_folio() Removes two calls to compound_head(). Move the prototype to internal.h; we definitely don't want code outside mm using it. Link: https://lkml.kernel.org/r/20240412193510.2356957-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu Acked-by: Miaohe Lin Cc: Dan Williams Cc: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit fed5348ee2b136c84c5a27d6fceef14066beeb66) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- include/linux/mm.h | 1 - mm/hwpoison-inject.c | 11 ++++++----- mm/internal.h | 1 + mm/memory-failure.c | 15 ++++++++++----- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dedd899d538ff..28fca5d2d9193 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3965,7 +3965,6 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index, extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue_kick(int cpu); extern int unpoison_memory(unsigned long pfn); -extern void shake_page(struct page *p); extern atomic_long_t num_poisoned_pages __read_mostly; extern int soft_offline_page(unsigned long pfn, int flags); #ifdef CONFIG_MEMORY_FAILURE diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index d0548e382b6ba..c9d653f51e45b 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -15,7 +15,7 @@ static int hwpoison_inject(void *data, u64 val) { unsigned long pfn = val; struct page *p; - struct page *hpage; + struct folio *folio; int err; if (!capable(CAP_SYS_ADMIN)) @@ -25,16 +25,17 @@ static int hwpoison_inject(void *data, u64 val) return -ENXIO; p = pfn_to_page(pfn); - hpage = compound_head(p); + folio = page_folio(p); if (!hwpoison_filter_enable) goto inject; - shake_page(hpage); + shake_folio(folio); /* * This implies unable to support non-LRU pages except free page. 
*/ - if (!PageLRU(hpage) && !PageHuge(p) && !is_free_buddy_page(p)) + if (!folio_test_lru(folio) && !folio_test_hugetlb(folio) && + !is_free_buddy_page(p)) return 0; /* @@ -42,7 +43,7 @@ static int hwpoison_inject(void *data, u64 val) * the targeted owner (or on a free page). * memory_failure() will redo the check reliably inside page lock. */ - err = hwpoison_filter(hpage); + err = hwpoison_filter(&folio->page); if (err) return 0; diff --git a/mm/internal.h b/mm/internal.h index c3f3e0f191151..6379bbaecfcf4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -851,6 +851,7 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask) /* * mm/memory-failure.c */ +void shake_folio(struct folio *folio); extern int hwpoison_filter(struct page *p); extern u32 hwpoison_filter_dev_major; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7751bd78fbcb2..f70ad9c8ffbe7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -370,20 +370,25 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags) * Unknown page type encountered. Try to check whether it can turn PageLRU by * lru_add_drain_all. */ -void shake_page(struct page *p) +void shake_folio(struct folio *folio) { - if (PageHuge(p)) + if (folio_test_hugetlb(folio)) return; /* * TODO: Could shrink slab caches here if a lightweight range-based * shrinker will be available. */ - if (PageSlab(p)) + if (folio_test_slab(folio)) return; lru_add_drain_all(); } -EXPORT_SYMBOL_GPL(shake_page); +EXPORT_SYMBOL_GPL(shake_folio); + +static void shake_page(struct page *page) +{ + shake_folio(page_folio(page)); +} static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, unsigned long address) @@ -1652,7 +1657,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * shake_page() again to ensure that it's flushed. */ if (mlocked) - shake_page(hpage); + shake_folio(folio); /* * Now that the dirty bit has been propagated to the From 87db5ccc64d4b901d066e78f794f5339d873cfd0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:03 +0100 Subject: [PATCH 151/352] mm: convert hugetlb_page_mapping_lock_write to folio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The page is only used to get the mapping, so the folio will do just as well. Both callers already have a folio available, so this saves a call to compound_head(). Link: https://lkml.kernel.org/r/20240412193510.2356957-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu  Reviewed-by: Oscar Salvador Acked-by: Miaohe Lin Cc: Dan Williams Signed-off-by: Andrew Morton (cherry picked from commit 6e8cda4c2c87b2a44828e651a10705647a6fd542) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
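The trylock contract described above is worth keeping in mind when reading the converted call sites below: NULL means the mapping could not be obtained and locked (absent, or i_mmap_rwsem contended), so callers fall back rather than block. An illustrative caller shape, modelled on the memory-failure call site rather than introduced by this patch:

	mapping = hugetlb_folio_mapping_lock_write(folio);
	if (mapping) {
		try_to_unmap(folio, ttu | TTU_RMAP_LOCKED);
		i_mmap_unlock_write(mapping);
	} else {
		pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
			pfn);
	}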
Ochs --- include/linux/hugetlb.h | 6 +++--- mm/hugetlb.c | 6 +++--- mm/memory-failure.c | 2 +- mm/migrate.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index c1ee640d87b11..065370dd8521e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -175,7 +175,7 @@ u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx); pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud); -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage); +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio); extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; @@ -298,8 +298,8 @@ static inline unsigned long hugetlb_total_pages(void) return 0; } -static inline struct address_space *hugetlb_page_mapping_lock_write( - struct page *hpage) +static inline struct address_space *hugetlb_folio_mapping_lock_write( + struct folio *folio) { return NULL; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f20e00c677212..3142d7eaa2266 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2161,13 +2161,13 @@ static bool prep_compound_gigantic_folio_for_demote(struct folio *folio, /* * Find and lock address space (mapping) in write mode. * - * Upon entry, the page is locked which means that page_mapping() is + * Upon entry, the folio is locked which means that folio_mapping() is * stable. Due to locking order, we can only trylock_write. If we can * not get the lock, simply return NULL to caller. */ -struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) +struct address_space *hugetlb_folio_mapping_lock_write(struct folio *folio) { - struct address_space *mapping = page_mapping(hpage); + struct address_space *mapping = folio_mapping(folio); if (!mapping) return mapping; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f70ad9c8ffbe7..980e540a4a677 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1637,7 +1637,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * TTU_RMAP_LOCKED to indicate we have taken the lock * at this higher level. */ - mapping = hugetlb_page_mapping_lock_write(hpage); + mapping = hugetlb_folio_mapping_lock_write(folio); if (mapping) { try_to_unmap(folio, ttu|TTU_RMAP_LOCKED); i_mmap_unlock_write(mapping); diff --git a/mm/migrate.c b/mm/migrate.c index c27b1f8097d4a..fa68b560d55ff 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1422,7 +1422,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio, * semaphore in write mode here and set TTU_RMAP_LOCKED * to let lower levels know we have taken the lock. */ - mapping = hugetlb_page_mapping_lock_write(&src->page); + mapping = hugetlb_folio_mapping_lock_write(src); if (unlikely(!mapping)) goto unlock_put_anon; From 4c86272411630c4f5f91380fe478260d63113b1d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:04 +0100 Subject: [PATCH 152/352] mm/memory-failure: convert memory_failure() to use a folio Saves dozens of calls to compound_head(). Link: https://lkml.kernel.org/r/20240412193510.2356957-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Miaohe Lin Cc: Dan Williams Cc: Jane Chu Cc: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit 5dba5c356ab3bbf6b00a42632f3e14728f327553) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
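The one subtle step in this conversion is the THP split path: a split changes the compound head, so the folio has to be re-derived from the page before any further folio operations. Condensed from the hunk below, with the goto-based unwinding elided:

	folio = page_folio(p);
	if (folio_test_large(folio)) {
		folio_set_has_hwpoisoned(folio);
		if (try_to_split_thp_page(p) < 0)
			return action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
		/* The split changed the compound head; resolve it again. */
		folio = page_folio(p);
	}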
Ochs --- mm/memory-failure.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 980e540a4a677..0da03e679d8aa 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2202,7 +2202,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags, int memory_failure(unsigned long pfn, int flags) { struct page *p; - struct page *hpage; + struct folio *folio; struct dev_pagemap *pgmap; int res = 0; unsigned long page_flags; @@ -2290,8 +2290,8 @@ int memory_failure(unsigned long pfn, int flags) } } - hpage = compound_head(p); - if (PageTransHuge(hpage)) { + folio = page_folio(p); + if (folio_test_large(folio)) { /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. @@ -2305,12 +2305,13 @@ int memory_failure(unsigned long pfn, int flags) * or unhandlable page. The refcount is bumped iff the * page is a valid handlable page. */ - SetPageHasHWPoisoned(hpage); + folio_set_has_hwpoisoned(folio); if (try_to_split_thp_page(p) < 0) { res = action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); + folio = page_folio(p); } /* @@ -2321,9 +2322,9 @@ int memory_failure(unsigned long pfn, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - shake_page(p); + shake_folio(folio); - lock_page(p); + folio_lock(folio); /* * We're only intended to deal with the non-Compound page here. @@ -2331,11 +2332,11 @@ int memory_failure(unsigned long pfn, int flags) * race window. If this happens, we could try again to hopefully * handle the page next round. */ - if (PageCompound(p)) { + if (folio_test_large(folio)) { if (retry) { ClearPageHWPoison(p); - unlock_page(p); - put_page(p); + folio_unlock(folio); + folio_put(folio); flags &= ~MF_COUNT_INCREASED; retry = false; goto try_again; @@ -2351,29 +2352,29 @@ int memory_failure(unsigned long pfn, int flags) * folio_remove_rmap_*() in try_to_unmap_one(). So to determine page * status correctly, we save a copy of the page flags at this time. */ - page_flags = p->flags; + page_flags = folio->flags; if (hwpoison_filter(p)) { ClearPageHWPoison(p); - unlock_page(p); - put_page(p); + folio_unlock(folio); + folio_put(folio); res = -EOPNOTSUPP; goto unlock_mutex; } /* - * __munlock_folio() may clear a writeback page's LRU flag without - * page_lock. We need wait writeback completion for this page or it - * may trigger vfs BUG while evict inode. + * __munlock_folio() may clear a writeback folio's LRU flag without + * the folio lock. We need to wait for writeback completion for this + * folio or it may trigger a vfs BUG while evicting inode. */ - if (!PageLRU(p) && !PageWriteback(p)) + if (!folio_test_lru(folio) && !folio_test_writeback(folio)) goto identify_page_state; /* * It's very difficult to mess with pages currently under IO * and in many cases impossible, so we just avoid it here. */ - wait_on_page_writeback(p); + folio_wait_writeback(folio); /* * Now take care of user space mappings. @@ -2387,7 +2388,8 @@ int memory_failure(unsigned long pfn, int flags) /* * Torn down by someone else? 
*/ - if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + if (folio_test_lru(folio) && !folio_test_swapcache(folio) && + folio->mapping == NULL) { res = action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); goto unlock_page; } @@ -2397,7 +2399,7 @@ int memory_failure(unsigned long pfn, int flags) mutex_unlock(&mf_mutex); return res; unlock_page: - unlock_page(p); + folio_unlock(folio); unlock_mutex: mutex_unlock(&mf_mutex); return res; From 049c4a9f14810c4df9b792d6fb37e12c4076cd20 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:05 +0100 Subject: [PATCH 153/352] mm/memory-failure: convert hwpoison_user_mappings to take a folio Pass the folio from the callers, and use it throughout instead of hpage. Saves dozens of calls to compound_head(). Link: https://lkml.kernel.org/r/20240412193510.2356957-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Miaohe Lin Reviewed-by: Jane Chu Cc: Dan Williams Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit 03468a0f52893b8dea4a96677ad9ff78bf55d765) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- mm/memory-failure.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0da03e679d8aa..e2f0541c03845 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1572,24 +1572,24 @@ static int get_hwpoison_page(struct page *p, unsigned long flags) * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. */ -static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, - int flags, struct page *hpage) +static bool hwpoison_user_mappings(struct folio *folio, struct page *p, + unsigned long pfn, int flags) { - struct folio *folio = page_folio(hpage); enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_HWPOISON; struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; int forcekill; - bool mlocked = PageMlocked(hpage); + bool mlocked = folio_test_mlocked(folio); /* * Here we are interested only in user-mapped pages, so skip any * other types of pages. */ - if (PageReserved(p) || PageSlab(p) || PageTable(p) || PageOffline(p)) + if (folio_test_reserved(folio) || folio_test_slab(folio) || + folio_test_pgtable(folio) || folio_test_offline(folio)) return true; - if (!(PageLRU(hpage) || PageHuge(p))) + if (!(folio_test_lru(folio) || folio_test_hugetlb(folio))) return true; /* @@ -1599,7 +1599,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (!page_mapped(p)) return true; - if (PageSwapCache(p)) { + if (folio_test_swapcache(folio)) { pr_err("%#lx: keeping poisoned page in swap cache\n", pfn); ttu &= ~TTU_HWPOISON; } @@ -1610,11 +1610,11 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * XXX: the dirty test could be racy: set_page_dirty() may not always * be called inside page lock (it's recommended but not enforced). 
*/ - mapping = page_mapping(hpage); - if (!(flags & MF_MUST_KILL) && !PageDirty(hpage) && mapping && + mapping = folio_mapping(folio); + if (!(flags & MF_MUST_KILL) && !folio_test_dirty(folio) && mapping && mapping_can_writeback(mapping)) { - if (page_mkclean(hpage)) { - SetPageDirty(hpage); + if (folio_mkclean(folio)) { + folio_set_dirty(folio); } else { ttu &= ~TTU_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", @@ -1629,7 +1629,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, */ collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED); - if (PageHuge(hpage) && !PageAnon(hpage)) { + if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) { /* * For hugetlb pages in shared mappings, try_to_unmap * could potentially call huge_pmd_unshare. Because of @@ -1669,7 +1669,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) || + forcekill = folio_test_dirty(folio) || (flags & MF_MUST_KILL) || !unmap_success; kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); @@ -2113,7 +2113,7 @@ static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb page_flags = folio->flags; - if (!hwpoison_user_mappings(p, pfn, flags, &folio->page)) { + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { folio_unlock(folio); return action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); } @@ -2380,7 +2380,7 @@ int memory_failure(unsigned long pfn, int flags) * Now take care of user space mappings. * Abort on fail: __filemap_remove_folio() assumes unmapped page. */ - if (!hwpoison_user_mappings(p, pfn, flags, p)) { + if (!hwpoison_user_mappings(folio, p, pfn, flags)) { res = action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); goto unlock_page; } From 625908e21e341023c286b249636c40e957c5b694 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:06 +0100 Subject: [PATCH 154/352] mm/memory-failure: add some folio conversions to unpoison_memory Some of these folio APIs didn't exist when the unpoison_memory() conversion was done originally. Link: https://lkml.kernel.org/r/20240412193510.2356957-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Miaohe Lin Reviewed-by: Jane Chu Cc: Dan Williams Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit ee299e9849736f60e6e01f7a5dcb258de7c7d1b9) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
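Taken together, this run of conversions applies one idiom: derive the folio once with page_folio(), then replace Page*() tests, which resolve the compound head internally on every call, with the corresponding folio_test_*() on the cached folio. A minimal sketch, using a hypothetical helper name:

	/* Hypothetical helper, for illustration only: one page_folio() call
	 * stands in for the per-test head lookups done by Page*() macros. */
	static bool page_is_shakeable(struct page *p)
	{
		struct folio *folio = page_folio(p);

		return !folio_test_hugetlb(folio) && !folio_test_slab(folio);
	}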
Ochs --- mm/memory-failure.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e2f0541c03845..e788f65be1fa5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2576,8 +2576,8 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (folio_test_slab(folio) || PageTable(&folio->page) || - folio_test_reserved(folio) || PageOffline(&folio->page)) + if (folio_test_slab(folio) || folio_test_pgtable(folio) || + folio_test_reserved(folio) || folio_test_offline(folio)) goto unlock_mutex; /* @@ -2598,7 +2598,7 @@ int unpoison_memory(unsigned long pfn) ghp = get_hwpoison_page(p, MF_UNPOISON); if (!ghp) { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) @@ -2614,7 +2614,7 @@ int unpoison_memory(unsigned long pfn) pfn, &unpoison_rs); } } else { - if (PageHuge(p)) { + if (folio_test_hugetlb(folio)) { huge = true; count = folio_free_raw_hwp(folio, false); if (count == 0) { From e624df47dbfc24353b62a4ef0ae7b0da0a4f4dd7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:07 +0100 Subject: [PATCH 155/352] mm/memory-failure: use folio functions throughout collect_procs() Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20240412193510.2356957-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu Acked-by: Miaohe Lin Cc: Dan Williams Cc: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit 0edb5b282ac5a4f9b1bdc22120c9b145be315622) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e788f65be1fa5..ab8b38b2a842a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -732,9 +732,9 @@ static void collect_procs(struct folio *folio, struct page *page, { if (!folio->mapping) return; - if (unlikely(PageKsm(page))) + if (unlikely(folio_test_ksm(folio))) collect_procs_ksm(page, tokill, force_early); - else if (PageAnon(page)) + else if (folio_test_anon(folio)) collect_procs_anon(folio, page, tokill, force_early); else collect_procs_file(folio, page, tokill, force_early); From 4fb0282b0558259c5d3ecac1cef5228f1b7f197b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 12 Apr 2024 20:35:08 +0100 Subject: [PATCH 156/352] mm/memory-failure: pass the folio to collect_procs_ksm() We've already calculated it, so pass it in instead of recalculating it in collect_procs_ksm(). Link: https://lkml.kernel.org/r/20240412193510.2356957-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jane Chu Reviewed-by: Miaohe Lin Cc: Dan Williams Cc: Miaohe Lin Cc: Oscar Salvador Signed-off-by: Andrew Morton (cherry picked from commit b650e1d2aefbbb31e7578ad60a0a71bf5e5c5346) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
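This is the same reasoning one level down: once the caller holds the folio, callees should accept it as a parameter rather than re-deriving it. Schematically, with bodies elided:

	/* Before: the callee re-derived what collect_procs() already had. */
	void collect_procs_ksm(struct page *page, struct list_head *to_kill,
			       int force_early)
	{
		struct folio *folio = page_folio(page);
		/* ... */
	}

	/* After: the folio is threaded through from the caller. */
	void collect_procs_ksm(struct folio *folio, struct page *page,
			       struct list_head *to_kill, int force_early)
	{
		/* ... */
	}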
Ochs --- include/linux/ksm.h | 14 +++----------- mm/ksm.c | 5 ++--- mm/memory-failure.c | 2 +- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index f4692ec361e1b..9b75f34c84573 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -100,15 +100,9 @@ struct folio *ksm_might_need_to_copy(struct folio *folio, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); - -#ifdef CONFIG_MEMORY_FAILURE -void collect_procs_ksm(struct page *page, struct list_head *to_kill, - int force_early); -#endif - -#ifdef CONFIG_PROC_FS +void collect_procs_ksm(struct folio *folio, struct page *page, + struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); -#endif /* CONFIG_PROC_FS */ #else /* !CONFIG_KSM */ @@ -139,12 +133,10 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } -#ifdef CONFIG_MEMORY_FAILURE -static inline void collect_procs_ksm(struct page *page, +static inline void collect_procs_ksm(struct folio *folio, struct page *page, struct list_head *to_kill, int force_early) { } -#endif #ifdef CONFIG_MMU static inline int ksm_madvise(struct vm_area_struct *vma, unsigned long start, diff --git a/mm/ksm.c b/mm/ksm.c index 27c6ddf2a4002..c015d2c117f57 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -3174,12 +3174,11 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) /* * Collect processes when the error hit an ksm page. */ -void collect_procs_ksm(struct page *page, struct list_head *to_kill, - int force_early) +void collect_procs_ksm(struct folio *folio, struct page *page, + struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; struct ksm_rmap_item *rmap_item; - struct folio *folio = page_folio(page); struct vm_area_struct *vma; struct task_struct *tsk; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ab8b38b2a842a..d78de3342651d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -733,7 +733,7 @@ static void collect_procs(struct folio *folio, struct page *page, if (!folio->mapping) return; if (unlikely(folio_test_ksm(folio))) - collect_procs_ksm(page, tokill, force_early); + collect_procs_ksm(folio, page, tokill, force_early); else if (folio_test_anon(folio)) collect_procs_anon(folio, page, tokill, force_early); else From c2dc6d162fdd93655f3587145631971d8bf2580c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 23 Apr 2024 23:55:34 +0100 Subject: [PATCH 157/352] memory-failure: remove calls to page_mapping() This is mostly just inlining page_mapping() into the two callers. Link: https://lkml.kernel.org/r/20240423225552.4113447-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Sidhartha Kumar Reviewed-by: David Hildenbrand Acked-by: Miaohe Lin Cc: Eric Biggers Signed-off-by: Andrew Morton (cherry picked from commit 89f5c54b228181713f1c00b27b360b29643cdfa6) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- mm/memory-failure.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d78de3342651d..e6c957f5128eb 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -217,6 +217,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); static int hwpoison_filter_dev(struct page *p) { + struct folio *folio = page_folio(p); struct address_space *mapping; dev_t dev; @@ -224,7 +225,7 @@ static int hwpoison_filter_dev(struct page *p) hwpoison_filter_dev_minor == ~0U) return 0; - mapping = page_mapping(p); + mapping = folio_mapping(folio); if (mapping == NULL || mapping->host == NULL) return -EINVAL; @@ -1094,7 +1095,8 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) */ static int me_pagecache_dirty(struct page_state *ps, struct page *p) { - struct address_space *mapping = page_mapping(p); + struct folio *folio = page_folio(p); + struct address_space *mapping = folio_mapping(folio); SetPageError(p); /* TBD: print more information about the file. */ From 0d19949145f32a2ad20b60633b4885ff5df2f4ba Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 7 Apr 2024 21:11:41 -0700 Subject: [PATCH 158/352] swiotlb: remove alloc_size argument to swiotlb_tbl_map_single() Currently swiotlb_tbl_map_single() takes alloc_align_mask and alloc_size arguments to specify an swiotlb allocation that is larger than mapping_size. This larger allocation is used solely by iommu_dma_map_single() to handle untrusted devices that should not have DMA visibility to memory pages that are partially used for unrelated kernel data. Having two arguments to specify the allocation is redundant. While alloc_align_mask naturally specifies the alignment of the starting address of the allocation, it can also implicitly specify the size by rounding up the mapping_size to that alignment. Additionally, the current approach has an edge case bug. iommu_dma_map_page() already does the rounding up to compute the alloc_size argument. But swiotlb_tbl_map_single() then calculates the alignment offset based on the DMA min_align_mask, and adds that offset to alloc_size. If the offset is non-zero, the addition may result in a value that is larger than the max the swiotlb can allocate. If the rounding up is done _after_ the alignment offset is added to the mapping_size (and the original mapping_size conforms to the value returned by swiotlb_max_mapping_size), then the max that the swiotlb can allocate will not be exceeded. In view of these issues, simplify the swiotlb_tbl_map_single() interface by removing the alloc_size argument. Most call sites pass the same value for mapping_size and alloc_size, and they pass alloc_align_mask as zero. Just remove the redundant argument from these callers, as they will see no functional change. For iommu_dma_map_page() also remove the alloc_size argument, and have swiotlb_tbl_map_single() compute the alloc_size by rounding up mapping_size after adding the offset based on min_align_mask. This has the side effect of fixing the edge case bug but with no other functional change. Also add a sanity test on the alloc_align_mask. While IOMMU code currently ensures the granule is not larger than PAGE_SIZE, if that guarantee were to be removed in the future, the downstream effect on the swiotlb might go unnoticed until strange allocation failures occurred. Tested on an ARM64 system with 16K page size and some kernel test-only hackery to allow modifying the DMA min_align_mask and the granule size that becomes the alloc_align_mask. 
Tested these combinations with a variety of original memory addresses and sizes, including those that reproduce the edge case bug: * 4K granule and 0 min_align_mask * 4K granule and 0xFFF min_align_mask (4K - 1) * 16K granule and 0xFFF min_align_mask * 64K granule and 0xFFF min_align_mask * 64K granule and 0x3FFF min_align_mask (16K - 1) With the changes, all combinations pass. Signed-off-by: Michael Kelley Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig (cherry picked from commit 327e2c97c46a4d971c5450a9d05b4a673f46c4da) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/dma-iommu.c | 2 +- drivers/xen/swiotlb-xen.c | 2 +- include/linux/swiotlb.h | 2 +- kernel/dma/swiotlb.c | 56 +++++++++++++++++++++++++++++---------- 4 files changed, 45 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 16a7c4a4f3dba..e21e54938ef5c 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1166,7 +1166,7 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, trace_swiotlb_bounced(dev, phys, size); aligned_size = iova_align(iovad, size); - phys = swiotlb_tbl_map_single(dev, phys, size, aligned_size, + phys = swiotlb_tbl_map_single(dev, phys, size, iova_mask(iovad), dir, attrs); if (phys == DMA_MAPPING_ERROR) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 0e6c6c25d154f..d4f1f8d1ebd88 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -216,7 +216,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size); - map = swiotlb_tbl_map_single(dev, phys, size, size, 0, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, 0, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ecde0312dd520..05e6f1b3474ee 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -43,7 +43,7 @@ int swiotlb_init_late(size_t size, gfp_t gfp_mask, extern void __init swiotlb_update_mem_attributes(void); phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, - size_t mapping_size, size_t alloc_size, + size_t mapping_size, unsigned int alloc_aligned_mask, enum dma_data_direction dir, unsigned long attrs); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 3386a7a4f834e..7f49aaea76092 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -1317,15 +1317,40 @@ static unsigned long mem_used(struct io_tlb_mem *mem) #endif /* CONFIG_DEBUG_FS */ +/** + * swiotlb_tbl_map_single() - bounce buffer map a single contiguous physical area + * @dev: Device which maps the buffer. + * @orig_addr: Original (non-bounced) physical IO buffer address + * @mapping_size: Requested size of the actual bounce buffer, excluding + * any pre- or post-padding for alignment + * @alloc_align_mask: Required start and end alignment of the allocated buffer + * @dir: DMA direction + * @attrs: Optional DMA attributes for the map operation + * + * Find and allocate a suitable sequence of IO TLB slots for the request. + * The allocated space starts at an alignment specified by alloc_align_mask, + * and the size of the allocated space is rounded up so that the total amount + * of allocated space is a multiple of (alloc_align_mask + 1). 
If + * alloc_align_mask is zero, the allocated space may be at any alignment and + * the size is not rounded up. + * + * The returned address is within the allocated space and matches the bits + * of orig_addr that are specified in the DMA min_align_mask for the device. As + * such, this returned address may be offset from the beginning of the allocated + * space. The bounce buffer space starting at the returned address for + * mapping_size bytes is initialized to the contents of the original IO buffer + * area. Any pre-padding (due to an offset) and any post-padding (due to + * rounding-up the size) is not initialized. + */ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, - size_t mapping_size, size_t alloc_size, - unsigned int alloc_align_mask, enum dma_data_direction dir, - unsigned long attrs) + size_t mapping_size, unsigned int alloc_align_mask, + enum dma_data_direction dir, unsigned long attrs) { struct io_tlb_mem *mem = dev->dma_io_tlb_mem; unsigned int offset; struct io_tlb_pool *pool; unsigned int i; + size_t size; int index; phys_addr_t tlb_addr; unsigned short pad_slots; @@ -1339,20 +1364,24 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, if (cc_platform_has(CC_ATTR_MEM_ENCRYPT)) pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); - if (mapping_size > alloc_size) { - dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", - mapping_size, alloc_size); - return (phys_addr_t)DMA_MAPPING_ERROR; - } + /* + * The default swiotlb memory pool is allocated with PAGE_SIZE + * alignment. If a mapping is requested with larger alignment, + * the mapping may be unable to use the initial slot(s) in all + * sets of IO_TLB_SEGSIZE slots. In such case, a mapping request + * of or near the maximum mapping size would always fail. 
+ */ + dev_WARN_ONCE(dev, alloc_align_mask > ~PAGE_MASK, + "Alloc alignment may prevent fulfilling requests with max mapping_size\n"); offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr); - index = swiotlb_find_slots(dev, orig_addr, - alloc_size + offset, alloc_align_mask, &pool); + size = ALIGN(mapping_size + offset, alloc_align_mask + 1); + index = swiotlb_find_slots(dev, orig_addr, size, alloc_align_mask, &pool); if (index == -1) { if (!(attrs & DMA_ATTR_NO_WARN)) dev_warn_ratelimited(dev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, mem->nslabs, mem_used(mem)); + size, mem->nslabs, mem_used(mem)); return (phys_addr_t)DMA_MAPPING_ERROR; } @@ -1365,7 +1394,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, offset &= (IO_TLB_SIZE - 1); index += pad_slots; pool->slots[index].pad_slots = pad_slots; - for (i = 0; i < nr_slots(alloc_size + offset); i++) + for (i = 0; i < (nr_slots(size) - pad_slots); i++) pool->slots[index + i].orig_addr = slot_addr(orig_addr, i); tlb_addr = slot_addr(pool->start, index) + offset; /* @@ -1519,8 +1548,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size); - swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir, - attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, 0, dir, attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; From 6125cb82954e92e2676c090f873222caff9324ee Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 7 Apr 2024 21:11:42 -0700 Subject: [PATCH 159/352] iommu/dma: fix zeroing of bounce buffer padding used by untrusted devices iommu_dma_map_page() allocates swiotlb memory as a bounce buffer when an untrusted device wants to map only part of the memory in an granule. The goal is to disallow the untrusted device having DMA access to unrelated kernel data that may be sharing the granule. To meet this goal, the bounce buffer itself is zeroed, and any additional swiotlb memory up to alloc_size after the bounce buffer end (i.e., "post-padding") is also zeroed. However, as of commit 901c7280ca0d ("Reinstate some of "swiotlb: rework "fix info leak with DMA_FROM_DEVICE"""), swiotlb_tbl_map_single() always initializes the contents of the bounce buffer to the original memory. Zeroing the bounce buffer is redundant and probably wrong per the discussion in that commit. Only the post-padding needs to be zeroed. Also, when the DMA min_align_mask is non-zero, the allocated bounce buffer space may not start on a granule boundary. The swiotlb memory from the granule boundary to the start of the allocated bounce buffer might belong to some unrelated bounce buffer. So as described in the "second issue" in [1], it can't be zeroed to protect against untrusted devices. But as of commit af133562d5af ("swiotlb: extend buffer pre-padding to alloc_align_mask if necessary"), swiotlb_tbl_map_single() allocates pre-padding slots when necessary to meet min_align_mask requirements, making it possible to zero the pre-padding area as well. Finally, iommu_dma_map_page() uses the swiotlb for untrusted devices and also for certain kmalloc() memory. Current code does the zeroing for both cases, but it is needed only for the untrusted device case. Fix all of this by updating iommu_dma_map_page() to zero both the pre-padding and post-padding areas, but not the actual bounce buffer. 
Do this only in the case where the bounce buffer is used because of an untrusted device. [1] https://lore.kernel.org/all/20210929023300.335969-1-stevensd@google.com/ Signed-off-by: Michael Kelley Reviewed-by: Petr Tesarik Signed-off-by: Christoph Hellwig (cherry picked from commit 2650073f1b5858008c32712f3d9e1e808ce7e967) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/dma-iommu.c | 29 ++++++++++++++++------------- include/linux/iova.h | 5 +++++ 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index e21e54938ef5c..053340d0af2a7 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1155,9 +1155,6 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, */ if (dev_use_swiotlb(dev, size, dir) && iova_offset(iovad, phys | size)) { - void *padding_start; - size_t padding_size, aligned_size; - if (!is_swiotlb_active(dev)) { dev_warn_once(dev, "DMA bounce buffers are inactive, unable to map unaligned transaction.\n"); return DMA_MAPPING_ERROR; @@ -1165,24 +1162,30 @@ static dma_addr_t iommu_dma_map_page(struct device *dev, struct page *page, trace_swiotlb_bounced(dev, phys, size); - aligned_size = iova_align(iovad, size); phys = swiotlb_tbl_map_single(dev, phys, size, iova_mask(iovad), dir, attrs); if (phys == DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; - /* Cleanup the padding area. */ - padding_start = phys_to_virt(phys); - padding_size = aligned_size; + /* + * Untrusted devices should not see padding areas with random + * leftover kernel data, so zero the pre- and post-padding. + * swiotlb_tbl_map_single() has initialized the bounce buffer + * proper to the contents of the original memory buffer. + */ + if (dev_is_untrusted(dev)) { + size_t start, virt = (size_t)phys_to_virt(phys); - if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && - (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) { - padding_start += size; - padding_size -= size; - } + /* Pre-padding */ + start = iova_align_down(iovad, virt); + memset((void *)start, 0, virt - start); - memset(padding_start, 0, padding_size); + /* Post-padding */ + start = virt + size; + memset((void *)start, 0, + iova_align(iovad, start) - start); + } } if (!coherent && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) diff --git a/include/linux/iova.h b/include/linux/iova.h index 83c00fac2acb1..d2c4fd923efab 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -65,6 +65,11 @@ static inline size_t iova_align(struct iova_domain *iovad, size_t size) return ALIGN(size, iovad->granule); } +static inline size_t iova_align_down(struct iova_domain *iovad, size_t size) +{ + return ALIGN_DOWN(size, iovad->granule); +} + static inline dma_addr_t iova_dma_addr(struct iova_domain *iovad, struct iova *iova) { return (dma_addr_t)iova->pfn_lo << iova_shift(iovad); From 058f32e8b26b0fe326d56949cc9c3cf654109b0f Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 7 May 2024 10:21:10 -0300 Subject: [PATCH 160/352] iommu/arm-smmu-v3: Make the kunit into a module It turns out kconfig has problems ensuring the SMMU module and the KUNIT module are consistently y/m to allow linking. It will permit KUNIT to be a module while SMMU is built in. Also, Fedora apparently enables kunit on production kernels. So, put the entire kunit in its own module using the VISIBLE_IF_KUNIT/EXPORT_SYMBOL_IF_KUNIT machinery. This keeps it out of vmlinus on Fedora and makes the kconfig work in the normal way. 
There is no cost if kunit is disabled. Fixes: 56e1a4cc2588 ("iommu/arm-smmu-v3: Add unit tests for arm_smmu_write_entry") Reported-by: Thorsten Leemhuis Link: https://lore.kernel.org/all/aeea8546-5bce-4c51-b506-5d2008e52fef@leemhuis.info Signed-off-by: Jason Gunthorpe Tested-by: Thorsten Leemhuis Acked-by: Will Deacon Link: https://lore.kernel.org/r/0-v1-24cba6c0f404+2ae-smmu_kunit_module_jgg@nvidia.com Signed-off-by: Joerg Roedel (cherry picked from commit da55da5a42d4247d7a48b843fa5fcd9a4a10f4fe) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/Kconfig | 2 +- drivers/iommu/arm/arm-smmu-v3/Makefile | 3 ++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 1 + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 3 +++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 8 ++++++++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index c503a384215c8..2591fea0a2bee 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -415,7 +415,7 @@ config ARM_SMMU_V3_SVA and PRI. config ARM_SMMU_V3_KUNIT_TEST - bool "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS + tristate "KUnit tests for arm-smmu-v3 driver" if !KUNIT_ALL_TESTS depends on KUNIT depends on ARM_SMMU_V3_SVA default KUNIT_ALL_TESTS diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index 0b97054b3929b..014a997753a8a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -2,5 +2,6 @@ obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o arm_smmu_v3-objs-y += arm-smmu-v3.o arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o -arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_KUNIT_TEST) += arm-smmu-v3-test.o arm_smmu_v3-objs := $(arm_smmu_v3-objs-y) + +obj-$(CONFIG_ARM_SMMU_V3_KUNIT_TEST) += arm-smmu-v3-test.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 34a977a0767d4..e490ffb380154 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -185,6 +185,7 @@ void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, */ target->data[3] = cpu_to_le64(read_sysreg(mair_el1)); } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_sva_cd); static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c index 417804392ff08..315e487fd990e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -463,3 +463,6 @@ static struct kunit_suite arm_smmu_v3_test_module = { .test_cases = arm_smmu_v3_test_cases, }; kunit_test_suites(&arm_smmu_v3_test_module); + +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); +MODULE_LICENSE("GPL v2"); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c2810eab908ab..852ed94cd1e0a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1007,6 +1007,7 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) if (cfg == STRTAB_STE_0_CFG_BYPASS) used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_ste_used); /* * Figure out if we can do a hitless update of entry to become target. 
Returns a @@ -1141,6 +1142,7 @@ void arm_smmu_write_entry(struct arm_smmu_entry_writer *writer, __le64 *entry, entry_set(writer, entry, target, 0, NUM_ENTRY_QWORDS)); } } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_write_entry); static void arm_smmu_sync_cd(struct arm_smmu_master *master, int ssid, bool leaf) @@ -1268,6 +1270,7 @@ void arm_smmu_get_cd_used(const __le64 *ent, __le64 *used_bits) used_bits[1] &= ~cpu_to_le64(CTXDESC_CD_1_TTB0_MASK); } } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_get_cd_used); static void arm_smmu_cd_writer_sync_entry(struct arm_smmu_entry_writer *writer) { @@ -1332,6 +1335,7 @@ void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, CTXDESC_CD_1_TTB0_MASK); target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.mair); } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_s1_cd); void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid) { @@ -1515,6 +1519,7 @@ void arm_smmu_make_abort_ste(struct arm_smmu_ste *target) STRTAB_STE_0_V | FIELD_PREP(STRTAB_STE_0_CFG, STRTAB_STE_0_CFG_ABORT)); } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_abort_ste); VISIBLE_IF_KUNIT void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, @@ -1529,6 +1534,7 @@ void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, target->data[1] = cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_bypass_ste); VISIBLE_IF_KUNIT void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, @@ -1580,6 +1586,7 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, cpu_to_le64(FIELD_PREP(STRTAB_STE_2_S2VMID, 0)); } } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_cdtable_ste); VISIBLE_IF_KUNIT void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, @@ -1627,6 +1634,7 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s2_cfg.vttbr & STRTAB_STE_3_S2TTB_MASK); } +EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_s2_domain_ste); /* * This can safely directly manipulate the STE memory without a sync sequence From fe94dbf6cc51240f1d5697911d3f573b1709db80 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 16 May 2024 11:48:30 -0600 Subject: [PATCH 161/352] vfio/pci: Restore zero affected bus reset devices warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Yi notes relative to commit f6944d4a0b87 ("vfio/pci: Collect hot-reset devices to local buffer") that we previously tested the resulting device count with a WARN_ON, which was removed when we switched to the in-loop user copy in commit b56b7aabcf3c ("vfio/pci: Copy hot-reset device info to userspace in the devices loop"). Finding no devices in the bus/slot would be an unexpected condition, so let's restore the warning and trigger a -ERANGE error here as success with no devices would be an unexpected result to userspace as well. Suggested-by: Yi Liu Reviewed-by: Cédric Le Goater Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20240516174831.2257970-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson (cherry picked from commit cbb325e77fbe62a06184175aa98c9eb98736c3e8) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
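For context, the count reported here feeds the ioctl's usual two-call size negotiation; the sketch below shows the userspace side of that flow. It is illustrative only, not part of this patch, and device_fd is assumed to be an open VFIO device descriptor. With this change, a bus reporting zero affected devices now fails with ERANGE instead of returning success with an empty set:

	struct vfio_pci_hot_reset_info hdr = { .argsz = sizeof(hdr) };

	/* First call: no room for device entries, so a populated bus fails
	 * with ENOSPC and reports the required count in hdr.count. */
	if (ioctl(device_fd, VFIO_DEVICE_GET_PCI_HOT_RESET_INFO, &hdr) < 0 &&
	    errno == ENOSPC) {
		/* allocate argsz for hdr.count entries, then call again */
	}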
Ochs --- drivers/vfio/pci/vfio_pci_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index d8c95cc16be81..80cae87fff36e 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1281,6 +1281,9 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( if (ret) return ret; + if (WARN_ON(!count)) /* Should always be at least one */ + return -ERANGE; + if (count > (hdr.argsz - sizeof(hdr)) / sizeof(*devices)) { hdr.count = count; ret = -ENOSPC; From 25a6a14f37041e24f82f0d3cff45270a9ef1609c Mon Sep 17 00:00:00 2001 From: Mostafa Saleh Date: Tue, 4 Jun 2024 18:52:18 +0000 Subject: [PATCH 162/352] iommu/arm-smmu-v3: Avoid uninitialized asid in case of error Static checker is complaining about the ASID possibly set uninitialized. This only happens in case of error and this value would be ignored anyway. A simple fix would be just to initialize the local variable to zero, this path will only be reached on the first attach to a domain where the CD is already initialized to zero. This avoids having to bloat the function with an error path. Closes: https://lore.kernel.org/linux-iommu/849e3d77-0a3c-43c4-878d-a0e061c8cd61@moroto.mountain/T/#u Reported-by: Dan Carpenter Signed-off-by: Mostafa Saleh Fixes: 04905c17f648 ("iommu/arm-smmu-v3: Build the whole CD in arm_smmu_make_s1_cd()") Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240604185218.2602058-1-smostafa@google.com Signed-off-by: Will Deacon (cherry picked from commit d3867e7148318e12b5d69b64950622f5ed06fe86) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 852ed94cd1e0a..ee0acb87a0e5d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2302,7 +2302,7 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu, struct arm_smmu_domain *smmu_domain) { int ret; - u32 asid; + u32 asid = 0; struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; refcount_set(&cd->refs, 1); From 487578f41a20f0880220d18843af6368f8cb6619 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 8 May 2024 18:15:55 +0300 Subject: [PATCH 163/352] iommu/arm-smmu-v3: Use *-y instead of *-objs in Makefile *-objs suffix is reserved rather for (user-space) host programs while usually *-y suffix is used for kernel drivers (although *-objs works for that purpose for now). Let's correct the old usages of *-objs in Makefiles. Signed-off-by: Andy Shevchenko Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240508151611.1444352-1-andriy.shevchenko@linux.intel.com Signed-off-by: Will Deacon (cherry picked from commit 16c0bad7ae04e4a1e7361fbb91573248de06a008) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/Makefile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index 014a997753a8a..355173d1441d2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o -arm_smmu_v3-objs-y += arm-smmu-v3.o -arm_smmu_v3-objs-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o -arm_smmu_v3-objs := $(arm_smmu_v3-objs-y) +arm_smmu_v3-y := arm-smmu-v3.o +arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o obj-$(CONFIG_ARM_SMMU_V3_KUNIT_TEST) += arm-smmu-v3-test.o From 63e993be78d4d17e88d00755c9aaa2da06bb6ee8 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 28 May 2024 12:54:58 +0800 Subject: [PATCH 164/352] iommu: Make iommu_sva_domain_alloc() static iommu_sva_domain_alloc() is only called in iommu-sva.c, hence make it static. On the other hand, iommu_sva_domain_alloc() should not return NULL anymore after commit <80af5a452024> ("iommu: Add ops->domain_alloc_sva()"), the removal of inline code avoids potential confusion. Fixes: 80af5a452024 ("iommu: Add ops->domain_alloc_sva()") Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240528045458.81458-1-baolu.lu@linux.intel.com Signed-off-by: Joerg Roedel (cherry picked from commit b5c29fba72a6c950655d1cb0f6aa16b60dc83be7) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommu-sva.c | 6 ++++-- include/linux/iommu.h | 8 -------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 18a35e798b729..25e5812992264 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -10,6 +10,8 @@ #include "iommu-priv.h" static DEFINE_MUTEX(iommu_sva_lock); +static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); /* Allocate a PASID for the mm within range (inclusive) */ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct device *dev) @@ -277,8 +279,8 @@ static int iommu_sva_iopf_handler(struct iopf_group *group) return 0; } -struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, - struct mm_struct *mm) +static struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) { const struct iommu_ops *ops = dev_iommu_ops(dev); struct iommu_domain *domain; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index a44ff9e17b1b1..365e7d3d418c5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1519,8 +1519,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm); void iommu_sva_unbind_device(struct iommu_sva *handle); u32 iommu_sva_get_pasid(struct iommu_sva *handle); -struct iommu_domain *iommu_sva_domain_alloc(struct device *dev, - struct mm_struct *mm); #else static inline struct iommu_sva * iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) @@ -1545,12 +1543,6 @@ static inline u32 mm_get_enqcmd_pasid(struct mm_struct *mm) } static inline void mm_pasid_drop(struct mm_struct *mm) {} - -static inline struct iommu_domain * -iommu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) -{ - return NULL; -} #endif /* CONFIG_IOMMU_SVA */ #ifdef CONFIG_IOMMU_IOPF From 9fad206af82481000babb51c2f30800737047427 Mon Sep 17 00:00:00 2001 From: 
Robin Murphy Date: Tue, 4 Jun 2024 13:39:09 +0100 Subject: [PATCH 165/352] iommu/dma: Prune redundant pgprot arguments Somewhere amongst previous refactorings, the pgprot value in __iommu_dma_alloc_noncontiguous() became entirely unused, and the one used in iommu_dma_alloc_remap() can be computed locally rather than by its one remaining caller. Clean 'em up. Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/c2a81b72df59a71a13f8bad94f834e627c4c93dd.1717504749.git.robin.murphy@arm.com Signed-off-by: Joerg Roedel (cherry picked from commit 8d485a69603f667032d61daf4f1cb9464f315e1c) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/dma-iommu.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 053340d0af2a7..19537aef13ffd 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -942,8 +942,7 @@ static struct page **__iommu_dma_alloc_pages(struct device *dev, * but an IOMMU which supports smaller pages might not map the whole thing. */ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, - size_t size, struct sg_table *sgt, gfp_t gfp, pgprot_t prot, - unsigned long attrs) + size_t size, struct sg_table *sgt, gfp_t gfp, unsigned long attrs) { struct iommu_domain *domain = iommu_get_dma_domain(dev); struct iommu_dma_cookie *cookie = domain->iova_cookie; @@ -1017,15 +1016,14 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev, } static void *iommu_dma_alloc_remap(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, pgprot_t prot, - unsigned long attrs) + dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs) { struct page **pages; struct sg_table sgt; void *vaddr; + pgprot_t prot = dma_pgprot(dev, PAGE_KERNEL, attrs); - pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, prot, - attrs); + pages = __iommu_dma_alloc_noncontiguous(dev, size, &sgt, gfp, attrs); if (!pages) return NULL; *dma_handle = sgt.sgl->dma_address; @@ -1052,8 +1050,7 @@ static struct sg_table *iommu_dma_alloc_noncontiguous(struct device *dev, if (!sh) return NULL; - sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp, - PAGE_KERNEL, attrs); + sh->pages = __iommu_dma_alloc_noncontiguous(dev, size, &sh->sgt, gfp, attrs); if (!sh->pages) { kfree(sh); return NULL; @@ -1622,8 +1619,7 @@ static void *iommu_dma_alloc(struct device *dev, size_t size, if (gfpflags_allow_blocking(gfp) && !(attrs & DMA_ATTR_FORCE_CONTIGUOUS)) { - return iommu_dma_alloc_remap(dev, size, handle, gfp, - dma_pgprot(dev, PAGE_KERNEL, attrs), attrs); + return iommu_dma_alloc_remap(dev, size, handle, gfp, attrs); } if (IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) && From fc0a9d02512804346e0724fa6b4add10ba0513b7 Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 13 Jun 2024 18:14:36 -0700 Subject: [PATCH 166/352] iommu/iova: Add missing MODULE_DESCRIPTION() macro With ARCH=arm, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/iommu/iova.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20240613-md-arm-drivers-iommu-v1-1-1fe0bd953119@quicinc.com Signed-off-by: Joerg Roedel (cherry picked from commit c94ad1d5e3885bd4fa6abb695baf5a8f5c3c309c) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iova.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index d59d0ea2fd219..16c6adff3eb7b 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -1000,4 +1000,5 @@ void iova_cache_put(void) EXPORT_SYMBOL_GPL(iova_cache_put); MODULE_AUTHOR("Anil S Keshavamurthy "); +MODULE_DESCRIPTION("IOMMU I/O Virtual Address management"); MODULE_LICENSE("GPL"); From 0dcba268247db8fa656f47a591fc4ca6f0decda0 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 22 May 2024 10:26:49 +0200 Subject: [PATCH 167/352] iommufd: Use atomic_long_try_cmpxchg() in incr_user_locked_vm() Use atomic_long_try_cmpxchg() instead of atomic_long_cmpxchg(*ptr, old, new) != old in incr_user_locked_vm(). The x86 CMPXCHG instruction returns success in the ZF flag, so this change saves a compare after cmpxchg (and a related move instruction in front of cmpxchg). Also, atomic_long_try_cmpxchg() implicitly assigns the old *ptr value to "old" when cmpxchg fails, so there is no need to re-read the value in the loop. Signed-off-by: Uros Bizjak Cc: Jason Gunthorpe Cc: Kevin Tian Cc: Joerg Roedel Cc: Will Deacon Cc: Robin Murphy Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240522082729.971123-3-ubizjak@gmail.com Signed-off-by: Joerg Roedel (cherry picked from commit b95a40122a8183873736e0506df8e3a881178099) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/pages.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 528f356238b34..117f644a0c5b7 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -809,13 +809,14 @@ static int incr_user_locked_vm(struct iopt_pages *pages, unsigned long npages) lock_limit = task_rlimit(pages->source_task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + cur_pages = atomic_long_read(&pages->source_user->locked_vm); do { - cur_pages = atomic_long_read(&pages->source_user->locked_vm); new_pages = cur_pages + npages; if (new_pages > lock_limit) return -ENOMEM; - } while (atomic_long_cmpxchg(&pages->source_user->locked_vm, cur_pages, - new_pages) != cur_pages); + } while (!atomic_long_try_cmpxchg(&pages->source_user->locked_vm, + &cur_pages, new_pages)); return 0; } From 8210d450d4f8f6c14281df606063bb8370f386e3 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:00:55 +0100 Subject: [PATCH 168/352] iommufd/selftest: Fix dirty bitmap tests with u8 bitmaps With 64k base pages, the first 128k iova length test requires less than a byte for a bitmap, exposing a bug in the tests that assume that bitmaps are at least a byte. Rather than dealing with bytes, have _test_mock_dirty_bitmaps() pass the number of bits. The caller functions are adjusted to use bits as well, converting to bytes only when clearing, allocating and freeing the bitmap. Link: https://lore.kernel.org/r/20240627110105.62325-2-joao.m.martins@oracle.com Reported-by: Matt Ochs Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit ec61f820a2ff07d1717583bd57d6ee45d2763a6e) Signed-off-by: Matthew R.
Ochs --- tools/testing/selftests/iommu/iommufd.c | 10 +++++----- tools/testing/selftests/iommu/iommufd_utils.h | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index edf1c99c9936c..0b04d782a19fc 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1722,6 +1722,7 @@ FIXTURE_VARIANT(iommufd_dirty_tracking) FIXTURE_SETUP(iommufd_dirty_tracking) { + unsigned long size; int mmap_flags; void *vrc; int rc; @@ -1749,12 +1750,11 @@ FIXTURE_SETUP(iommufd_dirty_tracking) assert(vrc == self->buffer); self->page_size = MOCK_PAGE_SIZE; - self->bitmap_size = - variant->buffer_size / self->page_size / BITS_PER_BYTE; + self->bitmap_size = variant->buffer_size / self->page_size; /* Provision with an extra (PAGE_SIZE) for the unaligned case */ - rc = posix_memalign(&self->bitmap, PAGE_SIZE, - self->bitmap_size + PAGE_SIZE); + size = DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE); + rc = posix_memalign(&self->bitmap, PAGE_SIZE, size + PAGE_SIZE); assert(!rc); assert(self->bitmap); assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); @@ -1775,7 +1775,7 @@ FIXTURE_SETUP(iommufd_dirty_tracking) FIXTURE_TEARDOWN(iommufd_dirty_tracking) { munmap(self->buffer, variant->buffer_size); - munmap(self->bitmap, self->bitmap_size); + munmap(self->bitmap, DIV_ROUND_UP(self->bitmap_size, BITS_PER_BYTE)); teardown_iommufd(self->fd, _metadata); } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 8d2b46b2114da..c612fbf0195ba 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -22,6 +22,8 @@ #define BIT_MASK(nr) (1UL << ((nr) % __BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / __BITS_PER_LONG) +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + static inline void set_bit(unsigned int nr, unsigned long *addr) { unsigned long mask = BIT_MASK(nr); @@ -346,12 +348,12 @@ static int _test_cmd_mock_domain_set_dirty(int fd, __u32 hwpt_id, size_t length, static int _test_mock_dirty_bitmaps(int fd, __u32 hwpt_id, size_t length, __u64 iova, size_t page_size, size_t pte_page_size, __u64 *bitmap, - __u64 bitmap_size, __u32 flags, + __u64 nbits, __u32 flags, struct __test_metadata *_metadata) { unsigned long npte = pte_page_size / page_size, pteset = 2 * npte; - unsigned long nbits = bitmap_size * BITS_PER_BYTE; unsigned long j, i, nr = nbits / pteset ?: 1; + unsigned long bitmap_size = DIV_ROUND_UP(nbits, BITS_PER_BYTE); __u64 out_dirty = 0; /* Mark all even bits as dirty in the mock domain */ From 959bab88161cf51163249d7fc243ba537c2ef5fb Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:00:56 +0100 Subject: [PATCH 169/352] iommufd/selftest: Fix iommufd_test_dirty() to handle <= u8 bitmaps Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit 9560393b830b415b2151b3dd8e065257cccbffa7) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 7a2199470f312..654ed33390957 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1334,7 +1334,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, } max = length / page_size; - bitmap_size = max / BITS_PER_BYTE; + bitmap_size = DIV_ROUND_UP(max, BITS_PER_BYTE); tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT); if (!tmp) { From 934a7fdd1419137cb3590ae5746a05e0a344a23b Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:00:57 +0100 Subject: [PATCH 170/352] iommufd/selftest: Add tests for <= u8 bitmap sizes Add more tests for bitmaps smaller than or equal to a u8, though skip the tests if the IOVA buffer size is smaller than the mock page size. Link: https://lore.kernel.org/r/20240627110105.62325-4-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit 33335584eb78c0bda21ff8d759c39e035abb48ac) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- tools/testing/selftests/iommu/iommufd.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 0b04d782a19fc..61189215e1ab7 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1727,6 +1727,12 @@ FIXTURE_SETUP(iommufd_dirty_tracking) void *vrc; int rc; + if (variant->buffer_size < MOCK_PAGE_SIZE) { + SKIP(return, + "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%lu", + variant->buffer_size, MOCK_PAGE_SIZE); + } + self->fd = open("/dev/iommu", O_RDWR); ASSERT_NE(-1, self->fd); @@ -1779,6 +1785,18 @@ FIXTURE_TEARDOWN(iommufd_dirty_tracking) teardown_iommufd(self->fd, _metadata); } +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty8k) +{ + /* half of an u8 index bitmap */ + .buffer_size = 8UL * 1024UL, +}; + +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty16k) +{ + /* one u8 index bitmap */ + .buffer_size = 16UL * 1024UL, +}; + FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128k) { /* one u32 index bitmap */ From bd97196b0c7b34136b924537d44b1b6bdabc6f44 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:00:58 +0100 Subject: [PATCH 171/352] iommufd/selftest: Fix tests to use MOCK_PAGE_SIZE based buffer sizes commit a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") added tests covering edge cases in the boundaries of iova bitmap. However, it sized the buffers in terms of PAGE_SIZE (4K) as opposed to the MOCK_PAGE_SIZE (2K) that is used in the iommufd mock selftests. This meant it wasn't correctly exercising everything, specifically the u32 and 4K bitmap test cases. Fix the selftest buffer sizes to be based on the mock page size. Link: https://lore.kernel.org/r/20240627110105.62325-5-joao.m.martins@oracle.com Reported-by: Kevin Tian Closes: https://lore.kernel.org/linux-iommu/96efb6cf-a41c-420f-9673-2f0b682cac8c@oracle.com/ Fixes: a9af47e382a4 ("iommufd/selftest: Test IOMMU_HWPT_GET_DIRTY_BITMAP") Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit ffa3c799ce157493615f9f3c2b3c9ba602d320b9) Signed-off-by: Matthew R.
Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- tools/testing/selftests/iommu/iommufd.c | 36 ++++++++++++------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 61189215e1ab7..5f7d5a5ba89b0 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -1797,47 +1797,47 @@ FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty16k) .buffer_size = 16UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128k) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty64k) { /* one u32 index bitmap */ - .buffer_size = 128UL * 1024UL, + .buffer_size = 64UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256k) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128k) { /* one u64 index bitmap */ - .buffer_size = 256UL * 1024UL, + .buffer_size = 128UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty640k) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty320k) { /* two u64 index and trailing end bitmap */ - .buffer_size = 640UL * 1024UL, + .buffer_size = 320UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty64M) { - /* 4K bitmap (128M IOVA range) */ - .buffer_size = 128UL * 1024UL * 1024UL, + /* 4K bitmap (64M IOVA range) */ + .buffer_size = 64UL * 1024UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M_huge) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty64M_huge) { - /* 4K bitmap (128M IOVA range) */ - .buffer_size = 128UL * 1024UL * 1024UL, + /* 4K bitmap (64M IOVA range) */ + .buffer_size = 64UL * 1024UL * 1024UL, .hugepages = true, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M) { - /* 8K bitmap (256M IOVA range) */ - .buffer_size = 256UL * 1024UL * 1024UL, + /* 8K bitmap (128M IOVA range) */ + .buffer_size = 128UL * 1024UL * 1024UL, }; -FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty256M_huge) +FIXTURE_VARIANT_ADD(iommufd_dirty_tracking, domain_dirty128M_huge) { - /* 8K bitmap (256M IOVA range) */ - .buffer_size = 256UL * 1024UL * 1024UL, + /* 8K bitmap (128M IOVA range) */ + .buffer_size = 128UL * 1024UL * 1024UL, .hugepages = true, }; From 02dfdf255cb1fb4a4931b6e1c8ae43e8437bf3df Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:00:59 +0100 Subject: [PATCH 172/352] iommufd/selftest: Do not record head iova to better match iommu drivers Do not set a hugepage-aligned IOVA for incrementing an IOVA, to better match current IOMMU driver implementations. Keep the logic of clearing all IOPTE dirty bits for a whole hugepage, even if the range being dirtied starts from part of the hugepage. This is also similar to AMD driver (iommu v1 format) where IOMMU uses various subpage PTE data for dirty tracking (for non-standard page sizes). Link: https://lore.kernel.org/r/20240627110105.62325-6-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit dceb5304d7263f72333d25e5940254f98b663010) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iommufd/selftest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 654ed33390957..7a70a3e0fee6a 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -266,8 +266,8 @@ static int mock_domain_read_and_clear_dirty(struct iommu_domain *domain, /* Clear dirty */ if (mock_test_and_clear_dirty(mock, head, pgsize, flags)) - iommu_dirty_bitmap_record(dirty, head, pgsize); - iova = head + pgsize; + iommu_dirty_bitmap_record(dirty, iova, pgsize); + iova += pgsize; } while (iova < end); return 0; From 99e03cb372e25427e17669e2f7dfa2e6d4c6da22 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:01:00 +0100 Subject: [PATCH 173/352] iommufd/iova_bitmap: Check iova_bitmap_done() after set ahead After iova_bitmap_set_ahead() returns it may be at the end of the range. Move iova_bitmap_set_ahead() earlier to avoid an unnecessary attempt at pinning the next pages, by reusing the iova_bitmap_done() check. Fixes: 2780025e01e2 ("iommufd/iova_bitmap: Handle recording beyond the mapped pages") Link: https://lore.kernel.org/r/20240627110105.62325-7-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit 792583656f554e35383d6b2325371c8fe056a56b) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/iova_bitmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index db8c46bee1559..e33ddfc239b5b 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -384,8 +384,6 @@ static int iova_bitmap_advance(struct iova_bitmap *bitmap) bitmap->mapped_base_index += count; iova_bitmap_put(bitmap); - if (iova_bitmap_done(bitmap)) - return 0; /* Iterate, set and skip any bits requested for next iteration */ if (bitmap->set_ahead_length) { @@ -396,6 +394,9 @@ static int iova_bitmap_advance(struct iova_bitmap *bitmap) return ret; } + if (iova_bitmap_done(bitmap)) + return 0; + /* When advancing the index we pin the next set of bitmap pages */ return iova_bitmap_get(bitmap); } From cf346ada9853e5e8f6a3305d4d1818e9f4341590 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:01:01 +0100 Subject: [PATCH 174/352] iommufd/iova_bitmap: Cache mapped length in iova_bitmap_map struct The amount of IOVA mapped will be used more often in iova_bitmap_set(), in preparation for iterating the bitmap dynamically. Cache said length to avoid having to calculate it all the time. Link: https://lore.kernel.org/r/20240627110105.62325-8-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit a84c690e10ae03f1cddec908ac7f5068ceed67a8) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/iova_bitmap.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index e33ddfc239b5b..2b87ea16ad669 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -35,6 +35,9 @@ struct iova_bitmap_map { /* base IOVA representing bit 0 of the first page */ unsigned long iova; + /* mapped length */ + unsigned long length; + /* page size order that each bit granules to */ unsigned long pgshift; @@ -156,6 +159,8 @@ static unsigned long iova_bitmap_mapped_iova(struct iova_bitmap *bitmap) return bitmap->iova + iova_bitmap_index_to_offset(bitmap, skip); } +static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap); + /* * Pins the bitmap user pages for the current range window. * This is internal to IOVA bitmap and called when advancing the @@ -206,6 +211,7 @@ static int iova_bitmap_get(struct iova_bitmap *bitmap) * aligned. */ mapped->pgoff = offset_in_page(addr); + mapped->length = iova_bitmap_mapped_length(bitmap); return 0; } From ce80e91854c7b15f6c6d8ffd190d3297884a02ae Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:01:02 +0100 Subject: [PATCH 175/352] iommufd/iova_bitmap: Move initial pinning to iova_bitmap_for_each() The pinned pages are only relevant when it starts iterating the bitmap so defer that into iova_bitmap_for_each(). Link: https://lore.kernel.org/r/20240627110105.62325-9-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit 781bc08797a2146400332acf2d7706793b51e20f) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/iova_bitmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index 2b87ea16ad669..b94636b7977eb 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -269,9 +269,6 @@ struct iova_bitmap *iova_bitmap_alloc(unsigned long iova, size_t length, goto err; } - rc = iova_bitmap_get(bitmap); - if (rc) - goto err; return bitmap; err: @@ -425,6 +422,10 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, { int ret = 0; + ret = iova_bitmap_get(bitmap); + if (ret) + return ret; + for (; !iova_bitmap_done(bitmap) && !ret; ret = iova_bitmap_advance(bitmap)) { ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap), From 7458096827aedad1b713707fae7423ca2fc17391 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:01:03 +0100 Subject: [PATCH 176/352] iommufd/iova_bitmap: Consolidate iova_bitmap_set exit conditionals There's no need to have two conditionals when they are closely tied together. Move the setting of bitmap::set_ahead_length after it checks for ::pages array out of bounds access. Link: https://lore.kernel.org/r/20240627110105.62325-10-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit 00fa1a89917fbc319909231e648439b26e8857af) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iommufd/iova_bitmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index b94636b7977eb..be97bb464f6bb 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -465,18 +465,18 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, last_bit - cur_bit + 1); void *kaddr; - if (unlikely(page_idx > last_page_idx)) + if (unlikely(page_idx > last_page_idx)) { + unsigned long left = + ((last_bit - cur_bit + 1) << mapped->pgshift); + + bitmap->set_ahead_length = left; break; + } kaddr = kmap_local_page(mapped->pages[page_idx]); bitmap_set(kaddr, offset, nbits); kunmap_local(kaddr); cur_bit += nbits; } while (cur_bit <= last_bit); - - if (unlikely(cur_bit <= last_bit)) { - bitmap->set_ahead_length = - ((last_bit - cur_bit + 1) << bitmap->mapped.pgshift); - } } EXPORT_SYMBOL_NS_GPL(iova_bitmap_set, IOMMUFD); From a92129189abebf708ba8686b15666d71b2230dd8 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:01:04 +0100 Subject: [PATCH 177/352] iommufd/iova_bitmap: Dynamic pinning on iova_bitmap_set() Today zerocopy IOVA bitmaps use a static iteration scheme that walks the bitmap data in a maximum iteration size of 2M of bitmap data at a time. That translates to a fixed window of IOVA space that can span up to 64G (e.g. base pages, x86). Here 'window' refers to the IOVA space represented by the bitmap data it is iterating. This static scheme is ideal when the reported page size is the same as the one behind the dirty tracker. However, problems start to appear when the dirty tracker may dirty many PTE sizes, beyond or unaligned at the boundaries of the iteration window. Such is the case for the IOMMU, and commit 2780025e01e2 ("iommufd/iova_bitmap: Handle recording beyond the mapped pages") tried to fix the problem by handling the PTEs that get dirtied beyond the end of the iteration. But the fix was incomplete, and it didn't handle all the data structure issues, namely: 1) when there's nothing to dirty but the end of the iteration IOVA range is an IOMMU hugepage PTE that crosses iterations, the next iteration finds the other end of said hugepage but doesn't account for having checked that IOPTE already. The IOMMU driver then walks the IOVA space as if it were a new page, without accounting that it is past the start of a bigger page, which ends up setting (future) dirty bits slightly offset. Note that the partial ranges here are self-induced, a result of the fixed 'window' scheme being unaligned to this hugepage IOPTE. 2) along the same lines, between pinning pages of different iterations, DMA could mark PTEs as dirty on the second part of the previously mentioned partial hugepage. This leads to marking part of the hugepage as dirty while still clearing the IOPTE, leading to missed dirty data. So, to fix these problems more fundamentally and avoid future ones: instead of iterating the whole bitmap in fixed chunks, only pin the bitmap pages when there are dirty bits to set. The logic is simple in iova_bitmap_set(): check whether the current IOVA range to be marked as dirty is pinned, and if it's not, pin the bitmap pages where the to-be-recorded @iova starts. If it's only partially mapped, continue pinning and setting bits until the whole dirty size is covered.
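As a rough illustration of this scheme (an editor's sketch under simplifying assumptions, not code from the patch), the pinned window can be reduced to a plain range, with window_covers() and window_advance_to() as invented stand-ins for iova_bitmap_mapped_range() and iova_bitmap_advance_to():

    #include <stdbool.h>
    #include <stdio.h>

    /* Stand-in for the pages currently pinned behind the bitmap */
    struct window { unsigned long start, len; };

    static bool window_covers(const struct window *w, unsigned long iova,
                              unsigned long length)
    {
            return iova >= w->start && iova + length - 1 <= w->start + w->len - 1;
    }

    /* Re-pin the window so that it begins at @iova */
    static void window_advance_to(struct window *w, unsigned long iova)
    {
            w->start = iova;
    }

    /* Mark [iova, iova + length) dirty, re-pinning the window on demand */
    static void bitmap_set_dirty(struct window *w, unsigned long iova,
                                 unsigned long length)
    {
    again:
            if (!window_covers(w, iova, length)) {
                    window_advance_to(w, iova);
                    /* only part of the range may fit in one window */
                    unsigned long covered = w->start + w->len - iova;
                    if (covered < length) {
                            printf("set [%#lx, %#lx)\n", iova, iova + covered);
                            iova += covered;
                            length -= covered;
                            goto again;
                    }
            }
            printf("set [%#lx, %#lx)\n", iova, iova + length);
    }

    int main(void)
    {
            struct window w = { .start = 0, .len = 0x1000 };

            /* overruns the current window: set in two chunks, one per re-pin */
            bitmap_set_dirty(&w, 0x800, 0x1800);
            return 0;
    }

Running this sets the range in two chunks, one per re-pin, which is the behaviour the patch implements for ranges that overrun the pinned window; a range already inside the window sets its bits without any pinning work.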
The latter is more relevant with the AMD IOMMU pgtable v1 format, where you can have up to 64G/128G/256G page sizes and thus you can set 64G at a time. The code also gets simpler and easier to follow. Fixing this without changing the iteration scheme means changing IOMMU drivers to ignore any partial pages and not clear dirty bits, which is a bit hacky. Walking only part of an IOMMU hugepage is self-induced by this iteration scheme, as it doesn't (and can't) align the iteration boundary to the huge IOPTE at the end; thus it can't know what hugepage size the iteration should align to until it walks the begin/end of the range. Dynamic pinning adds some comparisons inside iova_bitmap_set() to check whether anything needs to be pinned when the IOVA range falls outside the currently pinned window. It has the benefit, though, that non-dirty IOVA ranges only walk page tables without needing to pin any bitmap pages. This dynamic scheme should be better for IOMMUs where upper layers don't need or know what PTE sizes IOVAs map into (and there could be more than one PTE size[*]) until they walk the IOMMU page tables. A follow-up change will remove the iteration logic. [*] Especially on the AMD v1 IOMMU pgtable format, where most powers of two are supported as page sizes. Link: https://lore.kernel.org/linux-iommu/6b90f949-48da-4cb3-ad9a-ed54f1351a9a@oracle.com/ Fixes: 2780025e01e2 ("iommufd/iova_bitmap: Handle recording beyond the mapped pages") Link: https://lore.kernel.org/r/20240627110105.62325-11-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit 7a7bba16244a5c55861d8fefea72cdbb8b05323e) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/iova_bitmap.c | 73 ++++++++++++++++++++++++++--- 1 file changed, 66 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index be97bb464f6bb..b047e1e180be7 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -119,6 +119,9 @@ struct iova_bitmap { /* length of the IOVA range set ahead the pinned pages */ unsigned long set_ahead_length; + + /* true if it using the iterator otherwise it pins dynamically */ + bool iterator; }; /* @@ -340,6 +343,17 @@ static unsigned long iova_bitmap_mapped_length(struct iova_bitmap *bitmap) return remaining; } +/* + * Returns true if [@iova..@iova+@length-1] is part of the mapped IOVA range. + */ +static bool iova_bitmap_mapped_range(struct iova_bitmap_map *mapped, + unsigned long iova, size_t length) +{ + return mapped->npages && + (iova >= mapped->iova && + (iova + length - 1) <= (mapped->iova + mapped->length - 1)); +} + /* * Returns true if there's not more data to iterate. */ @@ -374,6 +388,27 @@ static int iova_bitmap_set_ahead(struct iova_bitmap *bitmap, return ret; } +/* + * Advances to a selected range, releases the current pinned + * pages and pins the next set of bitmap pages. + * Returns 0 on success or otherwise errno.
+ */ +static int iova_bitmap_advance_to(struct iova_bitmap *bitmap, + unsigned long iova) +{ + unsigned long index; + + index = iova_bitmap_offset_to_index(bitmap, iova - bitmap->iova); + if (index >= bitmap->mapped_total_index) + return -EINVAL; + bitmap->mapped_base_index = index; + + iova_bitmap_put(bitmap); + + /* Pin the next set of bitmap pages */ + return iova_bitmap_get(bitmap); +} + /* * Advances to the next range, releases the current pinned * pages and pins the next set of bitmap pages. @@ -426,6 +461,7 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, if (ret) return ret; + bitmap->iterator = true; for (; !iova_bitmap_done(bitmap) && !ret; ret = iova_bitmap_advance(bitmap)) { ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap), @@ -433,6 +469,7 @@ int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, if (ret) break; } + bitmap->iterator = false; return ret; } @@ -452,11 +489,28 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long iova, size_t length) { struct iova_bitmap_map *mapped = &bitmap->mapped; - unsigned long cur_bit = ((iova - mapped->iova) >> - mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; - unsigned long last_bit = (((iova + length - 1) - mapped->iova) >> - mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; - unsigned long last_page_idx = mapped->npages - 1; + unsigned long cur_bit, last_bit, last_page_idx; + +update_indexes: + if (unlikely(!bitmap->iterator && + !iova_bitmap_mapped_range(mapped, iova, length))) { + + /* + * The attempt to advance the base index to @iova + * may fail if it's out of bounds, or pinning the pages + * returns an error. + * + * It is a no-op if within a iova_bitmap_for_each() closure. + */ + if (iova_bitmap_advance_to(bitmap, iova)) + return; + } + + last_page_idx = mapped->npages - 1; + cur_bit = ((iova - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; + last_bit = (((iova + length - 1) - mapped->iova) >> + mapped->pgshift) + mapped->pgoff * BITS_PER_BYTE; do { unsigned int page_idx = cur_bit / BITS_PER_PAGE; @@ -469,8 +523,13 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long left = ((last_bit - cur_bit + 1) << mapped->pgshift); - bitmap->set_ahead_length = left; - break; + if (bitmap->iterator) { + bitmap->set_ahead_length = left; + return; + } + iova += (length - left); + length = left; + goto update_indexes; } kaddr = kmap_local_page(mapped->pages[page_idx]); From c869c759f4019ae758062fc30e3862445c6c2c6d Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Thu, 27 Jun 2024 12:01:05 +0100 Subject: [PATCH 178/352] iommufd/iova_bitmap: Remove iterator logic The newly introduced dynamic pinning/windowing greatly simplifies the code, and there's no obvious performance advantage that has been identified that justifies maintaining both schemes. Remove the iterator logic and have iova_bitmap_for_each() just invoke the callback with the total iova/length. Fixes: 2780025e01e2 ("iommufd/iova_bitmap: Handle recording beyond the mapped pages") Link: https://lore.kernel.org/r/20240627110105.62325-12-joao.m.martins@oracle.com Signed-off-by: Joao Martins Reviewed-by: Kevin Tian Tested-by: Matt Ochs Signed-off-by: Jason Gunthorpe (cherry picked from commit 53e6b65693b68519dcfd384280bfc3d34c7398e2) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/iova_bitmap.c | 97 +---------------------------- 1 file changed, 2 insertions(+), 95 deletions(-) diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index b047e1e180be7..b9e964b1ad5cc 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -116,12 +116,6 @@ struct iova_bitmap { /* length of the IOVA range for the whole bitmap */ size_t length; - - /* length of the IOVA range set ahead the pinned pages */ - unsigned long set_ahead_length; - - /* true if it using the iterator otherwise it pins dynamically */ - bool iterator; }; /* @@ -354,40 +348,6 @@ static bool iova_bitmap_mapped_range(struct iova_bitmap_map *mapped, (iova + length - 1) <= (mapped->iova + mapped->length - 1)); } -/* - * Returns true if there's not more data to iterate. - */ -static bool iova_bitmap_done(struct iova_bitmap *bitmap) -{ - return bitmap->mapped_base_index >= bitmap->mapped_total_index; -} - -static int iova_bitmap_set_ahead(struct iova_bitmap *bitmap, - size_t set_ahead_length) -{ - int ret = 0; - - while (set_ahead_length > 0 && !iova_bitmap_done(bitmap)) { - unsigned long length = iova_bitmap_mapped_length(bitmap); - unsigned long iova = iova_bitmap_mapped_iova(bitmap); - - ret = iova_bitmap_get(bitmap); - if (ret) - break; - - length = min(length, set_ahead_length); - iova_bitmap_set(bitmap, iova, length); - - set_ahead_length -= length; - bitmap->mapped_base_index += - iova_bitmap_offset_to_index(bitmap, length - 1) + 1; - iova_bitmap_put(bitmap); - } - - bitmap->set_ahead_length = 0; - return ret; -} - /* * Advances to a selected range, releases the current pinned * pages and pins the next set of bitmap pages. @@ -409,36 +369,6 @@ static int iova_bitmap_advance_to(struct iova_bitmap *bitmap, return iova_bitmap_get(bitmap); } -/* - * Advances to the next range, releases the current pinned - * pages and pins the next set of bitmap pages. - * Returns 0 on success or otherwise errno. 
- */ -static int iova_bitmap_advance(struct iova_bitmap *bitmap) -{ - unsigned long iova = iova_bitmap_mapped_length(bitmap) - 1; - unsigned long count = iova_bitmap_offset_to_index(bitmap, iova) + 1; - - bitmap->mapped_base_index += count; - - iova_bitmap_put(bitmap); - - /* Iterate, set and skip any bits requested for next iteration */ - if (bitmap->set_ahead_length) { - int ret; - - ret = iova_bitmap_set_ahead(bitmap, bitmap->set_ahead_length); - if (ret) - return ret; - } - - if (iova_bitmap_done(bitmap)) - return 0; - - /* When advancing the index we pin the next set of bitmap pages */ - return iova_bitmap_get(bitmap); -} - /** * iova_bitmap_for_each() - Iterates over the bitmap * @bitmap: IOVA bitmap to iterate @@ -455,23 +385,7 @@ static int iova_bitmap_advance(struct iova_bitmap *bitmap) int iova_bitmap_for_each(struct iova_bitmap *bitmap, void *opaque, iova_bitmap_fn_t fn) { - int ret = 0; - - ret = iova_bitmap_get(bitmap); - if (ret) - return ret; - - bitmap->iterator = true; - for (; !iova_bitmap_done(bitmap) && !ret; - ret = iova_bitmap_advance(bitmap)) { - ret = fn(bitmap, iova_bitmap_mapped_iova(bitmap), - iova_bitmap_mapped_length(bitmap), opaque); - if (ret) - break; - } - bitmap->iterator = false; - - return ret; + return fn(bitmap, bitmap->iova, bitmap->length, opaque); } EXPORT_SYMBOL_NS_GPL(iova_bitmap_for_each, IOMMUFD); @@ -492,15 +406,12 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long cur_bit, last_bit, last_page_idx; update_indexes: - if (unlikely(!bitmap->iterator && - !iova_bitmap_mapped_range(mapped, iova, length))) { + if (unlikely(!iova_bitmap_mapped_range(mapped, iova, length))) { /* * The attempt to advance the base index to @iova * may fail if it's out of bounds, or pinning the pages * returns an error. - * - * It is a no-op if within a iova_bitmap_for_each() closure. */ if (iova_bitmap_advance_to(bitmap, iova)) return; @@ -523,10 +434,6 @@ void iova_bitmap_set(struct iova_bitmap *bitmap, unsigned long left = ((last_bit - cur_bit + 1) << mapped->pgshift); - if (bitmap->iterator) { - bitmap->set_ahead_length = left; - return; - } iova += (length - left); length = left; goto update_indexes; From d79c84d09c0b1597ddfa89b04f847528fdc1ce62 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:32 -0300 Subject: [PATCH 179/352] iommu/arm-smmu-v3: Convert to domain_alloc_sva() This allows the driver to receive the mm, and always a device, during allocation. Later patches need this to properly set up the notifier when the domain is first allocated. Remove ops->domain_alloc() as SVA was the only remaining purpose. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 678d79b98028ce2365b30e35479bea0e555c23d3) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 6 ++++-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 10 +--------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 8 +++----- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index e490ffb380154..28f8bf4327f69 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -656,13 +656,15 @@ static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { .free = arm_smmu_sva_domain_free }; -struct iommu_domain *arm_smmu_sva_domain_alloc(void) +struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm) { struct iommu_domain *domain; domain = kzalloc(sizeof(*domain), GFP_KERNEL); if (!domain) - return NULL; + return ERR_PTR(-ENOMEM); + domain->type = IOMMU_DOMAIN_SVA; domain->ops = &arm_smmu_sva_domain_ops; return domain; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ee0acb87a0e5d..f58067a8d6ebe 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2237,14 +2237,6 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) } } -static struct iommu_domain *arm_smmu_domain_alloc(unsigned type) -{ - - if (type == IOMMU_DOMAIN_SVA) - return arm_smmu_sva_domain_alloc(); - return ERR_PTR(-EOPNOTSUPP); -} - static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) { struct arm_smmu_domain *smmu_domain; @@ -3097,8 +3089,8 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, - .domain_alloc = arm_smmu_domain_alloc, .domain_alloc_paging = arm_smmu_domain_alloc_paging, + .domain_alloc_sva = arm_smmu_sva_domain_alloc, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .device_group = arm_smmu_device_group, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 1242a086c9f94..b10712d3de66a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -802,7 +802,8 @@ int arm_smmu_master_enable_sva(struct arm_smmu_master *master); int arm_smmu_master_disable_sva(struct arm_smmu_master *master); bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master); void arm_smmu_sva_notifier_synchronize(void); -struct iommu_domain *arm_smmu_sva_domain_alloc(void); +struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, + struct mm_struct *mm); void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t id); #else /* CONFIG_ARM_SMMU_V3_SVA */ @@ -838,10 +839,7 @@ static inline bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master static inline void arm_smmu_sva_notifier_synchronize(void) {} -static inline struct iommu_domain *arm_smmu_sva_domain_alloc(void) -{ - return NULL; -} +#define arm_smmu_sva_domain_alloc NULL static inline void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, struct device *dev, From 32e1633fcf2437e0cc33eb4e4655e33452075072 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:33 -0300 Subject: [PATCH 180/352] iommu/arm-smmu-v3: Start building a generic PASID layer Add arm_smmu_set_pasid()/arm_smmu_remove_pasid() which are to be used by callers that already 
constructed the arm_smmu_cd they wish to program. These functions will encapsulate the shared logic to set up a CD entry that will be shared by the SVA and S1 domain cases. Prior fixes had already moved most of this logic up into __arm_smmu_sva_bind(); move it to its final home. Following patches will relieve some of the remaining SVA restrictions: - The RID domain is an S1 domain and has already set up the STE to point to the CD table - The programmed PASID is the mm_get_enqcmd_pasid() - Nothing changes while SVA is running (sva_enable) SVA invalidation will still iterate over the S1 domain's master list; later patches will resolve that. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/2-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 85f2fb6ef4137c631c9d2663716d998d7e4f164f) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 57 ++++++++++--------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 32 ++++++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 9 ++- 3 files changed, 67 insertions(+), 31 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 28f8bf4327f69..71ca87c2c5c3b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -417,29 +417,27 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) arm_smmu_free_shared_cd(cd); } -static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, - struct mm_struct *mm) +static struct arm_smmu_bond *__arm_smmu_sva_bind(struct device *dev, + struct mm_struct *mm) { int ret; - struct arm_smmu_cd target; - struct arm_smmu_cd *cdptr; struct arm_smmu_bond *bond; struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct iommu_domain *domain = iommu_get_domain_for_dev(dev); struct arm_smmu_domain *smmu_domain; if (!(domain->type & __IOMMU_DOMAIN_PAGING)) - return -ENODEV; + return ERR_PTR(-ENODEV); smmu_domain = to_smmu_domain(domain); if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) - return -ENODEV; + return ERR_PTR(-ENODEV); if (!master || !master->sva_enabled) - return -ENODEV; + return ERR_PTR(-ENODEV); bond = kzalloc(sizeof(*bond), GFP_KERNEL); if (!bond) - return -ENOMEM; + return ERR_PTR(-ENOMEM); bond->mm = mm; @@ -449,22 +447,12 @@ static int __arm_smmu_sva_bind(struct device *dev, ioasid_t pasid, goto err_free_bond; } - cdptr = arm_smmu_alloc_cd_ptr(master, mm_get_enqcmd_pasid(mm)); - if (!cdptr) { - ret = -ENOMEM; - goto err_put_notifier; - } - arm_smmu_make_sva_cd(&target, master, mm, bond->smmu_mn->cd->asid); - arm_smmu_write_cd_entry(master, pasid, cdptr, &target); - list_add(&bond->list, &master->bonds); - return 0; + return bond; -err_put_notifier: - arm_smmu_mmu_notifier_put(bond->smmu_mn); err_free_bond: kfree(bond); - return ret; + return ERR_PTR(ret); } bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) @@ -611,10 +599,9 @@ void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, struct arm_smmu_bond *bond = NULL, *t; struct arm_smmu_master *master = dev_iommu_priv_get(dev); - mutex_lock(&sva_lock); - - arm_smmu_clear_cd(master, id); + arm_smmu_remove_pasid(master, to_smmu_domain(domain), id); + mutex_lock(&sva_lock); list_for_each_entry(t, &master->bonds, list) { if
(t->mm == mm) { bond = t; @@ -633,17 +620,33 @@ void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t id) { - int ret = 0; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct mm_struct *mm = domain->mm; + struct arm_smmu_bond *bond; + struct arm_smmu_cd target; + int ret; if (mm_get_enqcmd_pasid(mm) != id) return -EINVAL; mutex_lock(&sva_lock); - ret = __arm_smmu_sva_bind(dev, id, mm); - mutex_unlock(&sva_lock); + bond = __arm_smmu_sva_bind(dev, mm); + if (IS_ERR(bond)) { + mutex_unlock(&sva_lock); + return PTR_ERR(bond); + } - return ret; + arm_smmu_make_sva_cd(&target, master, mm, bond->smmu_mn->cd->asid); + ret = arm_smmu_set_pasid(master, NULL, id, &target); + if (ret) { + list_del(&bond->list); + arm_smmu_mmu_notifier_put(bond->smmu_mn); + kfree(bond); + mutex_unlock(&sva_lock); + return ret; + } + mutex_unlock(&sva_lock); + return 0; } static void arm_smmu_sva_domain_free(struct iommu_domain *domain) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index f58067a8d6ebe..6b12c4be4df6d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1211,8 +1211,8 @@ struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, return &l1_desc->l2ptr[ssid % CTXDESC_L2_ENTRIES]; } -struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, - u32 ssid) +static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, + u32 ssid) { struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; @@ -2412,6 +2412,10 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, int i, j; struct arm_smmu_device *smmu = master->smmu; + master->cd_table.in_ste = + FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(target->data[0])) == + STRTAB_STE_0_CFG_S1_TRANS; + for (i = 0; i < master->num_streams; ++i) { u32 sid = master->streams[i].id; struct arm_smmu_ste *step = @@ -2632,6 +2636,30 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return 0; } +int arm_smmu_set_pasid(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, ioasid_t pasid, + const struct arm_smmu_cd *cd) +{ + struct arm_smmu_cd *cdptr; + + /* The core code validates pasid */ + + if (!master->cd_table.in_ste) + return -ENODEV; + + cdptr = arm_smmu_alloc_cd_ptr(master, pasid); + if (!cdptr) + return -ENOMEM; + arm_smmu_write_cd_entry(master, pasid, cdptr, cd); + return 0; +} + +void arm_smmu_remove_pasid(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, ioasid_t pasid) +{ + arm_smmu_clear_cd(master, pasid); +} + static int arm_smmu_attach_dev_ste(struct device *dev, struct arm_smmu_ste *ste) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index b10712d3de66a..6a74d3d884fe8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -602,6 +602,7 @@ struct arm_smmu_ctx_desc_cfg { dma_addr_t cdtab_dma; struct arm_smmu_l1_ctx_desc *l1_desc; unsigned int num_l1_ents; + u8 in_ste; u8 s1fmt; /* log2 of the maximum number of CDs supported by this table */ u8 s1cdmax; @@ -777,8 +778,6 @@ extern struct mutex arm_smmu_asid_lock; void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid); struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master 
*master, u32 ssid); -struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, - u32 ssid); void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain); @@ -786,6 +785,12 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, struct arm_smmu_cd *cdptr, const struct arm_smmu_cd *target); +int arm_smmu_set_pasid(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, ioasid_t pasid, + const struct arm_smmu_cd *cd); +void arm_smmu_remove_pasid(struct arm_smmu_master *master, + struct arm_smmu_domain *smmu_domain, ioasid_t pasid); + void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, size_t granule, bool leaf, From d1bceae4a25855dfaffba339aeba201cd3092229 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:34 -0300 Subject: [PATCH 181/352] iommu/arm-smmu-v3: Make smmu_domain->devices into an allocated list The next patch will need to store the same master twice (with different SSIDs), so allocate memory for each list element. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit ad10dce61303d82f7bdd2dbb116e02146778f728) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 11 ++++-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 39 ++++++++++++++++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 +++- 3 files changed, 47 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index 71ca87c2c5c3b..cb3a0e4143c84 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -38,12 +38,13 @@ static DEFINE_MUTEX(sva_lock); static void arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain) { - struct arm_smmu_master *master; + struct arm_smmu_master_domain *master_domain; struct arm_smmu_cd target_cd; unsigned long flags; spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { + list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) { + struct arm_smmu_master *master = master_domain->master; struct arm_smmu_cd *cdptr; /* S1 domains only support RID attachment right now */ @@ -301,7 +302,7 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) { struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); struct arm_smmu_domain *smmu_domain = smmu_mn->domain; - struct arm_smmu_master *master; + struct arm_smmu_master_domain *master_domain; unsigned long flags; mutex_lock(&sva_lock); @@ -315,7 +316,9 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) * but disable translation. 
*/ spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + struct arm_smmu_master *master = master_domain->master; struct arm_smmu_cd target; struct arm_smmu_cd *cdptr; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6b12c4be4df6d..3328d6c9f2a70 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2015,10 +2015,10 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, unsigned long iova, size_t size) { + struct arm_smmu_master_domain *master_domain; int i; unsigned long flags; struct arm_smmu_cmdq_ent cmd; - struct arm_smmu_master *master; struct arm_smmu_cmdq_batch cmds; if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) @@ -2046,7 +2046,10 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, cmds.num = 0; spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_for_each_entry(master, &smmu_domain->devices, domain_head) { + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + struct arm_smmu_master *master = master_domain->master; + if (!master->ats_enabled) continue; @@ -2534,9 +2537,26 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) pci_disable_pasid(pdev); } +static struct arm_smmu_master_domain * +arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, + struct arm_smmu_master *master) +{ + struct arm_smmu_master_domain *master_domain; + + lockdep_assert_held(&smmu_domain->devices_lock); + + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + if (master_domain->master == master) + return master_domain; + } + return NULL; +} + static void arm_smmu_detach_dev(struct arm_smmu_master *master) { struct iommu_domain *domain = iommu_get_domain_for_dev(master->dev); + struct arm_smmu_master_domain *master_domain; struct arm_smmu_domain *smmu_domain; unsigned long flags; @@ -2547,7 +2567,11 @@ static void arm_smmu_detach_dev(struct arm_smmu_master *master) arm_smmu_disable_ats(master, smmu_domain); spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_del_init(&master->domain_head); + master_domain = arm_smmu_find_master_domain(smmu_domain, master); + if (master_domain) { + list_del(&master_domain->devices_elm); + kfree(master_domain); + } spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); master->ats_enabled = false; @@ -2561,6 +2585,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_master_domain *master_domain; struct arm_smmu_master *master; struct arm_smmu_cd *cdptr; @@ -2597,6 +2622,11 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return -ENOMEM; } + master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL); + if (!master_domain) + return -ENOMEM; + master_domain->master = master; + /* * Prevent arm_smmu_share_asid() from trying to change the ASID * of either the old or new domain while we are working on it. 
@@ -2610,7 +2640,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) master->ats_enabled = arm_smmu_ats_supported(master); spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_add(&master->domain_head, &smmu_domain->devices); + list_add(&master_domain->devices_elm, &smmu_domain->devices); spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); switch (smmu_domain->stage) { @@ -2925,7 +2955,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) master->dev = dev; master->smmu = smmu; INIT_LIST_HEAD(&master->bonds); - INIT_LIST_HEAD(&master->domain_head); dev_iommu_priv_set(dev, master); ret = arm_smmu_insert_master(smmu, master); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 6a74d3d884fe8..01769b5286a83 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -697,7 +697,6 @@ struct arm_smmu_stream { struct arm_smmu_master { struct arm_smmu_device *smmu; struct device *dev; - struct list_head domain_head; struct arm_smmu_stream *streams; /* Locked by the iommu core using the group mutex */ struct arm_smmu_ctx_desc_cfg cd_table; @@ -731,6 +730,7 @@ struct arm_smmu_domain { struct iommu_domain domain; + /* List of struct arm_smmu_master_domain */ struct list_head devices; spinlock_t devices_lock; @@ -767,6 +767,11 @@ void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, u16 asid); #endif +struct arm_smmu_master_domain { + struct list_head devices_elm; + struct arm_smmu_master *master; +}; + static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) { return container_of(dom, struct arm_smmu_domain, domain); From 1eea884a6ba9de88ecf760cd361253119524c782 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:35 -0300 Subject: [PATCH 182/352] iommu/arm-smmu-v3: Make changing domains be hitless for ATS The core code allows the domain to be changed on the fly without a forced stop in BLOCKED/IDENTITY. In this flow the driver should just continually maintain the ATS with no change while the STE is updated. ATS relies on a linked list smmu_domain->devices to keep track of which masters have the domain programmed, but this list is also used by arm_smmu_share_asid(), unrelated to ATS. Create two new functions to encapsulate this combined logic: arm_smmu_attach_prepare() arm_smmu_attach_commit() The two functions can sequence both enabling and disabling ATS across the STE store. Have every update of the STE use this sequence. Installing an S1/S2 domain always enables ATS if the PCIe device supports it. The enable flow is now ordered differently to allow it to be hitless: 1) Add the master to the new smmu_domain->devices list 2) Program the STE 3) Enable ATS at PCIe 4) Remove the master from the old smmu_domain This flow ensures that invalidations to either domain will generate an ATC invalidation to the device while the STE is being switched. Thus we don't need to turn off the ATS anymore for correctness. The disable flow is the reverse: 1) Disable ATS at PCIe 2) Program the STE 3) Invalidate the ATC 4) Remove the master from the old smmu_domain Move the nr_ats_masters adjustments to be close to the list manipulations. It is a count of the number of ATS-enabled masters currently in the list. This is strictly before and after the STE/CD are revised, and done under the list's spin_lock. This is part of the bigger picture to allow changing the RID domain while a PASID is in use.
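To make the ordering concrete, here is a hedged sketch (an editor's illustration, not the driver's real signatures; compare arm_smmu_attach_prepare() and arm_smmu_attach_commit() in the diff below) of the three-step sequence this message describes, with the structures reduced to stand-ins:

    #include <stdbool.h>

    /* Illustrative stand-ins for the driver's structures */
    struct master { bool ats_supported, ats_enabled; };
    struct domain { bool paging; };

    struct attach_state { struct master *master; bool ats_enabled; };

    /* Step 1: track the master on the new domain, start any ATS disable */
    static int attach_prepare(struct attach_state *state, struct domain *new_domain)
    {
            /* EATS may only be set when the new domain has page tables */
            state->ats_enabled = new_domain->paging && state->master->ats_supported;
            if (!state->ats_enabled && state->master->ats_enabled)
                    /* disable PCI ATS *before* the STE stops allowing it */
                    state->master->ats_enabled = false;
            return 0;
    }

    /* Step 2: store the STE with STE.EATS matching state->ats_enabled */
    static void install_ste(struct attach_state *state)
    {
            (void)state;
    }

    /* Step 3: enable PCI ATS / flush the ATC, drop the old-domain entry */
    static void attach_commit(struct attach_state *state)
    {
            if (state->ats_enabled && !state->master->ats_enabled)
                    state->master->ats_enabled = true; /* pci ATS enable here */
    }

    static int attach_domain(struct master *m, struct domain *new_domain)
    {
            struct attach_state state = { .master = m };
            int ret = attach_prepare(&state, new_domain);

            if (ret)
                    return ret;
            install_ste(&state); /* both domains still see invalidations here */
            attach_commit(&state);
            return 0;
    }

    int main(void)
    {
            struct master m = { .ats_supported = true };
            struct domain d = { .paging = true };
            return attach_domain(&m, &d);
    }

The point of the split is that the master is visible on both domains' device lists while the STE is rewritten, so neither domain's invalidations can miss the device, and PCI ATS is only ever enabled while the STE permits it.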
If a SVA PASID is relying on ATS to function then changing the RID domain cannot just temporarily toggle ATS off without also wrecking the SVA PASID. The new infrastructure here is organized so that the PASID attach/detach flows will make use of it as well in following patches. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Nicolin Chen Reviewed-by: Michael Shavit Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 7497f4211f4fbdcec5fc5bb4df7f6ccd345966e8) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 5 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 237 +++++++++++++----- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 6 +- 3 files changed, 177 insertions(+), 71 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c index 315e487fd990e..a460b71f58578 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -164,7 +164,7 @@ static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste, .smmu = &smmu, }; - arm_smmu_make_cdtable_ste(ste, &master); + arm_smmu_make_cdtable_ste(ste, &master, true); } static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test) @@ -231,7 +231,6 @@ static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste, { struct arm_smmu_master master = { .smmu = &smmu, - .ats_enabled = ats_enabled, }; struct io_pgtable io_pgtable = {}; struct arm_smmu_domain smmu_domain = { @@ -247,7 +246,7 @@ static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste, io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.sl = 3; io_pgtable.cfg.arm_lpae_s2_cfg.vtcr.tsz = 4; - arm_smmu_make_s2_domain_ste(ste, &master, &smmu_domain); + arm_smmu_make_s2_domain_ste(ste, &master, &smmu_domain, ats_enabled); } static void arm_smmu_v3_write_ste_test_s2_to_abort(struct kunit *test) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 3328d6c9f2a70..d9c0dca60eb95 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1538,7 +1538,7 @@ EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_bypass_ste); VISIBLE_IF_KUNIT void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master) + struct arm_smmu_master *master, bool ats_enabled) { struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; @@ -1561,7 +1561,7 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, STRTAB_STE_1_S1STALLD : 0) | FIELD_PREP(STRTAB_STE_1_EATS, - master->ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); if (smmu->features & ARM_SMMU_FEAT_E2H) { /* @@ -1591,7 +1591,8 @@ EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_cdtable_ste); VISIBLE_IF_KUNIT void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain) + struct arm_smmu_domain *smmu_domain, + bool ats_enabled) { struct arm_smmu_s2_cfg *s2_cfg = &smmu_domain->s2_cfg; const struct io_pgtable_cfg *pgtbl_cfg = @@ -1608,7 +1609,7 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, target->data[1] = cpu_to_le64( FIELD_PREP(STRTAB_STE_1_EATS, - master->ats_enabled ? 
STRTAB_STE_1_EATS_TRANS : 0)); + ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, @@ -2450,22 +2451,16 @@ static bool arm_smmu_ats_supported(struct arm_smmu_master *master) return dev_is_pci(dev) && pci_ats_supported(to_pci_dev(dev)); } -static void arm_smmu_enable_ats(struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain) +static void arm_smmu_enable_ats(struct arm_smmu_master *master) { size_t stu; struct pci_dev *pdev; struct arm_smmu_device *smmu = master->smmu; - /* Don't enable ATS at the endpoint if it's not enabled in the STE */ - if (!master->ats_enabled) - return; - /* Smallest Translation Unit: log2 of the smallest supported granule */ stu = __ffs(smmu->pgsize_bitmap); pdev = to_pci_dev(master->dev); - atomic_inc(&smmu_domain->nr_ats_masters); /* * ATC invalidation of PASID 0 causes the entire ATC to be flushed. */ @@ -2474,22 +2469,6 @@ static void arm_smmu_enable_ats(struct arm_smmu_master *master, dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); } -static void arm_smmu_disable_ats(struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain) -{ - if (!master->ats_enabled) - return; - - pci_disable_ats(to_pci_dev(master->dev)); - /* - * Ensure ATS is disabled at the endpoint before we issue the - * ATC invalidation via the SMMU. - */ - wmb(); - arm_smmu_atc_inv_master(master); - atomic_dec(&smmu_domain->nr_ats_masters); -} - static int arm_smmu_enable_pasid(struct arm_smmu_master *master) { int ret; @@ -2553,46 +2532,181 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, return NULL; } -static void arm_smmu_detach_dev(struct arm_smmu_master *master) +/* + * If the domain uses the smmu_domain->devices list return the arm_smmu_domain + * structure, otherwise NULL. These domains track attached devices so they can + * issue invalidations. + */ +static struct arm_smmu_domain * +to_smmu_domain_devices(struct iommu_domain *domain) +{ + /* The domain can be NULL only when processing the first attach */ + if (!domain) + return NULL; + if (domain->type & __IOMMU_DOMAIN_PAGING) + return to_smmu_domain(domain); + return NULL; +} + +static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, + struct iommu_domain *domain) { - struct iommu_domain *domain = iommu_get_domain_for_dev(master->dev); + struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain); struct arm_smmu_master_domain *master_domain; - struct arm_smmu_domain *smmu_domain; unsigned long flags; - if (!domain || !(domain->type & __IOMMU_DOMAIN_PAGING)) + if (!smmu_domain) return; - smmu_domain = to_smmu_domain(domain); - arm_smmu_disable_ats(master, smmu_domain); - spin_lock_irqsave(&smmu_domain->devices_lock, flags); master_domain = arm_smmu_find_master_domain(smmu_domain, master); if (master_domain) { list_del(&master_domain->devices_elm); kfree(master_domain); + if (master->ats_enabled) + atomic_dec(&smmu_domain->nr_ats_masters); } spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); +} + +struct arm_smmu_attach_state { + /* Inputs */ + struct iommu_domain *old_domain; + struct arm_smmu_master *master; + /* Resulting state */ + bool ats_enabled; +}; + +/* + * Start the sequence to attach a domain to a master. The sequence contains three + * steps: + * arm_smmu_attach_prepare() + * arm_smmu_install_ste_for_dev() + * arm_smmu_attach_commit() + * + * If prepare succeeds then the sequence must be completed. 
The STE installed + * must set the STE.EATS field according to state.ats_enabled. + * + * If the device supports ATS then this determines if EATS should be enabled + * in the STE, and starts sequencing EATS disable if required. + * + * The change of the EATS in the STE and the PCI ATS config space is managed by + * this sequence to be in the right order so that if PCI ATS is enabled then + * STE.EATS is enabled. + * + * new_domain can be a non-paging domain. In this case ATS will not be enabled, + * and invalidations won't be tracked. + */ +static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, + struct iommu_domain *new_domain) +{ + struct arm_smmu_master *master = state->master; + struct arm_smmu_master_domain *master_domain; + struct arm_smmu_domain *smmu_domain = + to_smmu_domain_devices(new_domain); + unsigned long flags; + + /* + * arm_smmu_share_asid() must not see two domains pointing to the same + * arm_smmu_master_domain contents otherwise it could randomly write one + * or the other to the CD. + */ + lockdep_assert_held(&arm_smmu_asid_lock); + + if (smmu_domain) { + /* + * The SMMU does not support enabling ATS with bypass/abort. + * When the STE is in bypass (STE.Config[2:0] == 0b100), ATS + * Translation Requests and Translated transactions are denied + * as though ATS is disabled for the stream (STE.EATS == 0b00), + * causing F_BAD_ATS_TREQ and F_TRANSL_FORBIDDEN events + * (IHI0070Ea 5.2 Stream Table Entry). Thus ATS can only be + * enabled if we have arm_smmu_domain, those always have page + * tables. + */ + state->ats_enabled = arm_smmu_ats_supported(master); + + master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL); + if (!master_domain) + return -ENOMEM; + master_domain->master = master; - master->ats_enabled = false; + /* + * During prepare we want the current smmu_domain and new + * smmu_domain to be in the devices list before we change any + * HW. This ensures that both domains will send ATS + * invalidations to the master until we are done. + * + * It is tempting to make this list only track masters that are + * using ATS, but arm_smmu_share_asid() also uses this to change + * the ASID of a domain, unrelated to ATS. + * + * Notice if we are re-attaching the same domain then the list + * will have two identical entries and commit will remove only + * one of them. + */ + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + if (state->ats_enabled) + atomic_inc(&smmu_domain->nr_ats_masters); + list_add(&master_domain->devices_elm, &smmu_domain->devices); + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + } + + if (!state->ats_enabled && master->ats_enabled) { + pci_disable_ats(to_pci_dev(master->dev)); + /* + * This is probably overkill, but the config write for disabling + * ATS should complete before the STE is configured to generate + * UR to avoid AER noise. + */ + wmb(); + } + return 0; +} + +/* + * Commit is done after the STE/CD are configured with the EATS setting. It + * completes synchronizing the PCI device's ATC and finishes manipulating the + * smmu_domain->devices list. + */ +static void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) +{ + struct arm_smmu_master *master = state->master; + + lockdep_assert_held(&arm_smmu_asid_lock); + + if (state->ats_enabled && !master->ats_enabled) { + arm_smmu_enable_ats(master); + } else if (master->ats_enabled) { + /* + * The translation has changed, flush the ATC.
At this point the + * SMMU is translating for the new domain and both the old&new + * domain will issue invalidations. + */ + arm_smmu_atc_inv_master(master); + } + master->ats_enabled = state->ats_enabled; + + arm_smmu_remove_master_domain(master, state->old_domain); } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) { int ret = 0; - unsigned long flags; struct arm_smmu_ste target; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); struct arm_smmu_device *smmu; struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - struct arm_smmu_master_domain *master_domain; + struct arm_smmu_attach_state state = { + .old_domain = iommu_get_domain_for_dev(dev), + }; struct arm_smmu_master *master; struct arm_smmu_cd *cdptr; if (!fwspec) return -ENOENT; - master = dev_iommu_priv_get(dev); + state.master = master = dev_iommu_priv_get(dev); smmu = master->smmu; /* @@ -2622,11 +2736,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return -ENOMEM; } - master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL); - if (!master_domain) - return -ENOMEM; - master_domain->master = master; - /* * Prevent arm_smmu_share_asid() from trying to change the ASID * of either the old or new domain while we are working on it. @@ -2635,13 +2744,11 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) */ mutex_lock(&arm_smmu_asid_lock); - arm_smmu_detach_dev(master); - - master->ats_enabled = arm_smmu_ats_supported(master); - - spin_lock_irqsave(&smmu_domain->devices_lock, flags); - list_add(&master_domain->devices_elm, &smmu_domain->devices); - spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + ret = arm_smmu_attach_prepare(&state, domain); + if (ret) { + mutex_unlock(&arm_smmu_asid_lock); + return ret; + } switch (smmu_domain->stage) { case ARM_SMMU_DOMAIN_S1: { @@ -2650,18 +2757,19 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr, &target_cd); - arm_smmu_make_cdtable_ste(&target, master); + arm_smmu_make_cdtable_ste(&target, master, state.ats_enabled); arm_smmu_install_ste_for_dev(master, &target); break; } case ARM_SMMU_DOMAIN_S2: - arm_smmu_make_s2_domain_ste(&target, master, smmu_domain); + arm_smmu_make_s2_domain_ste(&target, master, smmu_domain, + state.ats_enabled); arm_smmu_install_ste_for_dev(master, &target); arm_smmu_clear_cd(master, IOMMU_NO_PASID); break; } - arm_smmu_enable_ats(master, smmu_domain); + arm_smmu_attach_commit(&state); mutex_unlock(&arm_smmu_asid_lock); return 0; } @@ -2690,10 +2798,14 @@ void arm_smmu_remove_pasid(struct arm_smmu_master *master, arm_smmu_clear_cd(master, pasid); } -static int arm_smmu_attach_dev_ste(struct device *dev, - struct arm_smmu_ste *ste) +static int arm_smmu_attach_dev_ste(struct iommu_domain *domain, + struct device *dev, struct arm_smmu_ste *ste) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_attach_state state = { + .master = master, + .old_domain = iommu_get_domain_for_dev(dev), + }; if (arm_smmu_master_sva_enabled(master)) return -EBUSY; @@ -2704,16 +2816,9 @@ static int arm_smmu_attach_dev_ste(struct device *dev, */ mutex_lock(&arm_smmu_asid_lock); - /* - * The SMMU does not support enabling ATS with bypass/abort. 
When the - * STE is in bypass (STE.Config[2:0] == 0b100), ATS Translation Requests - * and Translated transactions are denied as though ATS is disabled for - * the stream (STE.EATS == 0b00), causing F_BAD_ATS_TREQ and - * F_TRANSL_FORBIDDEN events (IHI0070Ea 5.2 Stream Table Entry). - */ - arm_smmu_detach_dev(master); - + arm_smmu_attach_prepare(&state, domain); arm_smmu_install_ste_for_dev(master, ste); + arm_smmu_attach_commit(&state); mutex_unlock(&arm_smmu_asid_lock); /* @@ -2732,7 +2837,7 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_make_bypass_ste(master->smmu, &ste); - return arm_smmu_attach_dev_ste(dev, &ste); + return arm_smmu_attach_dev_ste(domain, dev, &ste); } static const struct iommu_domain_ops arm_smmu_identity_ops = { @@ -2750,7 +2855,7 @@ static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, struct arm_smmu_ste ste; arm_smmu_make_abort_ste(&ste); - return arm_smmu_attach_dev_ste(dev, &ste); + return arm_smmu_attach_dev_ste(domain, dev, &ste); } static const struct iommu_domain_ops arm_smmu_blocked_ops = { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 01769b5286a83..f9b4bfb2e6b72 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -758,10 +758,12 @@ void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, struct arm_smmu_ste *target); void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master); + struct arm_smmu_master *master, + bool ats_enabled); void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain); + struct arm_smmu_domain *smmu_domain, + bool ats_enabled); void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, struct arm_smmu_master *master, struct mm_struct *mm, u16 asid); From 6a0aa771f4ace565c7940af3d7620c3090ca3769 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:36 -0300 Subject: [PATCH 183/352] iommu/arm-smmu-v3: Add ssid to struct arm_smmu_master_domain Prepare to allow a S1 domain to be attached to a PASID as well. Keep track of the SSID the domain is using on each master in the arm_smmu_master_domain. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 64efb3def3a53effe01fa750eec6e7369f65e386) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 15 ++++--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 42 +++++++++++++++---- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 ++- 3 files changed, 43 insertions(+), 19 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index cb3a0e4143c84..d31caceb58498 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -47,13 +47,12 @@ arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain) struct arm_smmu_master *master = master_domain->master; struct arm_smmu_cd *cdptr; - /* S1 domains only support RID attachment right now */ - cdptr = arm_smmu_get_cd_ptr(master, IOMMU_NO_PASID); + cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid); if (WARN_ON(!cdptr)) continue; arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); - arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr, + arm_smmu_write_cd_entry(master, master_domain->ssid, cdptr, &target_cd); } spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); @@ -294,8 +293,8 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, smmu_domain); } - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), start, - size); + arm_smmu_atc_inv_domain_sva(smmu_domain, mm_get_enqcmd_pasid(mm), start, + size); } static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) @@ -332,7 +331,7 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); + arm_smmu_atc_inv_domain_sva(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); smmu_mn->cleared = true; mutex_unlock(&sva_lock); @@ -411,8 +410,8 @@ static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) */ if (!smmu_mn->cleared) { arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid); - arm_smmu_atc_inv_domain(smmu_domain, mm_get_enqcmd_pasid(mm), 0, - 0); + arm_smmu_atc_inv_domain_sva(smmu_domain, + mm_get_enqcmd_pasid(mm), 0, 0); } /* Frees smmu_mn */ diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d9c0dca60eb95..5355ebb1b0956 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2013,8 +2013,8 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) return arm_smmu_cmdq_batch_submit(master->smmu, &cmds); } -int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, - unsigned long iova, size_t size) +static int __arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, + ioasid_t ssid, unsigned long iova, size_t size) { struct arm_smmu_master_domain *master_domain; int i; @@ -2042,8 +2042,6 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, if (!atomic_read(&smmu_domain->nr_ats_masters)) return 0; - arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); - cmds.num = 0; spin_lock_irqsave(&smmu_domain->devices_lock, flags); @@ -2054,6 +2052,16 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, if (!master->ats_enabled) continue; + /* + * Non-zero ssid means SVA is co-opting the S1 domain to issue + * invalidations for SVA PASIDs. 
+ */ + if (ssid != IOMMU_NO_PASID) + arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); + else + arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, + &cmd); + for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(smmu_domain->smmu, &cmds, &cmd); @@ -2064,6 +2072,19 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds); } +static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, + unsigned long iova, size_t size) +{ + return __arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, iova, + size); +} + +int arm_smmu_atc_inv_domain_sva(struct arm_smmu_domain *smmu_domain, + ioasid_t ssid, unsigned long iova, size_t size) +{ + return __arm_smmu_atc_inv_domain(smmu_domain, ssid, iova, size); +} + /* IO_PGTABLE API */ static void arm_smmu_tlb_inv_context(void *cookie) { @@ -2085,7 +2106,7 @@ static void arm_smmu_tlb_inv_context(void *cookie) cmd.tlbi.vmid = smmu_domain->s2_cfg.vmid; arm_smmu_cmdq_issue_cmd_with_sync(smmu, &cmd); } - arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, 0, 0); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0); } static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, @@ -2183,7 +2204,7 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, * Unfortunately, this can't be leaf-only since we may have * zapped an entire table. */ - arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, iova, size); + arm_smmu_atc_inv_domain(smmu_domain, iova, size); } void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, @@ -2518,7 +2539,8 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static struct arm_smmu_master_domain * arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_master *master) + struct arm_smmu_master *master, + ioasid_t ssid) { struct arm_smmu_master_domain *master_domain; @@ -2526,7 +2548,8 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) { - if (master_domain->master == master) + if (master_domain->master == master && + master_domain->ssid == ssid) return master_domain; } return NULL; @@ -2559,7 +2582,8 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, return; spin_lock_irqsave(&smmu_domain->devices_lock, flags); - master_domain = arm_smmu_find_master_domain(smmu_domain, master); + master_domain = arm_smmu_find_master_domain(smmu_domain, master, + IOMMU_NO_PASID); if (master_domain) { list_del(&master_domain->devices_elm); kfree(master_domain); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index f9b4bfb2e6b72..f4061ffc1e612 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -772,6 +772,7 @@ void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, struct arm_smmu_master_domain { struct list_head devices_elm; struct arm_smmu_master *master; + ioasid_t ssid; }; static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) @@ -803,8 +804,8 @@ void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, size_t granule, bool leaf, struct arm_smmu_domain *smmu_domain); bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd); -int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, int ssid, - unsigned long iova, size_t size); +int 
arm_smmu_atc_inv_domain_sva(struct arm_smmu_domain *smmu_domain, + ioasid_t ssid, unsigned long iova, size_t size); #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); From b391abc9aae87a0c18d5921e3c011a8fef9907ee Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:37 -0300 Subject: [PATCH 184/352] iommu/arm-smmu-v3: Do not use master->sva_enabled to restrict attaches We no longer need a master->sva_enabled to control what attaches are allowed. Instead we can tell if the attach is legal based on the current configuration of the master. Keep track of the number of valid CD entries for SSIDs in the cd_table and if the cd_table has been installed in the STE directly so we know what the configuration is. The attach logic is then made into: - SVA bind, check if the CD is installed - RID attach of S2, block if SSIDs are used - RID attach of IDENTITY/BLOCKING, block if SSIDs are used arm_smmu_set_pasid() is already checking if it is possible to setup a CD entry, as of this patch it means the RID path already set a STE pointing at the CD table. Tested-by: Nicolin Chen Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit be7c90de39fdebdba4f9cce7575b71c6b2506ea0) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 24 ++++++++++----------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 ++++++ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 5355ebb1b0956..55d10df8238fe 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1289,6 +1289,8 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, struct arm_smmu_cd *cdptr, const struct arm_smmu_cd *target) { + bool target_valid = target->data[0] & cpu_to_le64(CTXDESC_CD_0_V); + bool cur_valid = cdptr->data[0] & cpu_to_le64(CTXDESC_CD_0_V); struct arm_smmu_cd_writer cd_writer = { .writer = { .ops = &arm_smmu_cd_writer_ops, @@ -1297,6 +1299,13 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, .ssid = ssid, }; + if (ssid != IOMMU_NO_PASID && cur_valid != target_valid) { + if (cur_valid) + master->cd_table.used_ssids--; + else + master->cd_table.used_ssids++; + } + arm_smmu_write_entry(&cd_writer.writer, cdptr->data, target->data); } @@ -2733,16 +2742,6 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) state.master = master = dev_iommu_priv_get(dev); smmu = master->smmu; - /* - * Checking that SVA is disabled ensures that this device isn't bound to - * any mm, and can be safely detached from its old domain. Bonds cannot - * be removed concurrently since we're holding the group mutex.
- */ - if (arm_smmu_master_sva_enabled(master)) { - dev_err(dev, "cannot attach - SVA enabled\n"); - return -EBUSY; - } - mutex_lock(&smmu_domain->init_mutex); if (!smmu_domain->smmu) { @@ -2758,7 +2757,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) cdptr = arm_smmu_alloc_cd_ptr(master, IOMMU_NO_PASID); if (!cdptr) return -ENOMEM; - } + } else if (arm_smmu_ssids_in_use(&master->cd_table)) + return -EBUSY; /* * Prevent arm_smmu_share_asid() from trying to change the ASID @@ -2831,7 +2831,7 @@ static int arm_smmu_attach_dev_ste(struct iommu_domain *domain, .old_domain = iommu_get_domain_for_dev(dev), }; - if (arm_smmu_master_sva_enabled(master)) + if (arm_smmu_ssids_in_use(&master->cd_table)) return -EBUSY; /* diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index f4061ffc1e612..65b75dbfd1591 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -602,12 +602,19 @@ struct arm_smmu_ctx_desc_cfg { dma_addr_t cdtab_dma; struct arm_smmu_l1_ctx_desc *l1_desc; unsigned int num_l1_ents; + unsigned int used_ssids; u8 in_ste; u8 s1fmt; /* log2 of the maximum number of CDs supported by this table */ u8 s1cdmax; }; +/* True if the cd table has SSIDs > 0 in use. */ +static inline bool arm_smmu_ssids_in_use(struct arm_smmu_ctx_desc_cfg *cd_table) +{ + return cd_table->used_ssids; +} + struct arm_smmu_s2_cfg { u16 vmid; }; From be8e5a914dacf6ce43a87ba5199620c12b770d72 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:38 -0300 Subject: [PATCH 185/352] iommu/arm-smmu-v3: Thread SSID through the arm_smmu_attach_*() interface Allow creating and managing arm_smmu_master_domain's with a non-zero SSID through the arm_smmu_attach_*() family of functions. This triggers ATC invalidation for the correct SSID in PASID cases and tracks the per-attachment SSID in the struct arm_smmu_master_domain. Generalize arm_smmu_attach_remove() to be able to remove SSIDs as well by ensuring the ATC for the PASID is flushed properly. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 1d5f34f0002f9f56d0ca153022cfdead07d45dc6) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 26 ++++++++++++++------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 55d10df8238fe..36ef0c67fd4e7 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2005,13 +2005,14 @@ arm_smmu_atc_inv_to_cmd(int ssid, unsigned long iova, size_t size, cmd->atc.size = log2_span; } -static int arm_smmu_atc_inv_master(struct arm_smmu_master *master) +static int arm_smmu_atc_inv_master(struct arm_smmu_master *master, + ioasid_t ssid) { int i; struct arm_smmu_cmdq_ent cmd; struct arm_smmu_cmdq_batch cmds; - arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); + arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd); cmds.num = 0; for (i = 0; i < master->num_streams; i++) { @@ -2494,7 +2495,7 @@ static void arm_smmu_enable_ats(struct arm_smmu_master *master) /* * ATC invalidation of PASID 0 causes the entire ATC to be flushed.
- arm_smmu_atc_inv_master(master); + arm_smmu_atc_inv_master(master, IOMMU_NO_PASID); if (pci_enable_ats(pdev, stu)) dev_err(master->dev, "Failed to enable ATS (STU %zu)\n", stu); } @@ -2581,7 +2582,8 @@ to_smmu_domain_devices(struct iommu_domain *domain) } static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, - struct iommu_domain *domain) + struct iommu_domain *domain, + ioasid_t ssid) { struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain); struct arm_smmu_master_domain *master_domain; @@ -2591,8 +2593,7 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, return; spin_lock_irqsave(&smmu_domain->devices_lock, flags); - master_domain = arm_smmu_find_master_domain(smmu_domain, master, - IOMMU_NO_PASID); + master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid); if (master_domain) { list_del(&master_domain->devices_elm); kfree(master_domain); @@ -2606,6 +2607,7 @@ struct arm_smmu_attach_state { /* Inputs */ struct iommu_domain *old_domain; struct arm_smmu_master *master; + ioasid_t ssid; /* Resulting state */ bool ats_enabled; }; @@ -2663,6 +2665,7 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, if (!master_domain) return -ENOMEM; master_domain->master = master; + master_domain->ssid = state->ssid; /* * During prepare we want the current smmu_domain and new @@ -2710,17 +2713,20 @@ static void arm_smmu_attach_commit(struct arm_smmu_attach_state *state) if (state->ats_enabled && !master->ats_enabled) { arm_smmu_enable_ats(master); - } else if (master->ats_enabled) { + } else if (state->ats_enabled && master->ats_enabled) { /* * The translation has changed, flush the ATC. At this point the * SMMU is translating for the new domain and both the old&new * domain will issue invalidations. */ - arm_smmu_atc_inv_master(master); + arm_smmu_atc_inv_master(master, state->ssid); + } else if (!state->ats_enabled && master->ats_enabled) { + /* ATS is being switched off, invalidate the entire ATC */ + arm_smmu_atc_inv_master(master, IOMMU_NO_PASID); + } master->ats_enabled = state->ats_enabled; - arm_smmu_remove_master_domain(master, state->old_domain); + arm_smmu_remove_master_domain(master, state->old_domain, state->ssid); } static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) @@ -2732,6 +2738,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_attach_state state = { .old_domain = iommu_get_domain_for_dev(dev), + .ssid = IOMMU_NO_PASID, }; struct arm_smmu_master *master; struct arm_smmu_cd *cdptr; @@ -2829,6 +2836,7 @@ static int arm_smmu_attach_dev_ste(struct iommu_domain *domain, struct arm_smmu_attach_state state = { .master = master, .old_domain = iommu_get_domain_for_dev(dev), + .ssid = IOMMU_NO_PASID, }; if (arm_smmu_ssids_in_use(&master->cd_table)) From 2b7cbc10b714b98311c9d9354ce9aa166bbec30d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:39 -0300 Subject: [PATCH 186/352] iommu/arm-smmu-v3: Make SVA allocate a normal arm_smmu_domain Currently the SVA domain is a naked struct iommu_domain, allocate a struct arm_smmu_domain instead. This is necessary to be able to use the struct arm_smmu_master_domain mechanism.
Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit d7b2d2ba1b84f4ae7cd94de22f74d6c6c5419de6) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 21 ++++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 27 +++++++++++++------ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 ++ 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index d31caceb58498..aa033cd65adc5 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -639,7 +639,7 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, } arm_smmu_make_sva_cd(&target, master, mm, bond->smmu_mn->cd->asid); - ret = arm_smmu_set_pasid(master, NULL, id, &target); + ret = arm_smmu_set_pasid(master, to_smmu_domain(domain), id, &target); if (ret) { list_del(&bond->list); arm_smmu_mmu_notifier_put(bond->smmu_mn); @@ -653,7 +653,7 @@ static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, static void arm_smmu_sva_domain_free(struct iommu_domain *domain) { - kfree(domain); + kfree(to_smmu_domain(domain)); } static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { @@ -664,13 +664,16 @@ static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, struct mm_struct *mm) { - struct iommu_domain *domain; + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_domain *smmu_domain; - domain = kzalloc(sizeof(*domain), GFP_KERNEL); - if (!domain) - return ERR_PTR(-ENOMEM); - domain->type = IOMMU_DOMAIN_SVA; - domain->ops = &arm_smmu_sva_domain_ops; + smmu_domain = arm_smmu_domain_alloc(); + if (IS_ERR(smmu_domain)) + return ERR_CAST(smmu_domain); + smmu_domain->domain.type = IOMMU_DOMAIN_SVA; + smmu_domain->domain.ops = &arm_smmu_sva_domain_ops; + smmu_domain->smmu = smmu; - return domain; + return &smmu_domain->domain; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 36ef0c67fd4e7..35d3edea48052 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2272,15 +2272,10 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) } } -static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) +struct arm_smmu_domain *arm_smmu_domain_alloc(void) { struct arm_smmu_domain *smmu_domain; - /* - * Allocate the domain and initialise some of its data structures. - * We can't really do anything meaningful until we've added a - * master. 
- */ smmu_domain = kzalloc(sizeof(*smmu_domain), GFP_KERNEL); if (!smmu_domain) return ERR_PTR(-ENOMEM); @@ -2290,6 +2285,22 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) spin_lock_init(&smmu_domain->devices_lock); INIT_LIST_HEAD(&smmu_domain->mmu_notifiers); + return smmu_domain; +} + +static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) +{ + struct arm_smmu_domain *smmu_domain; + + /* + * Allocate the domain and initialise some of its data structures. + * We can't really do anything meaningful until we've added a + * master. + */ + smmu_domain = arm_smmu_domain_alloc(); + if (IS_ERR(smmu_domain)) + return ERR_CAST(smmu_domain); + if (dev) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); int ret; @@ -2303,7 +2314,7 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) return &smmu_domain->domain; } -static void arm_smmu_domain_free(struct iommu_domain *domain) +static void arm_smmu_domain_free_paging(struct iommu_domain *domain) { struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_device *smmu = smmu_domain->smmu; @@ -3305,7 +3316,7 @@ static struct iommu_ops arm_smmu_ops = { .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, .enable_nesting = arm_smmu_enable_nesting, - .free = arm_smmu_domain_free, + .free = arm_smmu_domain_free_paging, } }; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 65b75dbfd1591..212c18c70fa03 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -790,6 +790,8 @@ static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) extern struct xarray arm_smmu_asid_xa; extern struct mutex arm_smmu_asid_lock; +struct arm_smmu_domain *arm_smmu_domain_alloc(void); + void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid); struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid); From 8af691a244f0ba2af98f99c2359a76055b95433a Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:40 -0300 Subject: [PATCH 187/352] iommu/arm-smmu-v3: Keep track of arm_smmu_master_domain for SVA Fill in the smmu_domain->devices list in the new struct arm_smmu_domain that SVA allocates. Keep track of every SSID and master that is using the domain reusing the logic for the RID attach. This is the first step to making the SVA invalidation follow the same design as S1/S2 invalidation. At present nothing will read this list. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/9-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 49db2ed23c52f8371c12ab8646df23fa1daad4b2) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 30 +++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 35d3edea48052..8c9f0a7324603 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2587,7 +2587,8 @@ to_smmu_domain_devices(struct iommu_domain *domain) /* The domain can be NULL only when processing the first attach */ if (!domain) return NULL; - if (domain->type & __IOMMU_DOMAIN_PAGING) + if ((domain->type & __IOMMU_DOMAIN_PAGING) || + domain->type == IOMMU_DOMAIN_SVA) return to_smmu_domain(domain); return NULL; } @@ -2820,7 +2821,16 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, const struct arm_smmu_cd *cd) { + struct arm_smmu_attach_state state = { + .master = master, + /* + * For now the core code prevents calling this when a domain is + * already attached, no need to set old_domain. + */ + .ssid = pasid, + }; struct arm_smmu_cd *cdptr; + int ret; /* The core code validates pasid */ @@ -2830,14 +2840,30 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master, cdptr = arm_smmu_alloc_cd_ptr(master, pasid); if (!cdptr) return -ENOMEM; + + mutex_lock(&arm_smmu_asid_lock); + ret = arm_smmu_attach_prepare(&state, &smmu_domain->domain); + if (ret) + goto out_unlock; + arm_smmu_write_cd_entry(master, pasid, cdptr, cd); - return 0; + + arm_smmu_attach_commit(&state); + +out_unlock: + mutex_unlock(&arm_smmu_asid_lock); + return ret; } void arm_smmu_remove_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid) { + mutex_lock(&arm_smmu_asid_lock); arm_smmu_clear_cd(master, pasid); + if (master->ats_enabled) + arm_smmu_atc_inv_master(master, pasid); + arm_smmu_remove_master_domain(master, &smmu_domain->domain, pasid); + mutex_unlock(&arm_smmu_asid_lock); } static int arm_smmu_attach_dev_ste(struct iommu_domain *domain, From 3999dd0f645f1b815bf0332694b70ec278bf98be Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:41 -0300 Subject: [PATCH 188/352] iommu/arm-smmu-v3: Put the SVA mmu notifier in the smmu_domain This removes all the notifier de-duplication logic in the driver and relies on the core code to de-duplicate and allocate only one SVA domain per mm per smmu instance. This naturally gives a 1:1 relationship between SVA domain and mmu notifier. It is a significant simplification of the flow, as we end up with a single struct arm_smmu_domain for each MM and the invalidation can then be shifted to properly use the masters list like S1/S2 do. Remove all of the previous mmu_notifier, bond, shared cd, and cd refcount logic entirely. The logic here is tightly wound together with the unused BTM support. Since the BTM logic requires holding all the iommu_domains in a global ASID xarray it conflicts with the design to have a single SVA domain per PASID, as multiple SMMU instances will need to have different domains. Following patches resolve this by making the ASID xarray per-instance instead of global. However, converting the BTM code over to this methodology requires many changes. Thus, since ARM_SMMU_FEAT_BTM is never enabled, remove the parts of the BTM support for ASID sharing that interact with SVA as well. A followup series is already working on fully enabling the BTM support, which requires iommufd's VIOMMU feature to bring in the KVM's VMID as well.
It will come with an already written patch to bring back the ASID sharing using a per-instance ASID xarray. https://lore.kernel.org/linux-iommu/20240208151837.35068-1-shameerali.kolothum.thodi@huawei.com/ https://lore.kernel.org/linux-iommu/26-v6-228e7adf25eb+4155-smmuv3_newapi_p2_jgg@nvidia.com/ Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Nicolin Chen Reviewed-by: Michael Shavit Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/10-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit d38c28dbefeee03d7dd02004ad80d9676ac54d86) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c | 395 +++--------------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 69 +-- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 15 +- 3 files changed, 86 insertions(+), 393 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c index aa033cd65adc5..a7c36654dee5a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-sva.c @@ -13,29 +13,9 @@ #include "arm-smmu-v3.h" #include "../../io-pgtable-arm.h" -struct arm_smmu_mmu_notifier { - struct mmu_notifier mn; - struct arm_smmu_ctx_desc *cd; - bool cleared; - refcount_t refs; - struct list_head list; - struct arm_smmu_domain *domain; -}; - -#define mn_to_smmu(mn) container_of(mn, struct arm_smmu_mmu_notifier, mn) - -struct arm_smmu_bond { - struct mm_struct *mm; - struct arm_smmu_mmu_notifier *smmu_mn; - struct list_head list; -}; - -#define sva_to_bond(handle) \ - container_of(handle, struct arm_smmu_bond, sva) - static DEFINE_MUTEX(sva_lock); -static void +static void __maybe_unused arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain) { struct arm_smmu_master_domain *master_domain; @@ -58,58 +38,6 @@ arm_smmu_update_s1_domain_cd_entry(struct arm_smmu_domain *smmu_domain) spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); } -/* - * Check if the CPU ASID is available on the SMMU side. If a private context - * descriptor is using it, try to replace it. - */ -static struct arm_smmu_ctx_desc * -arm_smmu_share_asid(struct mm_struct *mm, u16 asid) -{ - int ret; - u32 new_asid; - struct arm_smmu_ctx_desc *cd; - struct arm_smmu_device *smmu; - struct arm_smmu_domain *smmu_domain; - - cd = xa_load(&arm_smmu_asid_xa, asid); - if (!cd) - return NULL; - - if (cd->mm) { - if (WARN_ON(cd->mm != mm)) - return ERR_PTR(-EINVAL); - /* All devices bound to this mm use the same cd struct. */ - refcount_inc(&cd->refs); - return cd; - } - - smmu_domain = container_of(cd, struct arm_smmu_domain, cd); - smmu = smmu_domain->smmu; - - ret = xa_alloc(&arm_smmu_asid_xa, &new_asid, cd, - XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); - if (ret) - return ERR_PTR(-ENOSPC); - /* - * Race with unmap: TLB invalidations will start targeting the new ASID, - * which isn't assigned yet. We'll do an invalidate-all on the old ASID - * later, so it doesn't matter. - */ - cd->asid = new_asid; - /* - * Update ASID and invalidate CD in all associated masters. There will - * be some overlap between use of both ASIDs, until we invalidate the - * TLB. 
- */ - arm_smmu_update_s1_domain_cd_entry(smmu_domain); - - /* Invalidate TLB entries previously associated with that context */ - arm_smmu_tlb_inv_asid(smmu, asid); - - xa_erase(&arm_smmu_asid_xa, asid); - return NULL; -} - static u64 page_size_to_cd(void) { static_assert(PAGE_SIZE == SZ_4K || PAGE_SIZE == SZ_16K || @@ -187,69 +115,6 @@ void arm_smmu_make_sva_cd(struct arm_smmu_cd *target, } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_sva_cd); -static struct arm_smmu_ctx_desc *arm_smmu_alloc_shared_cd(struct mm_struct *mm) -{ - u16 asid; - int err = 0; - struct arm_smmu_ctx_desc *cd; - struct arm_smmu_ctx_desc *ret = NULL; - - /* Don't free the mm until we release the ASID */ - mmgrab(mm); - - asid = arm64_mm_context_get(mm); - if (!asid) { - err = -ESRCH; - goto out_drop_mm; - } - - cd = kzalloc(sizeof(*cd), GFP_KERNEL); - if (!cd) { - err = -ENOMEM; - goto out_put_context; - } - - refcount_set(&cd->refs, 1); - - mutex_lock(&arm_smmu_asid_lock); - ret = arm_smmu_share_asid(mm, asid); - if (ret) { - mutex_unlock(&arm_smmu_asid_lock); - goto out_free_cd; - } - - err = xa_insert(&arm_smmu_asid_xa, asid, cd, GFP_KERNEL); - mutex_unlock(&arm_smmu_asid_lock); - - if (err) - goto out_free_asid; - - cd->asid = asid; - cd->mm = mm; - - return cd; - -out_free_asid: - arm_smmu_free_asid(cd); -out_free_cd: - kfree(cd); -out_put_context: - arm64_mm_context_put(mm); -out_drop_mm: - mmdrop(mm); - return err < 0 ? ERR_PTR(err) : ret; -} - -static void arm_smmu_free_shared_cd(struct arm_smmu_ctx_desc *cd) -{ - if (arm_smmu_free_asid(cd)) { - /* Unpin ASID */ - arm64_mm_context_put(cd->mm); - mmdrop(cd->mm); - kfree(cd); - } -} - /* * Cloned from the MAX_TLBI_OPS in arch/arm64/include/asm/tlbflush.h, this * is used as a threshold to replace per-page TLBI commands to issue in the @@ -264,8 +129,8 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, unsigned long start, unsigned long end) { - struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); - struct arm_smmu_domain *smmu_domain = smmu_mn->domain; + struct arm_smmu_domain *smmu_domain = + container_of(mn, struct arm_smmu_domain, mmu_notifier); size_t size; /* @@ -282,34 +147,22 @@ static void arm_smmu_mm_arch_invalidate_secondary_tlbs(struct mmu_notifier *mn, size = 0; } - if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_BTM)) { - if (!size) - arm_smmu_tlb_inv_asid(smmu_domain->smmu, - smmu_mn->cd->asid); - else - arm_smmu_tlb_inv_range_asid(start, size, - smmu_mn->cd->asid, - PAGE_SIZE, false, - smmu_domain); - } + if (!size) + arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid); + else + arm_smmu_tlb_inv_range_asid(start, size, smmu_domain->cd.asid, + PAGE_SIZE, false, smmu_domain); - arm_smmu_atc_inv_domain_sva(smmu_domain, mm_get_enqcmd_pasid(mm), start, - size); + arm_smmu_atc_inv_domain(smmu_domain, start, size); } static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) { - struct arm_smmu_mmu_notifier *smmu_mn = mn_to_smmu(mn); - struct arm_smmu_domain *smmu_domain = smmu_mn->domain; + struct arm_smmu_domain *smmu_domain = + container_of(mn, struct arm_smmu_domain, mmu_notifier); struct arm_smmu_master_domain *master_domain; unsigned long flags; - mutex_lock(&sva_lock); - if (smmu_mn->cleared) { - mutex_unlock(&sva_lock); - return; - } - /* * DMA may still be running. Keep the cd valid to avoid C_BAD_CD events, * but disable translation. 
@@ -321,25 +174,23 @@ static void arm_smmu_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) struct arm_smmu_cd target; struct arm_smmu_cd *cdptr; - cdptr = arm_smmu_get_cd_ptr(master, mm_get_enqcmd_pasid(mm)); + cdptr = arm_smmu_get_cd_ptr(master, master_domain->ssid); if (WARN_ON(!cdptr)) continue; - arm_smmu_make_sva_cd(&target, master, NULL, smmu_mn->cd->asid); - arm_smmu_write_cd_entry(master, mm_get_enqcmd_pasid(mm), cdptr, + arm_smmu_make_sva_cd(&target, master, NULL, + smmu_domain->cd.asid); + arm_smmu_write_cd_entry(master, master_domain->ssid, cdptr, &target); } spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); - arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_mn->cd->asid); - arm_smmu_atc_inv_domain_sva(smmu_domain, mm_get_enqcmd_pasid(mm), 0, 0); - - smmu_mn->cleared = true; - mutex_unlock(&sva_lock); + arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid); + arm_smmu_atc_inv_domain(smmu_domain, 0, 0); } static void arm_smmu_mmu_notifier_free(struct mmu_notifier *mn) { - kfree(mn_to_smmu(mn)); + kfree(container_of(mn, struct arm_smmu_domain, mmu_notifier)); } static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { @@ -348,115 +199,6 @@ static const struct mmu_notifier_ops arm_smmu_mmu_notifier_ops = { .free_notifier = arm_smmu_mmu_notifier_free, }; -/* Allocate or get existing MMU notifier for this {domain, mm} pair */ -static struct arm_smmu_mmu_notifier * -arm_smmu_mmu_notifier_get(struct arm_smmu_domain *smmu_domain, - struct mm_struct *mm) -{ - int ret; - struct arm_smmu_ctx_desc *cd; - struct arm_smmu_mmu_notifier *smmu_mn; - - list_for_each_entry(smmu_mn, &smmu_domain->mmu_notifiers, list) { - if (smmu_mn->mn.mm == mm) { - refcount_inc(&smmu_mn->refs); - return smmu_mn; - } - } - - cd = arm_smmu_alloc_shared_cd(mm); - if (IS_ERR(cd)) - return ERR_CAST(cd); - - smmu_mn = kzalloc(sizeof(*smmu_mn), GFP_KERNEL); - if (!smmu_mn) { - ret = -ENOMEM; - goto err_free_cd; - } - - refcount_set(&smmu_mn->refs, 1); - smmu_mn->cd = cd; - smmu_mn->domain = smmu_domain; - smmu_mn->mn.ops = &arm_smmu_mmu_notifier_ops; - - ret = mmu_notifier_register(&smmu_mn->mn, mm); - if (ret) { - kfree(smmu_mn); - goto err_free_cd; - } - - list_add(&smmu_mn->list, &smmu_domain->mmu_notifiers); - return smmu_mn; - -err_free_cd: - arm_smmu_free_shared_cd(cd); - return ERR_PTR(ret); -} - -static void arm_smmu_mmu_notifier_put(struct arm_smmu_mmu_notifier *smmu_mn) -{ - struct mm_struct *mm = smmu_mn->mn.mm; - struct arm_smmu_ctx_desc *cd = smmu_mn->cd; - struct arm_smmu_domain *smmu_domain = smmu_mn->domain; - - if (!refcount_dec_and_test(&smmu_mn->refs)) - return; - - list_del(&smmu_mn->list); - - /* - * If we went through clear(), we've already invalidated, and no - * new TLB entry can have been formed. 
- */ - if (!smmu_mn->cleared) { - arm_smmu_tlb_inv_asid(smmu_domain->smmu, cd->asid); - arm_smmu_atc_inv_domain_sva(smmu_domain, - mm_get_enqcmd_pasid(mm), 0, 0); - } - - /* Frees smmu_mn */ - mmu_notifier_put(&smmu_mn->mn); - arm_smmu_free_shared_cd(cd); -} - -static struct arm_smmu_bond *__arm_smmu_sva_bind(struct device *dev, - struct mm_struct *mm) -{ - int ret; - struct arm_smmu_bond *bond; - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - struct arm_smmu_domain *smmu_domain; - - if (!(domain->type & __IOMMU_DOMAIN_PAGING)) - return ERR_PTR(-ENODEV); - smmu_domain = to_smmu_domain(domain); - if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) - return ERR_PTR(-ENODEV); - - if (!master || !master->sva_enabled) - return ERR_PTR(-ENODEV); - - bond = kzalloc(sizeof(*bond), GFP_KERNEL); - if (!bond) - return ERR_PTR(-ENOMEM); - - bond->mm = mm; - - bond->smmu_mn = arm_smmu_mmu_notifier_get(smmu_domain, mm); - if (IS_ERR(bond->smmu_mn)) { - ret = PTR_ERR(bond->smmu_mn); - goto err_free_bond; - } - - list_add(&bond->list, &master->bonds); - return bond; - -err_free_bond: - kfree(bond); - return ERR_PTR(ret); -} - bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { unsigned long reg, fld; @@ -573,11 +315,6 @@ int arm_smmu_master_enable_sva(struct arm_smmu_master *master) int arm_smmu_master_disable_sva(struct arm_smmu_master *master) { mutex_lock(&sva_lock); - if (!list_empty(&master->bonds)) { - dev_err(master->dev, "cannot disable SVA, device is bound\n"); - mutex_unlock(&sva_lock); - return -EBUSY; - } arm_smmu_master_sva_disable_iopf(master); master->sva_enabled = false; mutex_unlock(&sva_lock); @@ -594,66 +331,51 @@ void arm_smmu_sva_notifier_synchronize(void) mmu_notifier_synchronize(); } -void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id) -{ - struct mm_struct *mm = domain->mm; - struct arm_smmu_bond *bond = NULL, *t; - struct arm_smmu_master *master = dev_iommu_priv_get(dev); - - arm_smmu_remove_pasid(master, to_smmu_domain(domain), id); - - mutex_lock(&sva_lock); - list_for_each_entry(t, &master->bonds, list) { - if (t->mm == mm) { - bond = t; - break; - } - } - - if (!WARN_ON(!bond)) { - list_del(&bond->list); - arm_smmu_mmu_notifier_put(bond->smmu_mn); - kfree(bond); - } - mutex_unlock(&sva_lock); -} - static int arm_smmu_sva_set_dev_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t id) { + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); struct arm_smmu_master *master = dev_iommu_priv_get(dev); - struct mm_struct *mm = domain->mm; - struct arm_smmu_bond *bond; struct arm_smmu_cd target; int ret; - if (mm_get_enqcmd_pasid(mm) != id) + /* Prevent arm_smmu_mm_release from being called while we are attaching */ + if (!mmget_not_zero(domain->mm)) return -EINVAL; - mutex_lock(&sva_lock); - bond = __arm_smmu_sva_bind(dev, mm); - if (IS_ERR(bond)) { - mutex_unlock(&sva_lock); - return PTR_ERR(bond); - } + /* + * This does not need the arm_smmu_asid_lock because SVA domains never + * get reassigned + */ + arm_smmu_make_sva_cd(&target, master, domain->mm, smmu_domain->cd.asid); + ret = arm_smmu_set_pasid(master, smmu_domain, id, &target); - arm_smmu_make_sva_cd(&target, master, mm, bond->smmu_mn->cd->asid); - ret = arm_smmu_set_pasid(master, to_smmu_domain(domain), id, &target); - if (ret) { - list_del(&bond->list); - arm_smmu_mmu_notifier_put(bond->smmu_mn); - kfree(bond); - mutex_unlock(&sva_lock); - return ret; - } - 
mutex_unlock(&sva_lock); - return 0; + mmput(domain->mm); + return ret; } static void arm_smmu_sva_domain_free(struct iommu_domain *domain) { - kfree(to_smmu_domain(domain)); + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + + /* + * Ensure the ASID is empty in the iommu cache before allowing reuse. + */ + arm_smmu_tlb_inv_asid(smmu_domain->smmu, smmu_domain->cd.asid); + + /* + * Notice that the arm_smmu_mm_arch_invalidate_secondary_tlbs op can + * still be called/running at this point. We allow the ASID to be + * reused, and if there is a race then it just suffers harmless + * unnecessary invalidation. + */ + xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid); + + /* + * Actual free is defered to the SRCU callback + * arm_smmu_mmu_notifier_free() + */ + mmu_notifier_put(&smmu_domain->mmu_notifier); } static const struct iommu_domain_ops arm_smmu_sva_domain_ops = { @@ -667,6 +389,8 @@ struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_domain *smmu_domain; + u32 asid; + int ret; smmu_domain = arm_smmu_domain_alloc(); if (IS_ERR(smmu_domain)) @@ -675,5 +399,22 @@ struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, smmu_domain->domain.ops = &arm_smmu_sva_domain_ops; smmu_domain->smmu = smmu; + ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain, + XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); + if (ret) + goto err_free; + + smmu_domain->cd.asid = asid; + smmu_domain->mmu_notifier.ops = &arm_smmu_mmu_notifier_ops; + ret = mmu_notifier_register(&smmu_domain->mmu_notifier, mm); + if (ret) + goto err_asid; + return &smmu_domain->domain; + +err_asid: + xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid); +err_free: + kfree(smmu_domain); + return ERR_PTR(ret); } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8c9f0a7324603..6120eb644cbda 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1439,22 +1439,6 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) cd_table->cdtab = NULL; } -bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd) -{ - bool free; - struct arm_smmu_ctx_desc *old_cd; - - if (!cd->asid) - return false; - - free = refcount_dec_and_test(&cd->refs); - if (free) { - old_cd = xa_erase(&arm_smmu_asid_xa, cd->asid); - WARN_ON(old_cd != cd); - } - return free; -} - /* Stream table manipulation functions */ static void arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc) @@ -2023,8 +2007,8 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master, return arm_smmu_cmdq_batch_submit(master->smmu, &cmds); } -static int __arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, - ioasid_t ssid, unsigned long iova, size_t size) +int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, + unsigned long iova, size_t size) { struct arm_smmu_master_domain *master_domain; int i; @@ -2062,15 +2046,7 @@ static int __arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, if (!master->ats_enabled) continue; - /* - * Non-zero ssid means SVA is co-opting the S1 domain to issue - * invalidations for SVA PASIDs. 
- */ - if (ssid != IOMMU_NO_PASID) - arm_smmu_atc_inv_to_cmd(ssid, iova, size, &cmd); - else - arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, - &cmd); + arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd); for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; @@ -2082,19 +2058,6 @@ static int __arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, return arm_smmu_cmdq_batch_submit(smmu_domain->smmu, &cmds); } -static int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, - unsigned long iova, size_t size) -{ - return __arm_smmu_atc_inv_domain(smmu_domain, IOMMU_NO_PASID, iova, - size); -} - -int arm_smmu_atc_inv_domain_sva(struct arm_smmu_domain *smmu_domain, - ioasid_t ssid, unsigned long iova, size_t size) -{ - return __arm_smmu_atc_inv_domain(smmu_domain, ssid, iova, size); -} - /* IO_PGTABLE API */ static void arm_smmu_tlb_inv_context(void *cookie) { @@ -2283,7 +2246,6 @@ struct arm_smmu_domain *arm_smmu_domain_alloc(void) mutex_init(&smmu_domain->init_mutex); INIT_LIST_HEAD(&smmu_domain->devices); spin_lock_init(&smmu_domain->devices_lock); - INIT_LIST_HEAD(&smmu_domain->mmu_notifiers); return smmu_domain; } @@ -2325,7 +2287,7 @@ static void arm_smmu_domain_free_paging(struct iommu_domain *domain) if (smmu_domain->stage == ARM_SMMU_DOMAIN_S1) { /* Prevent SVA from touching the CD while we're freeing it */ mutex_lock(&arm_smmu_asid_lock); - arm_smmu_free_asid(&smmu_domain->cd); + xa_erase(&arm_smmu_asid_xa, smmu_domain->cd.asid); mutex_unlock(&arm_smmu_asid_lock); } else { struct arm_smmu_s2_cfg *cfg = &smmu_domain->s2_cfg; @@ -2343,11 +2305,9 @@ static int arm_smmu_domain_finalise_s1(struct arm_smmu_device *smmu, u32 asid = 0; struct arm_smmu_ctx_desc *cd = &smmu_domain->cd; - refcount_set(&cd->refs, 1); - /* Prevent SVA from modifying the ASID until it is written to the CD */ mutex_lock(&arm_smmu_asid_lock); - ret = xa_alloc(&arm_smmu_asid_xa, &asid, cd, + ret = xa_alloc(&arm_smmu_asid_xa, &asid, smmu_domain, XA_LIMIT(1, (1 << smmu->asid_bits) - 1), GFP_KERNEL); cd->asid = (u16)asid; mutex_unlock(&arm_smmu_asid_lock); @@ -2834,6 +2794,9 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master, /* The core code validates pasid */ + if (smmu_domain->smmu != master->smmu) + return -EINVAL; + if (!master->cd_table.in_ste) return -ENODEV; @@ -2855,9 +2818,14 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master, return ret; } -void arm_smmu_remove_pasid(struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain, ioasid_t pasid) +static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid, + struct iommu_domain *domain) { + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_domain *smmu_domain; + + smmu_domain = to_smmu_domain(domain); + mutex_lock(&arm_smmu_asid_lock); arm_smmu_clear_cd(master, pasid); if (master->ats_enabled) @@ -3128,7 +3096,6 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) master->dev = dev; master->smmu = smmu; - INIT_LIST_HEAD(&master->bonds); dev_iommu_priv_set(dev, master); ret = arm_smmu_insert_master(smmu, master); @@ -3310,12 +3277,6 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } -static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid, - struct iommu_domain *domain) -{ - arm_smmu_sva_remove_dev_pasid(domain, dev, pasid); -} - static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, diff --git 
a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 212c18c70fa03..d175d9eee6c61 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -587,9 +587,6 @@ struct arm_smmu_strtab_l1_desc { struct arm_smmu_ctx_desc { u16 asid; - - refcount_t refs; - struct mm_struct *mm; }; struct arm_smmu_l1_ctx_desc { @@ -712,7 +709,6 @@ struct arm_smmu_master { bool stall_enabled; bool sva_enabled; bool iopf_enabled; - struct list_head bonds; unsigned int ssid_bits; }; @@ -741,7 +737,7 @@ struct arm_smmu_domain { struct list_head devices; spinlock_t devices_lock; - struct list_head mmu_notifiers; + struct mmu_notifier mmu_notifier; }; /* The following are exposed for testing purposes. */ @@ -805,16 +801,13 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, const struct arm_smmu_cd *cd); -void arm_smmu_remove_pasid(struct arm_smmu_master *master, - struct arm_smmu_domain *smmu_domain, ioasid_t pasid); void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, size_t granule, bool leaf, struct arm_smmu_domain *smmu_domain); -bool arm_smmu_free_asid(struct arm_smmu_ctx_desc *cd); -int arm_smmu_atc_inv_domain_sva(struct arm_smmu_domain *smmu_domain, - ioasid_t ssid, unsigned long iova, size_t size); +int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, + unsigned long iova, size_t size); #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); @@ -826,8 +819,6 @@ bool arm_smmu_master_iopf_supported(struct arm_smmu_master *master); void arm_smmu_sva_notifier_synchronize(void); struct iommu_domain *arm_smmu_sva_domain_alloc(struct device *dev, struct mm_struct *mm); -void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t id); #else /* CONFIG_ARM_SMMU_V3_SVA */ static inline bool arm_smmu_sva_supported(struct arm_smmu_device *smmu) { From 426c2451dcae551e4dc84b702ef69118a5da9fd2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:42 -0300 Subject: [PATCH 189/352] iommu/arm-smmu-v3: Allow IDENTITY/BLOCKED to be set while PASID is used The HW supports this, use the S1DSS bits to configure the behavior of SSID=0 which is the RID's translation. If SSID's are currently being used in the CD table then just update the S1DSS bits in the STE, remove the master_domain and leave ATS alone. For iommufd the driver design has a small problem that all the unused CD table entries are set with V=0 which will generate an event if VFIO userspace tries to use the CD entry. This patch extends this problem to include the RID as well if PASID is being used. For BLOCKED with used PASIDs the F_STREAM_DISABLED (STRTAB_STE_1_S1DSS_TERMINATE) event is generated on untagged traffic and a substream CD table entry with V=0 (removed pasid) will generate C_BAD_CD. Arguably there is no advantage to using S1DSS over the CD entry 0 with V=0. As we don't yet support PASID in iommufd this is a problem to resolve later, possibly by using EPD0 for unused CD table entries instead of V=0, and not using S1DSS for BLOCKED. 
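For reference, the S1DSS selection this series converges on maps the RID's
domain type onto the SSID=0 behavior roughly as follows (an illustrative
sketch only, reusing the existing STRTAB_STE_1_S1DSS_* and IOMMU_DOMAIN_*
constants; the real logic lands in the hunks below and in
arm_smmu_update_ste() later in the series):

	/*
	 * Sketch: choose STE.S1DSS for the RID while PASIDs are in use.
	 * pick_s1dss() is a hypothetical helper, for illustration only.
	 */
	static unsigned int pick_s1dss(unsigned int rid_domain_type)
	{
		switch (rid_domain_type) {
		case IOMMU_DOMAIN_IDENTITY:
			return STRTAB_STE_1_S1DSS_BYPASS;	/* SSID=0 bypasses translation */
		case IOMMU_DOMAIN_BLOCKED:
			return STRTAB_STE_1_S1DSS_TERMINATE;	/* SSID=0 traffic is terminated */
		default:
			return STRTAB_STE_1_S1DSS_SSID0;	/* SSID=0 uses CD table entry 0 */
		}
	}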
Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/11-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit ce26ea9e6e12df01432bd2a1cb8cbfa025b8a977) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 2 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 60 +++++++++++++++---- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 4 +- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c index a460b71f58578..d7e022bb9df53 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -164,7 +164,7 @@ static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste, .smmu = &smmu, }; - arm_smmu_make_cdtable_ste(ste, &master, true); + arm_smmu_make_cdtable_ste(ste, &master, true, STRTAB_STE_1_S1DSS_SSID0); } static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6120eb644cbda..708e75449fcbe 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -991,6 +991,14 @@ void arm_smmu_get_ste_used(const __le64 *ent, __le64 *used_bits) STRTAB_STE_1_S1STALLD | STRTAB_STE_1_STRW | STRTAB_STE_1_EATS); used_bits[2] |= cpu_to_le64(STRTAB_STE_2_S2VMID); + + /* + * See 13.5 Summary of attribute/permission configuration fields + * for the SHCFG behavior. + */ + if (FIELD_GET(STRTAB_STE_1_S1DSS, le64_to_cpu(ent[1])) == + STRTAB_STE_1_S1DSS_BYPASS) + used_bits[1] |= cpu_to_le64(STRTAB_STE_1_SHCFG); } /* S2 translates */ @@ -1531,7 +1539,8 @@ EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_bypass_ste); VISIBLE_IF_KUNIT void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master, bool ats_enabled) + struct arm_smmu_master *master, bool ats_enabled, + unsigned int s1dss) { struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; struct arm_smmu_device *smmu = master->smmu; @@ -1545,7 +1554,7 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, FIELD_PREP(STRTAB_STE_0_S1CDMAX, cd_table->s1cdmax)); target->data[1] = cpu_to_le64( - FIELD_PREP(STRTAB_STE_1_S1DSS, STRTAB_STE_1_S1DSS_SSID0) | + FIELD_PREP(STRTAB_STE_1_S1DSS, s1dss) | FIELD_PREP(STRTAB_STE_1_S1CIR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1COR, STRTAB_STE_1_S1C_CACHE_WBRA) | FIELD_PREP(STRTAB_STE_1_S1CSH, ARM_SMMU_SH_ISH) | @@ -1556,6 +1565,11 @@ void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, FIELD_PREP(STRTAB_STE_1_EATS, ats_enabled ? 
STRTAB_STE_1_EATS_TRANS : 0)); + if ((smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) && + s1dss == STRTAB_STE_1_S1DSS_BYPASS) + target->data[1] |= cpu_to_le64(FIELD_PREP( + STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); + if (smmu->features & ARM_SMMU_FEAT_E2H) { /* * To support BTM the streamworld needs to match the @@ -2579,6 +2593,7 @@ struct arm_smmu_attach_state { /* Inputs */ struct iommu_domain *old_domain; struct arm_smmu_master *master; + bool cd_needs_ats; ioasid_t ssid; /* Resulting state */ bool ats_enabled; @@ -2620,7 +2635,7 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, */ lockdep_assert_held(&arm_smmu_asid_lock); - if (smmu_domain) { + if (smmu_domain || state->cd_needs_ats) { /* * The SMMU does not support enabling ATS with bypass/abort. * When the STE is in bypass (STE.Config[2:0] == 0b100), ATS @@ -2632,7 +2647,9 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * tables. */ state->ats_enabled = arm_smmu_ats_supported(master); + } + if (smmu_domain) { master_domain = kzalloc(sizeof(*master_domain), GFP_KERNEL); if (!master_domain) return -ENOMEM; @@ -2760,7 +2777,8 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); arm_smmu_write_cd_entry(master, IOMMU_NO_PASID, cdptr, &target_cd); - arm_smmu_make_cdtable_ste(&target, master, state.ats_enabled); + arm_smmu_make_cdtable_ste(&target, master, state.ats_enabled, + STRTAB_STE_1_S1DSS_SSID0); arm_smmu_install_ste_for_dev(master, &target); break; } @@ -2834,8 +2852,10 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid, mutex_unlock(&arm_smmu_asid_lock); } -static int arm_smmu_attach_dev_ste(struct iommu_domain *domain, - struct device *dev, struct arm_smmu_ste *ste) +static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, + struct device *dev, + struct arm_smmu_ste *ste, + unsigned int s1dss) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); struct arm_smmu_attach_state state = { @@ -2844,16 +2864,28 @@ static int arm_smmu_attach_dev_ste(struct iommu_domain *domain, .ssid = IOMMU_NO_PASID, }; - if (arm_smmu_ssids_in_use(&master->cd_table)) - return -EBUSY; - /* * Do not allow any ASID to be changed while are working on the STE, * otherwise we could miss invalidations. */ mutex_lock(&arm_smmu_asid_lock); - arm_smmu_attach_prepare(&state, domain); + /* + * If the CD table is not in use we can use the provided STE, otherwise + * we use a cdtable STE with the provided S1DSS. + */ + if (arm_smmu_ssids_in_use(&master->cd_table)) { + /* + * If a CD table has to be present then we need to run with ATS + * on even though the RID will fail ATS queries with UR. This is + * because we have no idea what the PASID's need. + */ + state.cd_needs_ats = true; + arm_smmu_attach_prepare(&state, domain); + arm_smmu_make_cdtable_ste(ste, master, state.ats_enabled, s1dss); + } else { + arm_smmu_attach_prepare(&state, domain); + } arm_smmu_install_ste_for_dev(master, ste); arm_smmu_attach_commit(&state); mutex_unlock(&arm_smmu_asid_lock); @@ -2864,7 +2896,6 @@ static int arm_smmu_attach_dev_ste(struct iommu_domain *domain, * descriptor from arm_smmu_share_asid(). 
*/ arm_smmu_clear_cd(master, IOMMU_NO_PASID); - return 0; } static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, @@ -2874,7 +2905,8 @@ static int arm_smmu_attach_dev_identity(struct iommu_domain *domain, struct arm_smmu_master *master = dev_iommu_priv_get(dev); arm_smmu_make_bypass_ste(master->smmu, &ste); - return arm_smmu_attach_dev_ste(domain, dev, &ste); + arm_smmu_attach_dev_ste(domain, dev, &ste, STRTAB_STE_1_S1DSS_BYPASS); + return 0; } static const struct iommu_domain_ops arm_smmu_identity_ops = { @@ -2892,7 +2924,9 @@ static int arm_smmu_attach_dev_blocked(struct iommu_domain *domain, struct arm_smmu_ste ste; arm_smmu_make_abort_ste(&ste); - return arm_smmu_attach_dev_ste(domain, dev, &ste); + arm_smmu_attach_dev_ste(domain, dev, &ste, + STRTAB_STE_1_S1DSS_TERMINATE); + return 0; } static const struct iommu_domain_ops arm_smmu_blocked_ops = { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index d175d9eee6c61..30459a800c7b2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -761,8 +761,8 @@ void arm_smmu_make_abort_ste(struct arm_smmu_ste *target); void arm_smmu_make_bypass_ste(struct arm_smmu_device *smmu, struct arm_smmu_ste *target); void arm_smmu_make_cdtable_ste(struct arm_smmu_ste *target, - struct arm_smmu_master *master, - bool ats_enabled); + struct arm_smmu_master *master, bool ats_enabled, + unsigned int s1dss); void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, From a1066944f74bdb8a0828ef39fc74c040024e97ff Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:43 -0300 Subject: [PATCH 190/352] iommu/arm-smmu-v3: Test the STE S1DSS functionality S1DSS brings in quite a few new transition pairs that are interesting. Test to/from S1DSS_BYPASS <-> S1DSS_SSID0, and BYPASS <-> S1DSS_SSID0. Test a contrived non-hitless flow to make sure that the logic works. Tested-by: Nicolin Chen Signed-off-by: Michael Shavit Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/12-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 3b5302cbb06af6b62022360066944a1ff6aea0d1) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- .../iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 113 +++++++++++++++++- 1 file changed, 108 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c index d7e022bb9df53..e0fce31eba54d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -144,6 +144,14 @@ static void arm_smmu_v3_test_ste_expect_transition( KUNIT_EXPECT_MEMEQ(test, target->data, cur_copy.data, sizeof(cur_copy)); } +static void arm_smmu_v3_test_ste_expect_non_hitless_transition( + struct kunit *test, const struct arm_smmu_ste *cur, + const struct arm_smmu_ste *target, unsigned int num_syncs_expected) +{ + arm_smmu_v3_test_ste_expect_transition(test, cur, target, + num_syncs_expected, false); +} + static void arm_smmu_v3_test_ste_expect_hitless_transition( struct kunit *test, const struct arm_smmu_ste *cur, const struct arm_smmu_ste *target, unsigned int num_syncs_expected) @@ -155,6 +163,7 @@ static void arm_smmu_v3_test_ste_expect_hitless_transition( static const dma_addr_t fake_cdtab_dma_addr = 0xF0F0F0F0F0F0; static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste, + unsigned int s1dss, const dma_addr_t dma_addr) { struct arm_smmu_master master = { @@ -164,7 +173,7 @@ static void arm_smmu_test_make_cdtable_ste(struct arm_smmu_ste *ste, .smmu = &smmu, }; - arm_smmu_make_cdtable_ste(ste, &master, true, STRTAB_STE_1_S1DSS_SSID0); + arm_smmu_make_cdtable_ste(ste, &master, true, s1dss); } static void arm_smmu_v3_write_ste_test_bypass_to_abort(struct kunit *test) @@ -194,7 +203,8 @@ static void arm_smmu_v3_write_ste_test_cdtable_to_abort(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &abort_ste, NUM_EXPECTED_SYNCS(2)); } @@ -203,7 +213,8 @@ static void arm_smmu_v3_write_ste_test_abort_to_cdtable(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); arm_smmu_v3_test_ste_expect_hitless_transition(test, &abort_ste, &ste, NUM_EXPECTED_SYNCS(2)); } @@ -212,7 +223,8 @@ static void arm_smmu_v3_write_ste_test_cdtable_to_bypass(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); arm_smmu_v3_test_ste_expect_hitless_transition(test, &ste, &bypass_ste, NUM_EXPECTED_SYNCS(3)); } @@ -221,11 +233,54 @@ static void arm_smmu_v3_write_ste_test_bypass_to_cdtable(struct kunit *test) { struct arm_smmu_ste ste; - arm_smmu_test_make_cdtable_ste(&ste, fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); arm_smmu_v3_test_ste_expect_hitless_transition(test, &bypass_ste, &ste, NUM_EXPECTED_SYNCS(3)); } +static void arm_smmu_v3_write_ste_test_cdtable_s1dss_change(struct kunit *test) +{ + struct arm_smmu_ste ste; + struct arm_smmu_ste s1dss_bypass; + + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS, + fake_cdtab_dma_addr); + + /* + * Flipping s1dss on a CD table STE only involves changes to the second + * qword of an STE and can be 
done in a single write. + */ + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &ste, &s1dss_bypass, NUM_EXPECTED_SYNCS(1)); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &s1dss_bypass, &ste, NUM_EXPECTED_SYNCS(1)); +} + +static void +arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass(struct kunit *test) +{ + struct arm_smmu_ste s1dss_bypass; + + arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS, + fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &s1dss_bypass, &bypass_ste, NUM_EXPECTED_SYNCS(2)); +} + +static void +arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass(struct kunit *test) +{ + struct arm_smmu_ste s1dss_bypass; + + arm_smmu_test_make_cdtable_ste(&s1dss_bypass, STRTAB_STE_1_S1DSS_BYPASS, + fake_cdtab_dma_addr); + arm_smmu_v3_test_ste_expect_hitless_transition( + test, &bypass_ste, &s1dss_bypass, NUM_EXPECTED_SYNCS(2)); +} + static void arm_smmu_test_make_s2_ste(struct arm_smmu_ste *ste, bool ats_enabled) { @@ -285,6 +340,48 @@ static void arm_smmu_v3_write_ste_test_bypass_to_s2(struct kunit *test) NUM_EXPECTED_SYNCS(2)); } +static void arm_smmu_v3_write_ste_test_s1_to_s2(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_test_make_s2_ste(&s2_ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s1_ste, &s2_ste, + NUM_EXPECTED_SYNCS(3)); +} + +static void arm_smmu_v3_write_ste_test_s2_to_s1(struct kunit *test) +{ + struct arm_smmu_ste s1_ste; + struct arm_smmu_ste s2_ste; + + arm_smmu_test_make_cdtable_ste(&s1_ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_test_make_s2_ste(&s2_ste, true); + arm_smmu_v3_test_ste_expect_hitless_transition(test, &s2_ste, &s1_ste, + NUM_EXPECTED_SYNCS(3)); +} + +static void arm_smmu_v3_write_ste_test_non_hitless(struct kunit *test) +{ + struct arm_smmu_ste ste; + struct arm_smmu_ste ste_2; + + /* + * Although no flow resembles this in practice, one way to force an STE + * update to be non-hitless is to change its CD table pointer as well as + * s1 dss field in the same update. 
+ */ + arm_smmu_test_make_cdtable_ste(&ste, STRTAB_STE_1_S1DSS_SSID0, + fake_cdtab_dma_addr); + arm_smmu_test_make_cdtable_ste(&ste_2, STRTAB_STE_1_S1DSS_BYPASS, + 0x4B4B4b4B4B); + arm_smmu_v3_test_ste_expect_non_hitless_transition( + test, &ste, &ste_2, NUM_EXPECTED_SYNCS(3)); +} + static void arm_smmu_v3_test_cd_expect_transition( struct kunit *test, const struct arm_smmu_cd *cur, const struct arm_smmu_cd *target, unsigned int num_syncs_expected, @@ -438,10 +535,16 @@ static struct kunit_case arm_smmu_v3_test_cases[] = { KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_cdtable), KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_to_bypass), KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_cdtable), + KUNIT_CASE(arm_smmu_v3_write_ste_test_cdtable_s1dss_change), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s1dssbypass_to_stebypass), + KUNIT_CASE(arm_smmu_v3_write_ste_test_stebypass_to_s1dssbypass), KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_abort), KUNIT_CASE(arm_smmu_v3_write_ste_test_abort_to_s2), KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_bypass), KUNIT_CASE(arm_smmu_v3_write_ste_test_bypass_to_s2), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s1_to_s2), + KUNIT_CASE(arm_smmu_v3_write_ste_test_s2_to_s1), + KUNIT_CASE(arm_smmu_v3_write_ste_test_non_hitless), KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_clear), KUNIT_CASE(arm_smmu_v3_write_cd_test_s1_change_asid), KUNIT_CASE(arm_smmu_v3_write_cd_test_sva_clear), From a35869129d2ae366d3320d87e9c90db41e9f4fa5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:44 -0300 Subject: [PATCH 191/352] iommu/arm-smmu-v3: Allow a PASID to be set when RID is IDENTITY/BLOCKED If the STE doesn't point to the CD table we can upgrade it by reprogramming the STE with the appropriate S1DSS. We may also need to turn on ATS at the same time. Keep track if the installed STE is pointing at the cd_table and the ATS state to trigger this path. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/13-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 8ee9175c25827240dd84a7adffbfa9c16938ac5d) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 49 ++++++++++++++++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 +- 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 708e75449fcbe..c60cb53085b8e 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2435,6 +2435,9 @@ static void arm_smmu_install_ste_for_dev(struct arm_smmu_master *master, master->cd_table.in_ste = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(target->data[0])) == STRTAB_STE_0_CFG_S1_TRANS; + master->ste_ats_enabled = + FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(target->data[1])) == + STRTAB_STE_1_EATS_TRANS; for (i = 0; i < master->num_streams; ++i) { u32 sid = master->streams[i].id; @@ -2795,10 +2798,36 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return 0; } +static void arm_smmu_update_ste(struct arm_smmu_master *master, + struct iommu_domain *sid_domain, + bool ats_enabled) +{ + unsigned int s1dss = STRTAB_STE_1_S1DSS_TERMINATE; + struct arm_smmu_ste ste; + + if (master->cd_table.in_ste && master->ste_ats_enabled == ats_enabled) + return; + + if (sid_domain->type == IOMMU_DOMAIN_IDENTITY) + s1dss = STRTAB_STE_1_S1DSS_BYPASS; + else + WARN_ON(sid_domain->type != IOMMU_DOMAIN_BLOCKED); + + /* + * Change the STE into a cdtable one with SID IDENTITY/BLOCKED behavior + * using s1dss if necessary. If the cd_table is already installed then + * the S1DSS is correct and this will just update the EATS. Otherwise it + * installs the entire thing. This will be hitless. + */ + arm_smmu_make_cdtable_ste(&ste, master, ats_enabled, s1dss); + arm_smmu_install_ste_for_dev(master, &ste); +} + int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, const struct arm_smmu_cd *cd) { + struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev); struct arm_smmu_attach_state state = { .master = master, /* @@ -2815,8 +2844,10 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master, if (smmu_domain->smmu != master->smmu) return -EINVAL; - if (!master->cd_table.in_ste) - return -ENODEV; + if (!master->cd_table.in_ste && + sid_domain->type != IOMMU_DOMAIN_IDENTITY && + sid_domain->type != IOMMU_DOMAIN_BLOCKED) + return -EINVAL; cdptr = arm_smmu_alloc_cd_ptr(master, pasid); if (!cdptr) @@ -2828,6 +2859,7 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master, goto out_unlock; arm_smmu_write_cd_entry(master, pasid, cdptr, cd); + arm_smmu_update_ste(master, sid_domain, state.ats_enabled); arm_smmu_attach_commit(&state); @@ -2850,6 +2882,19 @@ static void arm_smmu_remove_dev_pasid(struct device *dev, ioasid_t pasid, arm_smmu_atc_inv_master(master, pasid); arm_smmu_remove_master_domain(master, &smmu_domain->domain, pasid); mutex_unlock(&arm_smmu_asid_lock); + + /* + * When the last user of the CD table goes away downgrade the STE back + * to a non-cd_table one. 
+ */ + if (!arm_smmu_ssids_in_use(&master->cd_table)) { + struct iommu_domain *sid_domain = + iommu_get_domain_for_dev(master->dev); + + if (sid_domain->type == IOMMU_DOMAIN_IDENTITY || + sid_domain->type == IOMMU_DOMAIN_BLOCKED) + sid_domain->ops->attach_dev(sid_domain, dev); + } } static void arm_smmu_attach_dev_ste(struct iommu_domain *domain, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 30459a800c7b2..cdd426efb384d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -705,7 +705,8 @@ struct arm_smmu_master { /* Locked by the iommu core using the group mutex */ struct arm_smmu_ctx_desc_cfg cd_table; unsigned int num_streams; - bool ats_enabled; + bool ats_enabled : 1; + bool ste_ats_enabled : 1; bool stall_enabled; bool sva_enabled; bool iopf_enabled; From f66d68130b0f095b107a84764e6426a6a68b24aa Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 25 Jun 2024 09:37:45 -0300 Subject: [PATCH 192/352] iommu/arm-smmu-v3: Allow setting a S1 domain to a PASID The SVA cleanup made the SSID logic entirely general so all we need to do is call it with the correct cd table entry for a S1 domain. This is slightly tricky because of the ASID and how the locking works, the simple fix is to just update the ASID once we get the right locks. Tested-by: Nicolin Chen Tested-by: Shameer Kolothum Reviewed-by: Nicolin Chen Reviewed-by: Jerry Snitselaar Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/14-v9-5cd718286059+79186-smmuv3_newapi_p2b_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit f3b273b7c7e42ff7ef5b6063834d768d33c7ba79) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 41 ++++++++++++++++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 +- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c60cb53085b8e..f486843f40e2a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2798,6 +2798,36 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) return 0; } +static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, + struct device *dev, ioasid_t id) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_device *smmu = master->smmu; + struct arm_smmu_cd target_cd; + int ret = 0; + + mutex_lock(&smmu_domain->init_mutex); + if (!smmu_domain->smmu) + ret = arm_smmu_domain_finalise(smmu_domain, smmu); + else if (smmu_domain->smmu != smmu) + ret = -EINVAL; + mutex_unlock(&smmu_domain->init_mutex); + if (ret) + return ret; + + if (smmu_domain->stage != ARM_SMMU_DOMAIN_S1) + return -EINVAL; + + /* + * We can read cd.asid outside the lock because arm_smmu_set_pasid() + * will fix it + */ + arm_smmu_make_s1_cd(&target_cd, master, smmu_domain); + return arm_smmu_set_pasid(master, to_smmu_domain(domain), id, + &target_cd); +} + static void arm_smmu_update_ste(struct arm_smmu_master *master, struct iommu_domain *sid_domain, bool ats_enabled) @@ -2825,7 +2855,7 @@ static void arm_smmu_update_ste(struct arm_smmu_master *master, int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, - const struct arm_smmu_cd *cd) + struct arm_smmu_cd *cd) { struct iommu_domain *sid_domain = iommu_get_domain_for_dev(master->dev); struct arm_smmu_attach_state state = { @@ -2858,6 +2888,14 @@ int arm_smmu_set_pasid(struct arm_smmu_master *master, if (ret) goto out_unlock; + /* + * We don't want to obtain to the asid_lock too early, so fix up the + * caller set ASID under the lock in case it changed. 
+ */ + cd->data[0] &= ~cpu_to_le64(CTXDESC_CD_0_ASID); + cd->data[0] |= cpu_to_le64( + FIELD_PREP(CTXDESC_CD_0_ASID, smmu_domain->cd.asid)); + arm_smmu_write_cd_entry(master, pasid, cdptr, cd); arm_smmu_update_ste(master, sid_domain, state.ats_enabled); @@ -3376,6 +3414,7 @@ static struct iommu_ops arm_smmu_ops = { .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = arm_smmu_attach_dev, + .set_dev_pasid = arm_smmu_s1_set_dev_pasid, .map_pages = arm_smmu_map_pages, .unmap_pages = arm_smmu_unmap_pages, .flush_iotlb_all = arm_smmu_flush_iotlb_all, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index cdd426efb384d..91ec2d49ecbf2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -801,7 +801,7 @@ void arm_smmu_write_cd_entry(struct arm_smmu_master *master, int ssid, int arm_smmu_set_pasid(struct arm_smmu_master *master, struct arm_smmu_domain *smmu_domain, ioasid_t pasid, - const struct arm_smmu_cd *cd); + struct arm_smmu_cd *cd); void arm_smmu_tlb_inv_asid(struct arm_smmu_device *smmu, u16 asid); void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, From fd4303d01c3db65eab3fd731a2b0d2d2ccee9d0d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Jun 2024 21:31:10 -0300 Subject: [PATCH 193/352] iommu/arm-smmu-v3: Do not zero the strtab twice dmam_alloc_coherent() already returns zero'd memory so cfg->strtab.l1_desc (the list of DMA addresses for the L2 entries) is already zero'd. arm_smmu_init_l1_strtab() goes through and calls arm_smmu_write_strtab_l1_desc() on the newly allocated (and zero'd) struct arm_smmu_strtab_l1_desc, which ends up computing 'val = 0' and zeroing it again. Remove arm_smmu_init_l1_strtab() and just call devm_kcalloc() from arm_smmu_init_strtab_2lvl to allocate the companion struct. Tested-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Link: https://lore.kernel.org/r/1-v2-318ed5f6983b+198f-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit c84c5ab76c9c04b5f1c8cc66ee9313198e89bb11) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 26 +++++---------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index f486843f40e2a..0f26c75a1c6a8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3523,25 +3523,6 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu) PRIQ_ENT_DWORDS, "priq"); } -static int arm_smmu_init_l1_strtab(struct arm_smmu_device *smmu) -{ - unsigned int i; - struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; - void *strtab = smmu->strtab_cfg.strtab; - - cfg->l1_desc = devm_kcalloc(smmu->dev, cfg->num_l1_ents, - sizeof(*cfg->l1_desc), GFP_KERNEL); - if (!cfg->l1_desc) - return -ENOMEM; - - for (i = 0; i < cfg->num_l1_ents; ++i) { - arm_smmu_write_strtab_l1_desc(strtab, &cfg->l1_desc[i]); - strtab += STRTAB_L1_DESC_DWORDS << 3; - } - - return 0; -} - static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) { void *strtab; @@ -3577,7 +3558,12 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) reg |= FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT); cfg->strtab_base_cfg = reg; - return arm_smmu_init_l1_strtab(smmu); + cfg->l1_desc = devm_kcalloc(smmu->dev, cfg->num_l1_ents, + sizeof(*cfg->l1_desc), GFP_KERNEL); + if (!cfg->l1_desc) + return -ENOMEM; + + return 0; } static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) From 271f7f449b84bf4638ebac24f677b9f2e630e793 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 10 Jun 2024 21:31:11 -0300 Subject: [PATCH 194/352] iommu/arm-smmu-v3: Shrink the strtab l1_desc array The top of the 2 level stream table is (at most) 128k entries big, and two high order allocations are required. One of __le64 which is programmed into the HW (1M), and one of struct arm_smmu_strtab_l1_desc which holds the CPU pointer (3M). There is no reason to store the l2ptr_dma as nothing reads it. devm stores a copy of it and the DMA memory will be freed via devm mechanisms. span is a constant of 8+1. Remove both. This removes 16 bytes from each arm_smmu_l1_ctx_desc and saves up to 2M of memory per iommu instance. Tested-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Link: https://lore.kernel.org/r/2-v2-318ed5f6983b+198f-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit a4d75360f7a6d979edd66af577847b0f4dbf4377) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 13 ++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 --- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0f26c75a1c6a8..bf25ab64ba203 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1448,13 +1448,12 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) } /* Stream table manipulation functions */ -static void -arm_smmu_write_strtab_l1_desc(__le64 *dst, struct arm_smmu_strtab_l1_desc *desc) +static void arm_smmu_write_strtab_l1_desc(__le64 *dst, dma_addr_t l2ptr_dma) { u64 val = 0; - val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, desc->span); - val |= desc->l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK; + val |= FIELD_PREP(STRTAB_L1_DESC_SPAN, STRTAB_SPLIT + 1); + val |= l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK; /* The HW has 64 bit atomicity with stores to the L2 STE table */ WRITE_ONCE(*dst, cpu_to_le64(val)); @@ -1663,6 +1662,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) { size_t size; void *strtab; + dma_addr_t l2ptr_dma; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; struct arm_smmu_strtab_l1_desc *desc = &cfg->l1_desc[sid >> STRTAB_SPLIT]; @@ -1672,8 +1672,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3); strtab = &cfg->strtab[(sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS]; - desc->span = STRTAB_SPLIT + 1; - desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &desc->l2ptr_dma, + desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &l2ptr_dma, GFP_KERNEL); if (!desc->l2ptr) { dev_err(smmu->dev, @@ -1683,7 +1682,7 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) } arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); - arm_smmu_write_strtab_l1_desc(strtab, desc); + arm_smmu_write_strtab_l1_desc(strtab, l2ptr_dma); return 0; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 91ec2d49ecbf2..a05e02d6afd1d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -579,10 +579,7 @@ struct arm_smmu_priq { /* High-level stream table and context descriptor structures */ struct arm_smmu_strtab_l1_desc { - u8 span; - struct arm_smmu_ste *l2ptr; - dma_addr_t l2ptr_dma; }; struct arm_smmu_ctx_desc { From ef9d9f67b49a7bec39222b1b4ac4741aa9b4f85b Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Thu, 13 Jun 2024 12:44:17 -0700 Subject: [PATCH 195/352] iommu/arm-smmu-v3: add missing MODULE_DESCRIPTION() macro With ARCH=arm64, make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Signed-off-by: Jeff Johnson Fixes: da55da5a42d4 ("iommu/arm-smmu-v3: Make the kunit into a module") Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240613-md-arm64-drivers-iommu-arm-arm-smmu-v3-v1-1-0e9f7584a5c8@quicinc.com Signed-off-by: Will Deacon (cherry picked from commit a35f443d837ffcd5e73b64c13a46d12701839213) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c index e0fce31eba54d..cceb737a70012 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3-test.c @@ -567,4 +567,5 @@ static struct kunit_suite arm_smmu_v3_test_module = { kunit_test_suites(&arm_smmu_v3_test_module); MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); +MODULE_DESCRIPTION("KUnit tests for arm-smmu-v3 driver"); MODULE_LICENSE("GPL v2"); From 14c53f9287965dcee3454953c1af3767f6ef2747 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 1 Jul 2024 09:20:10 -0700 Subject: [PATCH 196/352] iommu/arm-smmu: Add CB prefix to register bitfields For consistency, add the "CB" prefix to the bitfield defines for context registers. Signed-off-by: Rob Clark Reviewed-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20240701162025.375134-2-robdclark@gmail.com Signed-off-by: Will Deacon (cherry picked from commit d0166022be375ce72e7b220d688740b1c4424ad5) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c | 2 +- .../iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 18 +++---- drivers/iommu/arm/arm-smmu/arm-smmu.c | 8 +-- drivers/iommu/arm/arm-smmu/arm-smmu.h | 50 +++++++++---------- drivers/iommu/arm/arm-smmu/qcom_iommu.c | 4 +- 5 files changed, 41 insertions(+), 41 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c index 957d988b6d832..4b2994b6126df 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c @@ -200,7 +200,7 @@ static irqreturn_t nvidia_smmu_context_fault_bank(int irq, void __iomem *cb_base = nvidia_smmu_page(smmu, inst, smmu->numpage + idx); fsr = readl_relaxed(cb_base + ARM_SMMU_CB_FSR); - if (!(fsr & ARM_SMMU_FSR_FAULT)) + if (!(fsr & ARM_SMMU_CB_FSR_FAULT)) return IRQ_NONE; fsynr = readl_relaxed(cb_base + ARM_SMMU_CB_FSYNR0); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c index 552199cbd9e25..e4ee78fb6a663 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -141,7 +141,7 @@ static int qcom_tbu_halt(struct qcom_tbu *tbu, struct arm_smmu_domain *smmu_doma writel_relaxed(val, tbu->base + DEBUG_SID_HALT_REG); fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if ((fsr & ARM_SMMU_FSR_FAULT) && (fsr & ARM_SMMU_FSR_SS)) { + if ((fsr & ARM_SMMU_CB_FSR_FAULT) && (fsr & ARM_SMMU_CB_FSR_SS)) { u32 sctlr_orig, sctlr; /* @@ -298,7 +298,7 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain, arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, sctlr); fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if (fsr & ARM_SMMU_FSR_FAULT) { + if (fsr & ARM_SMMU_CB_FSR_FAULT) { /* Clear pending interrupts */ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); @@ -306,7 +306,7 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain, * TBU halt takes care of resuming any stalled transcation. * Kept it here for completeness sake. 
*/ - if (fsr & ARM_SMMU_FSR_SS) + if (fsr & ARM_SMMU_CB_FSR_SS) arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, ARM_SMMU_RESUME_TERMINATE); } @@ -320,11 +320,11 @@ static phys_addr_t qcom_iova_to_phys(struct arm_smmu_domain *smmu_domain, phys = qcom_tbu_trigger_atos(smmu_domain, tbu, iova, sid); fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if (fsr & ARM_SMMU_FSR_FAULT) { + if (fsr & ARM_SMMU_CB_FSR_FAULT) { /* Clear pending interrupts */ arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); - if (fsr & ARM_SMMU_FSR_SS) + if (fsr & ARM_SMMU_CB_FSR_SS) arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, ARM_SMMU_RESUME_TERMINATE); } @@ -394,7 +394,7 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) DEFAULT_RATELIMIT_BURST); fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if (!(fsr & ARM_SMMU_FSR_FAULT)) + if (!(fsr & ARM_SMMU_CB_FSR_FAULT)) return IRQ_NONE; fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); @@ -403,7 +403,7 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) if (list_empty(&tbu_list)) { ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, - fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (ret == -ENOSYS) dev_err_ratelimited(smmu->dev, @@ -417,7 +417,7 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) phys_soft = ops->iova_to_phys(ops, iova); tmp = report_iommu_fault(&smmu_domain->domain, NULL, iova, - fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (!tmp || tmp == -EBUSY) { dev_dbg(smmu->dev, "Context fault handled by client: iova=0x%08lx, fsr=0x%x, fsynr=0x%x, cb=%d\n", @@ -481,7 +481,7 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); /* Retry or terminate any stalled transactions */ - if (fsr & ARM_SMMU_FSR_SS) + if (fsr & ARM_SMMU_CB_FSR_SS) arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, resume); } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 87c81f75cf844..23cf91ac409bc 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -415,7 +415,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev) int ret; fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if (!(fsr & ARM_SMMU_FSR_FAULT)) + if (!(fsr & ARM_SMMU_CB_FSR_FAULT)) return IRQ_NONE; fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); @@ -423,7 +423,7 @@ static irqreturn_t arm_smmu_context_fault(int irq, void *dev) cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, - fsynr & ARM_SMMU_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + fsynr & ARM_SMMU_CB_FSYNR0_WNR ? 
IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (ret == -ENOSYS) dev_err_ratelimited(smmu->dev, @@ -1306,7 +1306,7 @@ static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain, arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_ATS1PR, va); reg = arm_smmu_page(smmu, ARM_SMMU_CB(smmu, idx)) + ARM_SMMU_CB_ATSR; - if (readl_poll_timeout_atomic(reg, tmp, !(tmp & ARM_SMMU_ATSR_ACTIVE), + if (readl_poll_timeout_atomic(reg, tmp, !(tmp & ARM_SMMU_CB_ATSR_ACTIVE), 5, 50)) { spin_unlock_irqrestore(&smmu_domain->cb_lock, flags); dev_err(dev, @@ -1642,7 +1642,7 @@ static void arm_smmu_device_reset(struct arm_smmu_device *smmu) /* Make sure all context banks are disabled and clear CB_FSR */ for (i = 0; i < smmu->num_context_banks; ++i) { arm_smmu_write_context_bank(smmu, i); - arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_FSR, ARM_SMMU_FSR_FAULT); + arm_smmu_cb_write(smmu, i, ARM_SMMU_CB_FSR, ARM_SMMU_CB_FSR_FAULT); } /* Invalidate the TLB, just in case */ diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index 4765c6945c344..b04a00126a125 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -196,34 +196,34 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_CB_PAR_F BIT(0) #define ARM_SMMU_CB_FSR 0x58 -#define ARM_SMMU_FSR_MULTI BIT(31) -#define ARM_SMMU_FSR_SS BIT(30) -#define ARM_SMMU_FSR_UUT BIT(8) -#define ARM_SMMU_FSR_ASF BIT(7) -#define ARM_SMMU_FSR_TLBLKF BIT(6) -#define ARM_SMMU_FSR_TLBMCF BIT(5) -#define ARM_SMMU_FSR_EF BIT(4) -#define ARM_SMMU_FSR_PF BIT(3) -#define ARM_SMMU_FSR_AFF BIT(2) -#define ARM_SMMU_FSR_TF BIT(1) - -#define ARM_SMMU_FSR_IGN (ARM_SMMU_FSR_AFF | \ - ARM_SMMU_FSR_ASF | \ - ARM_SMMU_FSR_TLBMCF | \ - ARM_SMMU_FSR_TLBLKF) - -#define ARM_SMMU_FSR_FAULT (ARM_SMMU_FSR_MULTI | \ - ARM_SMMU_FSR_SS | \ - ARM_SMMU_FSR_UUT | \ - ARM_SMMU_FSR_EF | \ - ARM_SMMU_FSR_PF | \ - ARM_SMMU_FSR_TF | \ - ARM_SMMU_FSR_IGN) +#define ARM_SMMU_CB_FSR_MULTI BIT(31) +#define ARM_SMMU_CB_FSR_SS BIT(30) +#define ARM_SMMU_CB_FSR_UUT BIT(8) +#define ARM_SMMU_CB_FSR_ASF BIT(7) +#define ARM_SMMU_CB_FSR_TLBLKF BIT(6) +#define ARM_SMMU_CB_FSR_TLBMCF BIT(5) +#define ARM_SMMU_CB_FSR_EF BIT(4) +#define ARM_SMMU_CB_FSR_PF BIT(3) +#define ARM_SMMU_CB_FSR_AFF BIT(2) +#define ARM_SMMU_CB_FSR_TF BIT(1) + +#define ARM_SMMU_CB_FSR_IGN (ARM_SMMU_CB_FSR_AFF | \ + ARM_SMMU_CB_FSR_ASF | \ + ARM_SMMU_CB_FSR_TLBMCF | \ + ARM_SMMU_CB_FSR_TLBLKF) + +#define ARM_SMMU_CB_FSR_FAULT (ARM_SMMU_CB_FSR_MULTI | \ + ARM_SMMU_CB_FSR_SS | \ + ARM_SMMU_CB_FSR_UUT | \ + ARM_SMMU_CB_FSR_EF | \ + ARM_SMMU_CB_FSR_PF | \ + ARM_SMMU_CB_FSR_TF | \ + ARM_SMMU_CB_FSR_IGN) #define ARM_SMMU_CB_FAR 0x60 #define ARM_SMMU_CB_FSYNR0 0x68 -#define ARM_SMMU_FSYNR0_WNR BIT(4) +#define ARM_SMMU_CB_FSYNR0_WNR BIT(4) #define ARM_SMMU_CB_FSYNR1 0x6c @@ -237,7 +237,7 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_CB_ATS1PR 0x800 #define ARM_SMMU_CB_ATSR 0x8f0 -#define ARM_SMMU_ATSR_ACTIVE BIT(0) +#define ARM_SMMU_CB_ATSR_ACTIVE BIT(0) #define ARM_SMMU_RESUME_TERMINATE BIT(0) diff --git a/drivers/iommu/arm/arm-smmu/qcom_iommu.c b/drivers/iommu/arm/arm-smmu/qcom_iommu.c index e079bb7a993e2..b98a7a598b897 100644 --- a/drivers/iommu/arm/arm-smmu/qcom_iommu.c +++ b/drivers/iommu/arm/arm-smmu/qcom_iommu.c @@ -194,7 +194,7 @@ static irqreturn_t qcom_iommu_fault(int irq, void *dev) fsr = iommu_readl(ctx, ARM_SMMU_CB_FSR); - if (!(fsr & ARM_SMMU_FSR_FAULT)) + if (!(fsr & ARM_SMMU_CB_FSR_FAULT)) return IRQ_NONE; fsynr = iommu_readl(ctx, ARM_SMMU_CB_FSYNR0); @@ -274,7 +274,7 @@ static int 
qcom_iommu_init_domain(struct iommu_domain *domain, /* Clear context bank fault address fault status registers */ iommu_writel(ctx, ARM_SMMU_CB_FAR, 0); - iommu_writel(ctx, ARM_SMMU_CB_FSR, ARM_SMMU_FSR_FAULT); + iommu_writel(ctx, ARM_SMMU_CB_FSR, ARM_SMMU_CB_FSR_FAULT); /* TTBRs */ iommu_writeq(ctx, ARM_SMMU_CB_TTBR0, From 7a7693673f15ae9109967a6faf5e5cd288d883dc Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 1 Jul 2024 09:20:11 -0700 Subject: [PATCH 197/352] iommu/arm-smmu-qcom-debug: Do not print for handled faults Handled faults can be "normal", don't spam dmesg about them. Signed-off-by: Rob Clark Reviewed-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20240701162025.375134-3-robdclark@gmail.com Signed-off-by: Will Deacon (cherry picked from commit 55089781ff7724dd10040231a6d8b791cf24afcd) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c index e4ee78fb6a663..681fbdfc325db 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -419,10 +419,6 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) tmp = report_iommu_fault(&smmu_domain->domain, NULL, iova, fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (!tmp || tmp == -EBUSY) { - dev_dbg(smmu->dev, - "Context fault handled by client: iova=0x%08lx, fsr=0x%x, fsynr=0x%x, cb=%d\n", - iova, fsr, fsynr, idx); - dev_dbg(smmu->dev, "soft iova-to-phys=%pa\n", &phys_soft); ret = IRQ_HANDLED; resume = ARM_SMMU_RESUME_TERMINATE; } else { From 3a64baf9007712ac80d3fd452d1f7cba587a4d61 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 1 Jul 2024 09:20:12 -0700 Subject: [PATCH 198/352] iommu/arm-smmu: Pretty-print context fault related regs Parse out the bitfields for easier-to-read fault messages. Signed-off-by: Rob Clark Reviewed-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20240701162025.375134-4-robdclark@gmail.com Signed-off-by: Will Deacon (cherry picked from commit d525b0af0c3b8275e6f83fa0c0640338ed90661a) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- .../iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 52 +++++--------- drivers/iommu/arm/arm-smmu/arm-smmu.c | 70 +++++++++++++++---- drivers/iommu/arm/arm-smmu/arm-smmu.h | 21 ++++++ 3 files changed, 92 insertions(+), 51 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c index 681fbdfc325db..ef93f825f11f9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -383,64 +383,44 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) struct arm_smmu_domain *smmu_domain = dev; struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; struct arm_smmu_device *smmu = smmu_domain->smmu; - u32 fsr, fsynr, cbfrsynra, resume = 0; + struct arm_smmu_context_fault_info cfi; + u32 resume = 0; int idx = smmu_domain->cfg.cbndx; phys_addr_t phys_soft; - unsigned long iova; int ret, tmp; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if (!(fsr & ARM_SMMU_CB_FSR_FAULT)) - return IRQ_NONE; + arm_smmu_read_context_fault_info(smmu, idx, &cfi); - fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); - iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); - cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); + if (!(cfi.fsr & ARM_SMMU_CB_FSR_FAULT)) + return IRQ_NONE; if (list_empty(&tbu_list)) { - ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, - fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + ret = report_iommu_fault(&smmu_domain->domain, NULL, cfi.iova, + cfi.fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (ret == -ENOSYS) - dev_err_ratelimited(smmu->dev, - "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", - fsr, iova, fsynr, cbfrsynra, idx); + arm_smmu_print_context_fault_info(smmu, idx, &cfi); - arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi.fsr); return IRQ_HANDLED; } - phys_soft = ops->iova_to_phys(ops, iova); + phys_soft = ops->iova_to_phys(ops, cfi.iova); - tmp = report_iommu_fault(&smmu_domain->domain, NULL, iova, - fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + tmp = report_iommu_fault(&smmu_domain->domain, NULL, cfi.iova, + cfi.fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); if (!tmp || tmp == -EBUSY) { ret = IRQ_HANDLED; resume = ARM_SMMU_RESUME_TERMINATE; } else { - phys_addr_t phys_atos = qcom_smmu_verify_fault(smmu_domain, iova, fsr); + phys_addr_t phys_atos = qcom_smmu_verify_fault(smmu_domain, cfi.iova, cfi.fsr); if (__ratelimit(&_rs)) { - dev_err(smmu->dev, - "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", - fsr, iova, fsynr, cbfrsynra, idx); - dev_err(smmu->dev, - "FSR = %08x [%s%s%s%s%s%s%s%s%s], SID=0x%x\n", - fsr, - (fsr & 0x02) ? "TF " : "", - (fsr & 0x04) ? "AFF " : "", - (fsr & 0x08) ? "PF " : "", - (fsr & 0x10) ? "EF " : "", - (fsr & 0x20) ? "TLBMCF " : "", - (fsr & 0x40) ? "TLBLKF " : "", - (fsr & 0x80) ? "MHF " : "", - (fsr & 0x40000000) ? "SS " : "", - (fsr & 0x80000000) ? 
"MULTI " : "", - cbfrsynra); + arm_smmu_print_context_fault_info(smmu, idx, &cfi); dev_err(smmu->dev, "soft iova-to-phys=%pa\n", &phys_soft); @@ -474,10 +454,10 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) */ if (tmp != -EBUSY) { /* Clear the faulting FSR */ - arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi.fsr); /* Retry or terminate any stalled transactions */ - if (fsr & ARM_SMMU_CB_FSR_SS) + if (cfi.fsr & ARM_SMMU_CB_FSR_SS) arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_RESUME, resume); } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 23cf91ac409bc..79ec911ae151f 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -405,32 +405,72 @@ static const struct iommu_flush_ops arm_smmu_s2_tlb_ops_v1 = { .tlb_add_page = arm_smmu_tlb_add_page_s2_v1, }; + +void arm_smmu_read_context_fault_info(struct arm_smmu_device *smmu, int idx, + struct arm_smmu_context_fault_info *cfi) +{ + cfi->iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); + cfi->fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); + cfi->fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); + cfi->cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); +} + +void arm_smmu_print_context_fault_info(struct arm_smmu_device *smmu, int idx, + const struct arm_smmu_context_fault_info *cfi) +{ + dev_dbg(smmu->dev, + "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", + cfi->fsr, cfi->iova, cfi->fsynr, cfi->cbfrsynra, idx); + + dev_err(smmu->dev, "FSR = %08x [%s%sFormat=%u%s%s%s%s%s%s%s%s], SID=0x%x\n", + cfi->fsr, + (cfi->fsr & ARM_SMMU_CB_FSR_MULTI) ? "MULTI " : "", + (cfi->fsr & ARM_SMMU_CB_FSR_SS) ? "SS " : "", + (u32)FIELD_GET(ARM_SMMU_CB_FSR_FORMAT, cfi->fsr), + (cfi->fsr & ARM_SMMU_CB_FSR_UUT) ? " UUT" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_ASF) ? " ASF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_TLBLKF) ? " TLBLKF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_TLBMCF) ? " TLBMCF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_EF) ? " EF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_PF) ? " PF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_AFF) ? " AFF" : "", + (cfi->fsr & ARM_SMMU_CB_FSR_TF) ? " TF" : "", + cfi->cbfrsynra); + + dev_err(smmu->dev, "FSYNR0 = %08x [S1CBNDX=%u%s%s%s%s%s%s PLVL=%u]\n", + cfi->fsynr, + (u32)FIELD_GET(ARM_SMMU_CB_FSYNR0_S1CBNDX, cfi->fsynr), + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_AFR) ? " AFR" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_PTWF) ? " PTWF" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_NSATTR) ? " NSATTR" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_IND) ? " IND" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_PNU) ? " PNU" : "", + (cfi->fsynr & ARM_SMMU_CB_FSYNR0_WNR) ? 
" WNR" : "", + (u32)FIELD_GET(ARM_SMMU_CB_FSYNR0_PLVL, cfi->fsynr)); +} + static irqreturn_t arm_smmu_context_fault(int irq, void *dev) { - u32 fsr, fsynr, cbfrsynra; - unsigned long iova; + struct arm_smmu_context_fault_info cfi; struct arm_smmu_domain *smmu_domain = dev; struct arm_smmu_device *smmu = smmu_domain->smmu; + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); int idx = smmu_domain->cfg.cbndx; int ret; - fsr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSR); - if (!(fsr & ARM_SMMU_CB_FSR_FAULT)) - return IRQ_NONE; + arm_smmu_read_context_fault_info(smmu, idx, &cfi); - fsynr = arm_smmu_cb_read(smmu, idx, ARM_SMMU_CB_FSYNR0); - iova = arm_smmu_cb_readq(smmu, idx, ARM_SMMU_CB_FAR); - cbfrsynra = arm_smmu_gr1_read(smmu, ARM_SMMU_GR1_CBFRSYNRA(idx)); + if (!(cfi.fsr & ARM_SMMU_CB_FSR_FAULT)) + return IRQ_NONE; - ret = report_iommu_fault(&smmu_domain->domain, NULL, iova, - fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); + ret = report_iommu_fault(&smmu_domain->domain, NULL, cfi.iova, + cfi.fsynr & ARM_SMMU_CB_FSYNR0_WNR ? IOMMU_FAULT_WRITE : IOMMU_FAULT_READ); - if (ret == -ENOSYS) - dev_err_ratelimited(smmu->dev, - "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", - fsr, iova, fsynr, cbfrsynra, idx); + if (ret == -ENOSYS && __ratelimit(&rs)) + arm_smmu_print_context_fault_info(smmu, idx, &cfi); - arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, fsr); + arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_FSR, cfi.fsr); return IRQ_HANDLED; } diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h index b04a00126a125..e2aeb511ae903 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h @@ -198,6 +198,7 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_CB_FSR 0x58 #define ARM_SMMU_CB_FSR_MULTI BIT(31) #define ARM_SMMU_CB_FSR_SS BIT(30) +#define ARM_SMMU_CB_FSR_FORMAT GENMASK(10, 9) #define ARM_SMMU_CB_FSR_UUT BIT(8) #define ARM_SMMU_CB_FSR_ASF BIT(7) #define ARM_SMMU_CB_FSR_TLBLKF BIT(6) @@ -223,7 +224,14 @@ enum arm_smmu_cbar_type { #define ARM_SMMU_CB_FAR 0x60 #define ARM_SMMU_CB_FSYNR0 0x68 +#define ARM_SMMU_CB_FSYNR0_PLVL GENMASK(1, 0) #define ARM_SMMU_CB_FSYNR0_WNR BIT(4) +#define ARM_SMMU_CB_FSYNR0_PNU BIT(5) +#define ARM_SMMU_CB_FSYNR0_IND BIT(6) +#define ARM_SMMU_CB_FSYNR0_NSATTR BIT(8) +#define ARM_SMMU_CB_FSYNR0_PTWF BIT(10) +#define ARM_SMMU_CB_FSYNR0_AFR BIT(11) +#define ARM_SMMU_CB_FSYNR0_S1CBNDX GENMASK(23, 16) #define ARM_SMMU_CB_FSYNR1 0x6c @@ -533,4 +541,17 @@ struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu); void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx); int arm_mmu500_reset(struct arm_smmu_device *smmu); +struct arm_smmu_context_fault_info { + unsigned long iova; + u32 fsr; + u32 fsynr; + u32 cbfrsynra; +}; + +void arm_smmu_read_context_fault_info(struct arm_smmu_device *smmu, int idx, + struct arm_smmu_context_fault_info *cfi); + +void arm_smmu_print_context_fault_info(struct arm_smmu_device *smmu, int idx, + const struct arm_smmu_context_fault_info *cfi); + #endif /* _ARM_SMMU_H */ From 53d357ec1bd3cd76fb37e70fad790d2b50f42830 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Tue, 2 Jul 2024 17:01:10 +0800 Subject: [PATCH 199/352] iommu/arm-smmu-qcom: record reason for deferring probe To avoid deferring probe smmu driver silently, record reason for it. 
It can be checked through ../debugfs/devices_deferred as well: /sys/kernel/debug# cat devices_deferred 15000000.iommu arm-smmu: qcom_scm not ready Signed-off-by: Zhenhua Huang Reviewed-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/1719910870-25079-1-git-send-email-quic_zhenhuah@quicinc.com Signed-off-by: Will Deacon (cherry picked from commit 9796cf9b3eb9a0b9502dfe0b3acf63610090ef44) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 25f034677f568..971c6a2e592b9 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -469,7 +469,8 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, /* Check to make sure qcom_scm has finished probing */ if (!qcom_scm_is_available()) - return ERR_PTR(-EPROBE_DEFER); + return ERR_PTR(dev_err_probe(smmu->dev, -EPROBE_DEFER, + "qcom_scm not ready\n")); qsmmu = devm_krealloc(smmu->dev, smmu, sizeof(*qsmmu), GFP_KERNEL); if (!qsmmu) From 7705d5034617e53bab07dd8d3f354f5de368a5b5 Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 3 Jul 2024 11:16:00 +0100 Subject: [PATCH 200/352] iommu/arm-smmu-v3: Add support for domain_alloc_user fn This will be used by iommufd for allocating usr managed domains and is also required when we add support for iommufd based dirty tracking support. Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20240703101604.2576-2-shameerali.kolothum.thodi@huawei.com Signed-off-by: Will Deacon (cherry picked from commit 52acd7d8a4130ad4dda6540dbbb821a92e1c0138) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33 +++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index bf25ab64ba203..ebaa5a4139164 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -36,6 +36,8 @@ module_param(disable_msipolling, bool, 0444); MODULE_PARM_DESC(disable_msipolling, "Disable MSI-based polling for CMD_SYNC completion."); +static struct iommu_ops arm_smmu_ops; + enum arm_smmu_msi_index { EVTQ_MSI_INDEX, GERROR_MSI_INDEX, @@ -3020,6 +3022,34 @@ static struct iommu_domain arm_smmu_blocked_domain = { .ops = &arm_smmu_blocked_ops, }; +static struct iommu_domain * +arm_smmu_domain_alloc_user(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_domain *smmu_domain; + int ret; + + if (flags || parent || user_data) + return ERR_PTR(-EOPNOTSUPP); + + smmu_domain = arm_smmu_domain_alloc(); + if (!smmu_domain) + return ERR_PTR(-ENOMEM); + + smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; + smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; + ret = arm_smmu_domain_finalise(smmu_domain, master->smmu); + if (ret) + goto err_free; + return &smmu_domain->domain; + +err_free: + kfree(smmu_domain); + return ERR_PTR(ret); +} + static int arm_smmu_map_pages(struct iommu_domain *domain, unsigned long iova, phys_addr_t paddr, size_t pgsize, size_t pgcount, int prot, gfp_t gfp, size_t *mapped) @@ -3190,8 +3220,6 @@ static void arm_smmu_remove_master(struct arm_smmu_master *master) kfree(master->streams); } -static struct iommu_ops arm_smmu_ops; - static struct iommu_device *arm_smmu_probe_device(struct device *dev) { int ret; @@ -3399,6 +3427,7 @@ static struct iommu_ops arm_smmu_ops = { .capable = arm_smmu_capable, .domain_alloc_paging = arm_smmu_domain_alloc_paging, .domain_alloc_sva = arm_smmu_sva_domain_alloc, + .domain_alloc_user = arm_smmu_domain_alloc_user, .probe_device = arm_smmu_probe_device, .release_device = arm_smmu_release_device, .device_group = arm_smmu_device_group, From 4f320e44397de6282b589cce76ff1de32b70d584 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Wed, 3 Jul 2024 11:16:01 +0100 Subject: [PATCH 201/352] iommu/arm-smmu-v3: Add feature detection for HTTU If the SMMU supports it and the kernel was built with HTTU support, Probe support for Hardware Translation Table Update (HTTU) which is essentially to enable hardware update of access and dirty flags. Probe and set the smmu::features for Hardware Dirty and Hardware Access bits. This is in preparation, to enable it on the context descriptors of stage 1 format. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Joao Martins Reviewed-by: Jason Gunthorpe Reviewed-by: Ryan Roberts Reviewed-by: Kevin Tian Reviewed-by: Nicolin Chen Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20240703101604.2576-3-shameerali.kolothum.thodi@huawei.com Signed-off-by: Will Deacon (cherry picked from commit 2f8d6178b4fe3e2f50782fa640921a9ee46b6d6f) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 32 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 5 ++++ 2 files changed, 37 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ebaa5a4139164..ed4b06521ee37 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -4013,6 +4013,28 @@ static void arm_smmu_device_iidr_probe(struct arm_smmu_device *smmu) } } +static void arm_smmu_get_httu(struct arm_smmu_device *smmu, u32 reg) +{ + u32 fw_features = smmu->features & (ARM_SMMU_FEAT_HA | ARM_SMMU_FEAT_HD); + u32 hw_features = 0; + + switch (FIELD_GET(IDR0_HTTU, reg)) { + case IDR0_HTTU_ACCESS_DIRTY: + hw_features |= ARM_SMMU_FEAT_HD; + fallthrough; + case IDR0_HTTU_ACCESS: + hw_features |= ARM_SMMU_FEAT_HA; + } + + if (smmu->dev->of_node) + smmu->features |= hw_features; + else if (hw_features != fw_features) + /* ACPI IORT sets the HTTU bits */ + dev_warn(smmu->dev, + "IDR0.HTTU features(0x%x) overridden by FW configuration (0x%x)\n", + hw_features, fw_features); +} + static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) { u32 reg; @@ -4073,6 +4095,8 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) smmu->features |= ARM_SMMU_FEAT_E2H; } + arm_smmu_get_httu(smmu, reg); + /* * The coherency feature as set by FW is used in preference to the ID * register, but warn on mismatch. @@ -4268,6 +4292,14 @@ static int arm_smmu_device_acpi_probe(struct platform_device *pdev, if (iort_smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) smmu->features |= ARM_SMMU_FEAT_COHERENCY; + switch (FIELD_GET(ACPI_IORT_SMMU_V3_HTTU_OVERRIDE, iort_smmu->flags)) { + case IDR0_HTTU_ACCESS_DIRTY: + smmu->features |= ARM_SMMU_FEAT_HD; + fallthrough; + case IDR0_HTTU_ACCESS: + smmu->features |= ARM_SMMU_FEAT_HA; + } + return 0; } #else diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index a05e02d6afd1d..af74b59032b55 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -33,6 +33,9 @@ #define IDR0_ASID16 (1 << 12) #define IDR0_ATS (1 << 10) #define IDR0_HYP (1 << 9) +#define IDR0_HTTU GENMASK(7, 6) +#define IDR0_HTTU_ACCESS 1 +#define IDR0_HTTU_ACCESS_DIRTY 2 #define IDR0_COHACC (1 << 4) #define IDR0_TTF GENMASK(3, 2) #define IDR0_TTF_AARCH64 2 @@ -650,6 +653,8 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_E2H (1 << 18) #define ARM_SMMU_FEAT_NESTING (1 << 19) #define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20) +#define ARM_SMMU_FEAT_HA (1 << 21) +#define ARM_SMMU_FEAT_HD (1 << 22) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) From 5576ffe8cfe182790b007f58e7cd1b66c2909e3b Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Wed, 3 Jul 2024 11:16:02 +0100 Subject: [PATCH 202/352] iommu/io-pgtable-arm: Add read_and_clear_dirty() support .read_and_clear_dirty() IOMMU domain op takes care of reading the dirty bits (i.e. PTE has DBM set and AP[2] clear) and marshalling into a bitmap of a given page size. While reading the dirty bits we also set the PTE AP[2] bit to mark it as writeable-clean depending on read_and_clear_dirty() flags. PTE states with respect to DBM bit:

                        DBM bit   AP[2] ("RDONLY" bit)
  1. writable_clean        1              1
  2. writable_dirty        1              0
  3. read-only             0              1

Reviewed-by: Ryan Roberts Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20240703101604.2576-4-shameerali.kolothum.thodi@huawei.com Signed-off-by: Will Deacon (cherry picked from commit 4fe88fd8b4aecb7f9680bf898811db76b94095a9) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/io-pgtable-arm.c | 114 ++++++++++++++++++++++++++++++++- 1 file changed, 112 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 3d23b924cec16..2e57e86163877 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -76,6 +76,7 @@ #define ARM_LPAE_PTE_NSTABLE (((arm_lpae_iopte)1) << 63) #define ARM_LPAE_PTE_XN (((arm_lpae_iopte)3) << 53) +#define ARM_LPAE_PTE_DBM (((arm_lpae_iopte)1) << 51) #define ARM_LPAE_PTE_AF (((arm_lpae_iopte)1) << 10) #define ARM_LPAE_PTE_SH_NS (((arm_lpae_iopte)0) << 8) #define ARM_LPAE_PTE_SH_OS (((arm_lpae_iopte)2) << 8) @@ -85,7 +86,7 @@ #define ARM_LPAE_PTE_ATTR_LO_MASK (((arm_lpae_iopte)0x3ff) << 2) /* Ignore the contiguous bit for block splitting */ -#define ARM_LPAE_PTE_ATTR_HI_MASK (((arm_lpae_iopte)6) << 52) +#define ARM_LPAE_PTE_ATTR_HI_MASK (ARM_LPAE_PTE_XN | ARM_LPAE_PTE_DBM) #define ARM_LPAE_PTE_ATTR_MASK (ARM_LPAE_PTE_ATTR_LO_MASK | \ ARM_LPAE_PTE_ATTR_HI_MASK) /* Software bit for solving coherency races */ @@ -93,7 +94,11 @@ /* Stage-1 PTE */ #define ARM_LPAE_PTE_AP_UNPRIV (((arm_lpae_iopte)1) << 6) -#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)2) << 6) +#define ARM_LPAE_PTE_AP_RDONLY_BIT 7 +#define ARM_LPAE_PTE_AP_RDONLY (((arm_lpae_iopte)1) << \ + ARM_LPAE_PTE_AP_RDONLY_BIT) +#define ARM_LPAE_PTE_AP_WR_CLEAN_MASK (ARM_LPAE_PTE_AP_RDONLY | \ + ARM_LPAE_PTE_DBM) #define ARM_LPAE_PTE_ATTRINDX_SHIFT 2 #define ARM_LPAE_PTE_nG (((arm_lpae_iopte)1) << 11) @@ -139,6 +144,12 @@ #define iopte_prot(pte) ((pte) & ARM_LPAE_PTE_ATTR_MASK) +#define iopte_writeable_dirty(pte) \ + (((pte) & ARM_LPAE_PTE_AP_WR_CLEAN_MASK) == ARM_LPAE_PTE_DBM) + +#define iopte_set_writeable_clean(ptep) \ + set_bit(ARM_LPAE_PTE_AP_RDONLY_BIT, (unsigned long *)(ptep)) + struct arm_lpae_io_pgtable { struct io_pgtable iop; @@ -160,6 +171,13 @@ static inline bool iopte_leaf(arm_lpae_iopte pte, int lvl, return iopte_type(pte) == ARM_LPAE_PTE_TYPE_BLOCK; } +static inline bool iopte_table(arm_lpae_iopte pte, int lvl) +{ + if (lvl == (ARM_LPAE_MAX_LEVELS - 1)) + return false; + return iopte_type(pte) == ARM_LPAE_PTE_TYPE_TABLE; +} + static arm_lpae_iopte paddr_to_iopte(phys_addr_t paddr, struct arm_lpae_io_pgtable *data) { @@ -726,6 +744,97 @@ static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops, return iopte_to_paddr(pte, data) | iova; } +struct io_pgtable_walk_data { + struct iommu_dirty_bitmap *dirty; + unsigned long flags; + u64 addr; + const u64 end; +}; + +static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data, + struct io_pgtable_walk_data *walk_data, + arm_lpae_iopte *ptep, + int lvl); + +static int io_pgtable_visit_dirty(struct arm_lpae_io_pgtable *data, + struct io_pgtable_walk_data *walk_data, + arm_lpae_iopte *ptep, int lvl) + { + struct io_pgtable *iop = &data->iop; + arm_lpae_iopte pte = READ_ONCE(*ptep); + + if (iopte_leaf(pte, lvl, iop->fmt)) { + size_t size = ARM_LPAE_BLOCK_SIZE(lvl, data); + + if (iopte_writeable_dirty(pte)) { + iommu_dirty_bitmap_record(walk_data->dirty, + walk_data->addr, size); + if (!(walk_data->flags &
IOMMU_DIRTY_NO_CLEAR)) + iopte_set_writeable_clean(ptep); + } + walk_data->addr += size; + return 0; + } + + if (WARN_ON(!iopte_table(pte, lvl))) + return -EINVAL; + + ptep = iopte_deref(pte, data); + return __arm_lpae_iopte_walk_dirty(data, walk_data, ptep, lvl + 1); +} + +static int __arm_lpae_iopte_walk_dirty(struct arm_lpae_io_pgtable *data, + struct io_pgtable_walk_data *walk_data, + arm_lpae_iopte *ptep, + int lvl) +{ + u32 idx; + int max_entries, ret; + + if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS)) + return -EINVAL; + + if (lvl == data->start_level) + max_entries = ARM_LPAE_PGD_SIZE(data) / sizeof(arm_lpae_iopte); + else + max_entries = ARM_LPAE_PTES_PER_TABLE(data); + + for (idx = ARM_LPAE_LVL_IDX(walk_data->addr, lvl, data); + (idx < max_entries) && (walk_data->addr < walk_data->end); ++idx) { + ret = io_pgtable_visit_dirty(data, walk_data, ptep + idx, lvl); + if (ret) + return ret; + } + + return 0; +} + +static int arm_lpae_read_and_clear_dirty(struct io_pgtable_ops *ops, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops); + struct io_pgtable_cfg *cfg = &data->iop.cfg; + struct io_pgtable_walk_data walk_data = { + .dirty = dirty, + .flags = flags, + .addr = iova, + .end = iova + size, + }; + arm_lpae_iopte *ptep = data->pgd; + int lvl = data->start_level; + + if (WARN_ON(!size)) + return -EINVAL; + if (WARN_ON((iova + size - 1) & ~(BIT(cfg->ias) - 1))) + return -EINVAL; + if (data->iop.fmt != ARM_64_LPAE_S1) + return -EINVAL; + + return __arm_lpae_iopte_walk_dirty(data, &walk_data, ptep, lvl); +} + static void arm_lpae_restrict_pgsizes(struct io_pgtable_cfg *cfg) { unsigned long granule, page_sizes; @@ -804,6 +913,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg) .map_pages = arm_lpae_map_pages, .unmap_pages = arm_lpae_unmap_pages, .iova_to_phys = arm_lpae_iova_to_phys, + .read_and_clear_dirty = arm_lpae_read_and_clear_dirty, }; return data; From 0ac0f407fa46962d36d9491e13038be646658c97 Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Wed, 3 Jul 2024 11:16:03 +0100 Subject: [PATCH 203/352] iommu/arm-smmu-v3: Add support for dirty tracking in domain alloc This provides all the infrastructure to enable dirty tracking if the hardware has the capability and domain alloc request for it. Also, add a device_iommu_capable() check in iommufd core for IOMMU_CAP_DIRTY_TRACKING before we request a user domain with dirty tracking support. Please note, we still report no support for IOMMU_CAP_DIRTY_TRACKING as it will finally be enabled in a subsequent patch. Signed-off-by: Joao Martins Reviewed-by: Ryan Roberts Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20240703101604.2576-5-shameerali.kolothum.thodi@huawei.com Signed-off-by: Will Deacon (cherry picked from commit eb054d67b21a53f6ccf3af49a62fb99397b48fc2) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 84 +++++++++++++++------ drivers/iommu/iommufd/hw_pagetable.c | 3 + include/linux/io-pgtable.h | 3 + 3 files changed, 67 insertions(+), 23 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index ed4b06521ee37..1a1c17bf3b7f4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "arm-smmu-v3.h" #include "../../dma-iommu.h" @@ -37,6 +38,7 @@ MODULE_PARM_DESC(disable_msipolling, "Disable MSI-based polling for CMD_SYNC completion."); static struct iommu_ops arm_smmu_ops; +static struct iommu_dirty_ops arm_smmu_dirty_ops; enum arm_smmu_msi_index { EVTQ_MSI_INDEX, @@ -82,7 +84,7 @@ static struct arm_smmu_option_prop arm_smmu_options[] = { }; static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_device *smmu); + struct arm_smmu_device *smmu, u32 flags); static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master); static void parse_driver_options(struct arm_smmu_device *smmu) @@ -2282,7 +2284,7 @@ static struct iommu_domain *arm_smmu_domain_alloc_paging(struct device *dev) struct arm_smmu_master *master = dev_iommu_priv_get(dev); int ret; - ret = arm_smmu_domain_finalise(smmu_domain, master->smmu); + ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, 0); if (ret) { kfree(smmu_domain); return ERR_PTR(ret); @@ -2346,15 +2348,15 @@ static int arm_smmu_domain_finalise_s2(struct arm_smmu_device *smmu, } static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_device *smmu) + struct arm_smmu_device *smmu, u32 flags) { int ret; - unsigned long ias, oas; enum io_pgtable_fmt fmt; struct io_pgtable_cfg pgtbl_cfg; struct io_pgtable_ops *pgtbl_ops; int (*finalise_stage_fn)(struct arm_smmu_device *smmu, struct arm_smmu_domain *smmu_domain); + bool enable_dirty = flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING; /* Restrict the stage to what we can actually support */ if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S1)) @@ -2362,17 +2364,31 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, if (!(smmu->features & ARM_SMMU_FEAT_TRANS_S2)) smmu_domain->stage = ARM_SMMU_DOMAIN_S1; + pgtbl_cfg = (struct io_pgtable_cfg) { + .pgsize_bitmap = smmu->pgsize_bitmap, + .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY, + .tlb = &arm_smmu_flush_ops, + .iommu_dev = smmu->dev, + }; + switch (smmu_domain->stage) { - case ARM_SMMU_DOMAIN_S1: - ias = (smmu->features & ARM_SMMU_FEAT_VAX) ? 52 : 48; - ias = min_t(unsigned long, ias, VA_BITS); - oas = smmu->ias; + case ARM_SMMU_DOMAIN_S1: { + unsigned long ias = (smmu->features & + ARM_SMMU_FEAT_VAX) ? 
52 : 48; + + pgtbl_cfg.ias = min_t(unsigned long, ias, VA_BITS); + pgtbl_cfg.oas = smmu->ias; + if (enable_dirty) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_HD; fmt = ARM_64_LPAE_S1; finalise_stage_fn = arm_smmu_domain_finalise_s1; break; + } case ARM_SMMU_DOMAIN_S2: - ias = smmu->ias; - oas = smmu->oas; + if (enable_dirty) + return -EOPNOTSUPP; + pgtbl_cfg.ias = smmu->ias; + pgtbl_cfg.oas = smmu->oas; fmt = ARM_64_LPAE_S2; finalise_stage_fn = arm_smmu_domain_finalise_s2; break; @@ -2380,15 +2396,6 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, return -EINVAL; } - pgtbl_cfg = (struct io_pgtable_cfg) { - .pgsize_bitmap = smmu->pgsize_bitmap, - .ias = ias, - .oas = oas, - .coherent_walk = smmu->features & ARM_SMMU_FEAT_COHERENCY, - .tlb = &arm_smmu_flush_ops, - .iommu_dev = smmu->dev, - }; - pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain); if (!pgtbl_ops) return -ENOMEM; @@ -2396,6 +2403,8 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, smmu_domain->domain.pgsize_bitmap = pgtbl_cfg.pgsize_bitmap; smmu_domain->domain.geometry.aperture_end = (1UL << pgtbl_cfg.ias) - 1; smmu_domain->domain.geometry.force_aperture = true; + if (enable_dirty && smmu_domain->stage == ARM_SMMU_DOMAIN_S1) + smmu_domain->domain.dirty_ops = &arm_smmu_dirty_ops; ret = finalise_stage_fn(smmu, smmu_domain); if (ret < 0) { @@ -2745,7 +2754,7 @@ static int arm_smmu_attach_dev(struct iommu_domain *domain, struct device *dev) mutex_lock(&smmu_domain->init_mutex); if (!smmu_domain->smmu) { - ret = arm_smmu_domain_finalise(smmu_domain, smmu); + ret = arm_smmu_domain_finalise(smmu_domain, smmu, 0); } else if (smmu_domain->smmu != smmu) ret = -EINVAL; @@ -2810,7 +2819,7 @@ static int arm_smmu_s1_set_dev_pasid(struct iommu_domain *domain, mutex_lock(&smmu_domain->init_mutex); if (!smmu_domain->smmu) - ret = arm_smmu_domain_finalise(smmu_domain, smmu); + ret = arm_smmu_domain_finalise(smmu_domain, smmu, 0); else if (smmu_domain->smmu != smmu) ret = -EINVAL; mutex_unlock(&smmu_domain->init_mutex); @@ -3028,10 +3037,13 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, const struct iommu_user_data *user_data) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; struct arm_smmu_domain *smmu_domain; int ret; - if (flags || parent || user_data) + if (flags & ~PAGING_FLAGS) + return ERR_PTR(-EOPNOTSUPP); + if (parent || user_data) return ERR_PTR(-EOPNOTSUPP); smmu_domain = arm_smmu_domain_alloc(); @@ -3040,7 +3052,7 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; - ret = arm_smmu_domain_finalise(smmu_domain, master->smmu); + ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, flags); if (ret) goto err_free; return &smmu_domain->domain; @@ -3295,6 +3307,27 @@ static void arm_smmu_release_device(struct device *dev) kfree(master); } +static int arm_smmu_read_and_clear_dirty(struct iommu_domain *domain, + unsigned long iova, size_t size, + unsigned long flags, + struct iommu_dirty_bitmap *dirty) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct io_pgtable_ops *ops = smmu_domain->pgtbl_ops; + + return ops->read_and_clear_dirty(ops, iova, size, flags, dirty); +} + +static int arm_smmu_set_dirty_tracking(struct iommu_domain *domain, + bool enabled) +{ + /* + * Always enabled and the dirty bitmap is cleared prior to + * set_dirty_tracking(). 
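+ * With DBM-based tracking the SMMU marks write access in the
+ * PTEs unconditionally once HA/HD are enabled in the CD, so
+ * there is no per-domain switch to flip here; returning 0
+ * simply satisfies the iommu_dirty_ops contract.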
+ */ + return 0; +} + static struct iommu_group *arm_smmu_device_group(struct device *dev) { struct iommu_group *group; @@ -3453,6 +3486,11 @@ static struct iommu_ops arm_smmu_ops = { } }; +static struct iommu_dirty_ops arm_smmu_dirty_ops = { + .read_and_clear_dirty = arm_smmu_read_and_clear_dirty, + .set_dirty_tracking = arm_smmu_set_dirty_tracking, +}; + /* Probing and initialisation functions */ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, struct arm_smmu_queue *q, diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 33d142f8057d7..6d5b2fffeea05 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -114,6 +114,9 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, return ERR_PTR(-EOPNOTSUPP); if (flags & ~valid_flags) return ERR_PTR(-EOPNOTSUPP); + if ((flags & IOMMU_HWPT_ALLOC_DIRTY_TRACKING) && + !device_iommu_capable(idev->dev, IOMMU_CAP_DIRTY_TRACKING)) + return ERR_PTR(-EOPNOTSUPP); hwpt_paging = __iommufd_object_alloc( ictx, hwpt_paging, IOMMUFD_OBJ_HWPT_PAGING, common.obj); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index 86cf1f7ae389a..f9a81761bfced 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -85,6 +85,8 @@ struct io_pgtable_cfg { * * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability * attributes set in the TCR for a non-coherent page-table walker. + * + * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable. */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -92,6 +94,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_MTK_TTBR_EXT BIT(4) #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) + #define IO_PGTABLE_QUIRK_ARM_HD BIT(7) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; From f2379c6d0bd98c2ff0c4ad7899f2f423c10fdfcd Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Wed, 3 Jul 2024 11:16:04 +0100 Subject: [PATCH 204/352] iommu/arm-smmu-v3: Enable HTTU for stage1 with io-pgtable mapping MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If io-pgtable quirk flag indicates support for hardware update of dirty state, enable HA/HD bits in the SMMU CD and also set the DBM bit in the page descriptor. Now report the dirty page tracking capability of SMMUv3 and select IOMMUFD_DRIVER for ARM_SMMU_V3 if IOMMUFD is enabled. Co-developed-by: Keqian Zhu Signed-off-by: Keqian Zhu Signed-off-by: Kunkun Jiang Signed-off-by: Joao Martins Reviewed-by: Ryan Roberts Reviewed-by: Jason Gunthorpe Reviewed-by: Nicolin Chen Signed-off-by: Shameer Kolothum Link: https://lore.kernel.org/r/20240703101604.2576-6-shameerali.kolothum.thodi@huawei.com Signed-off-by: Will Deacon (cherry picked from commit 25c776dd03b3e3ee16ad3402feabe20d811c7cb2) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/Kconfig | 1 + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 15 +++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 +++ drivers/iommu/io-pgtable-arm.c | 5 ++++- 4 files changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index 2591fea0a2bee..ea413284af28e 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -394,6 +394,7 @@ config ARM_SMMU_V3 select IOMMU_API select IOMMU_IO_PGTABLE_LPAE select GENERIC_MSI_IRQ + select IOMMUFD_DRIVER if IOMMUFD help Support for implementations of the ARM System MMU architecture version 3 providing translation support to a PCIe root complex. diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 1a1c17bf3b7f4..09011f6b94ff2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1352,6 +1352,12 @@ void arm_smmu_make_s1_cd(struct arm_smmu_cd *target, CTXDESC_CD_0_ASET | FIELD_PREP(CTXDESC_CD_0_ASID, cd->asid) ); + + /* To enable dirty flag update, set both Access flag and dirty state update */ + if (pgtbl_cfg->quirks & IO_PGTABLE_QUIRK_ARM_HD) + target->data[0] |= cpu_to_le64(CTXDESC_CD_0_TCR_HA | + CTXDESC_CD_0_TCR_HD); + target->data[1] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.ttbr & CTXDESC_CD_1_TTB0_MASK); target->data[3] = cpu_to_le64(pgtbl_cfg->arm_lpae_s1_cfg.mair); @@ -2235,6 +2241,13 @@ static const struct iommu_flush_ops arm_smmu_flush_ops = { .tlb_add_page = arm_smmu_tlb_inv_page_nosync, }; +static bool arm_smmu_dbm_capable(struct arm_smmu_device *smmu) +{ + u32 features = (ARM_SMMU_FEAT_HD | ARM_SMMU_FEAT_COHERENCY); + + return (smmu->features & features) == features; +} + /* IOMMU API */ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) { @@ -2247,6 +2260,8 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) case IOMMU_CAP_NOEXEC: case IOMMU_CAP_DEFERRED_FLUSH: return true; + case IOMMU_CAP_DIRTY_TRACKING: + return arm_smmu_dbm_capable(master->smmu); default: return false; } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index af74b59032b55..14bca41a981b4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -304,6 +304,9 @@ struct arm_smmu_cd { #define CTXDESC_CD_0_TCR_IPS GENMASK_ULL(34, 32) #define CTXDESC_CD_0_TCR_TBI0 (1ULL << 38) +#define CTXDESC_CD_0_TCR_HA (1UL << 43) +#define CTXDESC_CD_0_TCR_HD (1UL << 42) + #define CTXDESC_CD_0_AA64 (1UL << 41) #define CTXDESC_CD_0_S (1UL << 44) #define CTXDESC_CD_0_R (1UL << 45) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index 2e57e86163877..f5d9fd1f45bf4 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -440,6 +440,8 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, pte = ARM_LPAE_PTE_nG; if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ)) pte |= ARM_LPAE_PTE_AP_RDONLY; + else if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_HD) + pte |= ARM_LPAE_PTE_DBM; if (!(prot & IOMMU_PRIV)) pte |= ARM_LPAE_PTE_AP_UNPRIV; } else { @@ -929,7 +931,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_ARM_TTBR1 | - IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)) + IO_PGTABLE_QUIRK_ARM_OUTER_WBWA | + IO_PGTABLE_QUIRK_ARM_HD)) return NULL; data = arm_lpae_alloc_pgtable(cfg); From 
7b5082f19cdf9ff22b6c00ffec43a03f025af992 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:35 +0800 Subject: [PATCH 205/352] iommu: Introduce domain attachment handle Currently, when attaching a domain to a device or its PASID, domain is stored within the iommu group. It could be retrieved for use during the window between attachment and detachment. With new features introduced, there's a need to store more information than just a domain pointer. This information essentially represents the association between a domain and a device. For example, the SVA code already has a custom struct iommu_sva which represents a bond between sva domain and a PASID of a device. Looking forward, the IOMMUFD needs a place to store the iommufd_device pointer in the core, so that the device object ID could be quickly retrieved in the critical fault handling path. Introduce domain attachment handle that explicitly represents the attachment relationship between a domain and a device or its PASID. Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240702063444.105814-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon (cherry picked from commit 14678219cf4093e897ab353fd78eab7994d1be7d) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/dma/idxd/init.c | 2 +- drivers/iommu/iommu-sva.c | 13 ++++++++----- drivers/iommu/iommu.c | 26 ++++++++++++++++---------- include/linux/iommu.h | 18 +++++++++++++++--- 4 files changed, 40 insertions(+), 19 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index bbe1776d94d8c..34122ca2ceaa4 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -584,7 +584,7 @@ static int idxd_enable_system_pasid(struct idxd_device *idxd) * DMA domain is owned by the driver, it should support all valid * types such as DMA-FQ, identity, etc. */ - ret = iommu_attach_device_pasid(domain, dev, pasid); + ret = iommu_attach_device_pasid(domain, dev, pasid, NULL); if (ret) { dev_err(dev, "failed to attach device pasid %d, domain type %d", pasid, domain->type); diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 25e5812992264..1a737ab0a73ad 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -101,7 +101,9 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm /* Search for an existing domain. 
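 * A single SVA domain is created per mm and kept on
 * mm->iommu_mm->sva_domains, so a second device binding to the
 * same mm reuses an existing domain instead of allocating one.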
*/ list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) { - ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); + handle->handle.domain = domain; + ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, + &handle->handle); if (!ret) { domain->users++; goto out; @@ -115,7 +117,9 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm goto out_free_handle; } - ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid); + handle->handle.domain = domain; + ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, + &handle->handle); if (ret) goto out_free_domain; domain->users = 1; @@ -126,7 +130,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm list_add(&handle->handle_item, &mm->iommu_mm->sva_handles); mutex_unlock(&iommu_sva_lock); handle->dev = dev; - handle->domain = domain; return handle; out_free_domain: @@ -149,7 +152,7 @@ EXPORT_SYMBOL_GPL(iommu_sva_bind_device); */ void iommu_sva_unbind_device(struct iommu_sva *handle) { - struct iommu_domain *domain = handle->domain; + struct iommu_domain *domain = handle->handle.domain; struct iommu_mm_data *iommu_mm = domain->mm->iommu_mm; struct device *dev = handle->dev; @@ -172,7 +175,7 @@ EXPORT_SYMBOL_GPL(iommu_sva_unbind_device); u32 iommu_sva_get_pasid(struct iommu_sva *handle) { - struct iommu_domain *domain = handle->domain; + struct iommu_domain *domain = handle->handle.domain; return mm_get_enqcmd_pasid(domain->mm); } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 3183b0ed4cdb9..9db39bad5bb03 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3358,16 +3358,17 @@ static void __iommu_remove_group_pasid(struct iommu_group *group, * @domain: the iommu domain. * @dev: the attached device. * @pasid: the pasid of the device. + * @handle: the attach handle. * * Return: 0 on success, or an error. */ int iommu_attach_device_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle) { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; struct group_device *device; - void *curr; int ret; if (!domain->ops->set_dev_pasid) @@ -3388,11 +3389,12 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, } } - curr = xa_cmpxchg(&group->pasid_array, pasid, NULL, domain, GFP_KERNEL); - if (curr) { - ret = xa_err(curr) ? 
: -EBUSY; + if (handle) + handle->domain = domain; + + ret = xa_insert(&group->pasid_array, pasid, handle, GFP_KERNEL); + if (ret) goto out_unlock; - } ret = __iommu_set_group_pasid(domain, group, pasid); if (ret) @@ -3420,7 +3422,7 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, mutex_lock(&group->mutex); __iommu_remove_group_pasid(group, pasid, domain); - WARN_ON(xa_erase(&group->pasid_array, pasid) != domain); + xa_erase(&group->pasid_array, pasid); mutex_unlock(&group->mutex); } EXPORT_SYMBOL_GPL(iommu_detach_device_pasid); @@ -3445,15 +3447,19 @@ struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, { /* Caller must be a probed driver on dev */ struct iommu_group *group = dev->iommu_group; - struct iommu_domain *domain; + struct iommu_attach_handle *handle; + struct iommu_domain *domain = NULL; if (!group) return NULL; xa_lock(&group->pasid_array); - domain = xa_load(&group->pasid_array, pasid); + handle = xa_load(&group->pasid_array, pasid); + if (handle) + domain = handle->domain; + if (type && domain && domain->type != type) - domain = ERR_PTR(-EBUSY); + domain = NULL; xa_unlock(&group->pasid_array); return domain; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 365e7d3d418c5..f4ed90e8a6153 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -974,12 +974,22 @@ struct iommu_fwspec { /* ATS is supported */ #define IOMMU_FWSPEC_PCI_RC_ATS (1 << 0) +/* + * An iommu attach handle represents a relationship between an iommu domain + * and a PASID or RID of a device. It is allocated and managed by the component + * that manages the domain and is stored in the iommu group during the time the + * domain is attached. + */ +struct iommu_attach_handle { + struct iommu_domain *domain; +}; + /** * struct iommu_sva - handle to a device-mm bond */ struct iommu_sva { + struct iommu_attach_handle handle; struct device *dev; - struct iommu_domain *domain; struct list_head handle_item; refcount_t users; }; @@ -1037,7 +1047,8 @@ int iommu_device_claim_dma_owner(struct device *dev, void *owner); void iommu_device_release_dma_owner(struct device *dev); int iommu_attach_device_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid); + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle); void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); struct iommu_domain * @@ -1373,7 +1384,8 @@ static inline int iommu_device_claim_dma_owner(struct device *dev, void *owner) } static inline int iommu_attach_device_pasid(struct iommu_domain *domain, - struct device *dev, ioasid_t pasid) + struct device *dev, ioasid_t pasid, + struct iommu_attach_handle *handle) { return -ENODEV; } From a2cae7bf2b408afa563eb29880b4ed40fe9277f0 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:36 +0800 Subject: [PATCH 206/352] iommu: Remove sva handle list The struct sva_iommu represents an association between an SVA domain and a PASID of a device. It's stored in the iommu group's pasid array and also tracked by a list in the per-mm data structure. Removes duplicate tracking of sva_iommu by eliminating the list. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240702063444.105814-3-baolu.lu@linux.intel.com Signed-off-by: Will Deacon (cherry picked from commit 3e7f57d1ef3f5fbed58974fae38d35e430f57d35) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iommu-priv.h | 3 +++ drivers/iommu/iommu-sva.c | 30 ++++++++++++++++++++---------- drivers/iommu/iommu.c | 31 +++++++++++++++++++++++++++++++ include/linux/iommu.h | 2 -- 4 files changed, 54 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index 5f731d994803c..f1536a5ebb0dc 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -28,4 +28,7 @@ void iommu_device_unregister_bus(struct iommu_device *iommu, const struct bus_type *bus, struct notifier_block *nb); +struct iommu_attach_handle *iommu_attach_handle_get(struct iommu_group *group, + ioasid_t pasid, + unsigned int type); #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 1a737ab0a73ad..8d1b8c897b608 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -43,7 +43,6 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de } iommu_mm->pasid = pasid; INIT_LIST_HEAD(&iommu_mm->sva_domains); - INIT_LIST_HEAD(&iommu_mm->sva_handles); /* * Make sure the write to mm->iommu_mm is not reordered in front of * initialization to iommu_mm fields. If it does, readers may see a @@ -71,11 +70,16 @@ static struct iommu_mm_data *iommu_alloc_mm_data(struct mm_struct *mm, struct de */ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm) { + struct iommu_group *group = dev->iommu_group; + struct iommu_attach_handle *attach_handle; struct iommu_mm_data *iommu_mm; struct iommu_domain *domain; struct iommu_sva *handle; int ret; + if (!group) + return ERR_PTR(-ENODEV); + mutex_lock(&iommu_sva_lock); /* Allocate mm->pasid if necessary. */ @@ -85,12 +89,22 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm goto out_unlock; } - list_for_each_entry(handle, &mm->iommu_mm->sva_handles, handle_item) { - if (handle->dev == dev) { - refcount_inc(&handle->users); - mutex_unlock(&iommu_sva_lock); - return handle; + /* A bond already exists, just take a reference`. */ + attach_handle = iommu_attach_handle_get(group, iommu_mm->pasid, IOMMU_DOMAIN_SVA); + if (!IS_ERR(attach_handle)) { + handle = container_of(attach_handle, struct iommu_sva, handle); + if (attach_handle->domain->mm != mm) { + ret = -EBUSY; + goto out_unlock; } + refcount_inc(&handle->users); + mutex_unlock(&iommu_sva_lock); + return handle; + } + + if (PTR_ERR(attach_handle) != -ENOENT) { + ret = PTR_ERR(attach_handle); + goto out_unlock; } handle = kzalloc(sizeof(*handle), GFP_KERNEL); @@ -101,7 +115,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm /* Search for an existing domain. 
*/ list_for_each_entry(domain, &mm->iommu_mm->sva_domains, next) { - handle->handle.domain = domain; ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, &handle->handle); if (!ret) { @@ -117,7 +130,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm goto out_free_handle; } - handle->handle.domain = domain; ret = iommu_attach_device_pasid(domain, dev, iommu_mm->pasid, &handle->handle); if (ret) @@ -127,7 +139,6 @@ struct iommu_sva *iommu_sva_bind_device(struct device *dev, struct mm_struct *mm out: refcount_set(&handle->users, 1); - list_add(&handle->handle_item, &mm->iommu_mm->sva_handles); mutex_unlock(&iommu_sva_lock); handle->dev = dev; return handle; @@ -161,7 +172,6 @@ void iommu_sva_unbind_device(struct iommu_sva *handle) mutex_unlock(&iommu_sva_lock); return; } - list_del(&handle->handle_item); iommu_detach_device_pasid(domain, dev, iommu_mm->pasid); if (--domain->users == 0) { diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 9db39bad5bb03..7ac085d297605 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3492,3 +3492,34 @@ void iommu_free_global_pasid(ioasid_t pasid) ida_free(&iommu_global_pasid_ida, pasid); } EXPORT_SYMBOL_GPL(iommu_free_global_pasid); + +/** + * iommu_attach_handle_get - Return the attach handle + * @group: the iommu group that domain was attached to + * @pasid: the pasid within the group + * @type: matched domain type, 0 for any match + * + * Return handle or ERR_PTR(-ENOENT) on none, ERR_PTR(-EBUSY) on mismatch. + * + * Return the attach handle to the caller. The life cycle of an iommu attach + * handle is from the time when the domain is attached to the time when the + * domain is detached. Callers are required to synchronize the call of + * iommu_attach_handle_get() with domain attachment and detachment. The attach + * handle can only be used during its life cycle. + */ +struct iommu_attach_handle * +iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type) +{ + struct iommu_attach_handle *handle; + + xa_lock(&group->pasid_array); + handle = xa_load(&group->pasid_array, pasid); + if (!handle) + handle = ERR_PTR(-ENOENT); + else if (type && handle->domain->type != type) + handle = ERR_PTR(-EBUSY); + xa_unlock(&group->pasid_array); + + return handle; +} +EXPORT_SYMBOL_NS_GPL(iommu_attach_handle_get, IOMMUFD_INTERNAL); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f4ed90e8a6153..5ae2a7eb54fa3 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -990,14 +990,12 @@ struct iommu_attach_handle { struct iommu_sva { struct iommu_attach_handle handle; struct device *dev; - struct list_head handle_item; refcount_t users; }; struct iommu_mm_data { u32 pasid; struct list_head sva_domains; - struct list_head sva_handles; }; int iommu_fwspec_init(struct device *dev, struct fwnode_handle *iommu_fwnode, From b8817df0f88460ed28c3296267297b8dc015413e Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:37 +0800 Subject: [PATCH 207/352] iommu: Add attach handle to struct iopf_group Previously, the domain that a page fault targets is stored in an iopf_group, which represents a minimal set of page faults. With the introduction of attach handle, replace the domain with the handle so that the fault handler can obtain more information as needed when handling the faults. iommu_report_device_fault() is currently used for SVA page faults, which handles the page fault in an internal cycle. 
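Because the handle is embedded in the owner's own bookkeeping structure (as struct iommu_sva does above), whoever gets the handle back can recover its private context with container_of(); a minimal sketch, where struct my_bond and its private field are illustrative assumptions rather than code from this series:

    #include <linux/container_of.h>
    #include <linux/iommu.h>

    struct my_bond {
            struct iommu_attach_handle handle; /* embedded, not a pointer */
            void *private;                     /* owner-specific state */
    };

    static struct my_bond *my_bond_from_handle(struct iommu_attach_handle *h)
    {
            /* Valid because 'handle' is a member of struct my_bond. */
            return container_of(h, struct my_bond, handle);
    }
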
The domain is retrieved with iommu_get_domain_for_dev_pasid() if the pasid in the fault message is valid. This doesn't work in IOMMUFD case, where if the pasid table of a device is wholly managed by user space, there is no domain attached to the PASID of the device, and all page faults are forwarded through a NESTING domain attaching to RID. Add a static flag in iommu ops, which indicates if the IOMMU driver supports user-managed PASID tables. In the iopf deliver path, if no attach handle found for the iopf PASID, roll back to RID domain when the IOMMU driver supports this capability. iommu_get_domain_for_dev_pasid() is no longer used and can be removed. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240702063444.105814-4-baolu.lu@linux.intel.com Signed-off-by: Will Deacon (cherry picked from commit 06cdcc32d65759d42c6340700796e2906045b6a5) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/io-pgfault.c | 61 +++++++++++++++++++++----------------- drivers/iommu/iommu-sva.c | 3 +- drivers/iommu/iommu.c | 39 ------------------------ include/linux/iommu.h | 17 ++++------- 4 files changed, 42 insertions(+), 78 deletions(-) diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 06d78fcc79fdb..7c9011992d3f0 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -59,30 +59,6 @@ void iopf_free_group(struct iopf_group *group) } EXPORT_SYMBOL_GPL(iopf_free_group); -static struct iommu_domain *get_domain_for_iopf(struct device *dev, - struct iommu_fault *fault) -{ - struct iommu_domain *domain; - - if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { - domain = iommu_get_domain_for_dev_pasid(dev, fault->prm.pasid, 0); - if (IS_ERR(domain)) - domain = NULL; - } else { - domain = iommu_get_domain_for_dev(dev); - } - - if (!domain || !domain->iopf_handler) { - dev_warn_ratelimited(dev, - "iopf (pasid %d) without domain attached or handler installed\n", - fault->prm.pasid); - - return NULL; - } - - return domain; -} - /* Non-last request of a group. Postpone until the last one. */ static int report_partial_fault(struct iommu_fault_param *fault_param, struct iommu_fault *fault) @@ -206,20 +182,51 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) if (group == &abort_group) goto err_abort; - group->domain = get_domain_for_iopf(dev, fault); - if (!group->domain) + if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { + group->attach_handle = iommu_attach_handle_get(dev->iommu_group, + fault->prm.pasid, + 0); + if (IS_ERR(group->attach_handle)) { + const struct iommu_ops *ops = dev_iommu_ops(dev); + + if (!ops->user_pasid_table) + goto err_abort; + + /* + * The iommu driver for this device supports user- + * managed PASID table. Therefore page faults for + * any PASID should go through the NESTING domain + * attached to the device RID. 
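+ * In that configuration the kernel holds no per-PASID
+ * attach handle at all, so the handle registered for
+ * the RID's nested parent domain is the only possible
+ * delivery point for the fault.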
+ */ + group->attach_handle = + iommu_attach_handle_get(dev->iommu_group, + IOMMU_NO_PASID, + IOMMU_DOMAIN_NESTED); + if (IS_ERR(group->attach_handle)) + goto err_abort; + } + } else { + group->attach_handle = + iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); + if (IS_ERR(group->attach_handle)) + goto err_abort; + } + + if (!group->attach_handle->domain->iopf_handler) goto err_abort; /* * On success iopf_handler must call iopf_group_response() and * iopf_free_group() */ - if (group->domain->iopf_handler(group)) + if (group->attach_handle->domain->iopf_handler(group)) goto err_abort; return; err_abort: + dev_warn_ratelimited(dev, "iopf with pasid %d aborted\n", + fault->prm.pasid); iopf_group_response(group, IOMMU_PAGE_RESP_FAILURE); if (group == &abort_group) __iopf_free_group(group); diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 8d1b8c897b608..503c5d23c1ea2 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -274,7 +274,8 @@ static void iommu_sva_handle_iopf(struct work_struct *work) if (status != IOMMU_PAGE_RESP_SUCCESS) break; - status = iommu_sva_handle_mm(&iopf->fault, group->domain->mm); + status = iommu_sva_handle_mm(&iopf->fault, + group->attach_handle->domain->mm); } iopf_group_response(group, status); diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 7ac085d297605..26abe52f3d61b 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3427,45 +3427,6 @@ void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, } EXPORT_SYMBOL_GPL(iommu_detach_device_pasid); -/* - * iommu_get_domain_for_dev_pasid() - Retrieve domain for @pasid of @dev - * @dev: the queried device - * @pasid: the pasid of the device - * @type: matched domain type, 0 for any match - * - * This is a variant of iommu_get_domain_for_dev(). It returns the existing - * domain attached to pasid of a device. Callers must hold a lock around this - * function, and both iommu_attach/detach_dev_pasid() whenever a domain of - * type is being manipulated. This API does not internally resolve races with - * attach/detach. - * - * Return: attached domain on success, NULL otherwise. - */ -struct iommu_domain *iommu_get_domain_for_dev_pasid(struct device *dev, - ioasid_t pasid, - unsigned int type) -{ - /* Caller must be a probed driver on dev */ - struct iommu_group *group = dev->iommu_group; - struct iommu_attach_handle *handle; - struct iommu_domain *domain = NULL; - - if (!group) - return NULL; - - xa_lock(&group->pasid_array); - handle = xa_load(&group->pasid_array, pasid); - if (handle) - domain = handle->domain; - - if (type && domain && domain->type != type) - domain = NULL; - xa_unlock(&group->pasid_array); - - return domain; -} -EXPORT_SYMBOL_GPL(iommu_get_domain_for_dev_pasid); - ioasid_t iommu_alloc_global_pasid(struct device *dev) { int ret; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5ae2a7eb54fa3..5c0c91afc1ab5 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -128,7 +128,7 @@ struct iopf_group { /* list node for iommu_fault_param::faults */ struct list_head pending_node; struct work_struct work; - struct iommu_domain *domain; + struct iommu_attach_handle *attach_handle; /* The device's fault data parameter. */ struct iommu_fault_param *fault_param; }; @@ -548,6 +548,10 @@ static inline int __iommu_copy_struct_from_user_array( * @default_domain: If not NULL this will always be set as the default domain. * This should be an IDENTITY/BLOCKED/PLATFORM domain. 
* Do not use in new drivers. + * @user_pasid_table: IOMMU driver supports user-managed PASID table. There is + * no user domain for each PASID and the I/O page faults are + * forwarded through the user domain attached to the device + * RID. */ struct iommu_ops { bool (*capable)(struct device *dev, enum iommu_cap); @@ -591,6 +595,7 @@ struct iommu_ops { struct iommu_domain *blocked_domain; struct iommu_domain *release_domain; struct iommu_domain *default_domain; + u8 user_pasid_table:1; }; /** @@ -1049,9 +1054,6 @@ int iommu_attach_device_pasid(struct iommu_domain *domain, struct iommu_attach_handle *handle); void iommu_detach_device_pasid(struct iommu_domain *domain, struct device *dev, ioasid_t pasid); -struct iommu_domain * -iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, - unsigned int type); ioasid_t iommu_alloc_global_pasid(struct device *dev); void iommu_free_global_pasid(ioasid_t pasid); #else /* CONFIG_IOMMU_API */ @@ -1393,13 +1395,6 @@ static inline void iommu_detach_device_pasid(struct iommu_domain *domain, { } -static inline struct iommu_domain * -iommu_get_domain_for_dev_pasid(struct device *dev, ioasid_t pasid, - unsigned int type) -{ - return NULL; -} - static inline ioasid_t iommu_alloc_global_pasid(struct device *dev) { return IOMMU_PASID_INVALID; From 254f151e3d7a2e4002e24e6e0308ee9ab8ea6279 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:38 +0800 Subject: [PATCH 208/352] iommu: Extend domain attach group with handle support Unlike the SVA case where each PASID of a device has an SVA domain attached to it, the I/O page faults are handled by the fault handler of the SVA domain. The I/O page faults for a user page table might be handled by the domain attached to RID or the domain attached to the PASID, depending on whether the PASID table is managed by user space or kernel. As a result, there is a need for the domain attach group interfaces to have attach handle support. The attach handle will be forwarded to the fault handler of the user domain. Add some variants of the domain attaching group interfaces so that they could support the attach handle and export them for use in IOMMUFD. Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20240702063444.105814-5-baolu.lu@linux.intel.com Signed-off-by: Will Deacon (cherry picked from commit 8519e689834a3ecf9a36332a9abb1844bd34e459) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iommu-priv.h | 8 +++ drivers/iommu/iommu.c | 103 +++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/drivers/iommu/iommu-priv.h b/drivers/iommu/iommu-priv.h index f1536a5ebb0dc..c37801c32f331 100644 --- a/drivers/iommu/iommu-priv.h +++ b/drivers/iommu/iommu-priv.h @@ -31,4 +31,12 @@ void iommu_device_unregister_bus(struct iommu_device *iommu, struct iommu_attach_handle *iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int type); +int iommu_attach_group_handle(struct iommu_domain *domain, + struct iommu_group *group, + struct iommu_attach_handle *handle); +void iommu_detach_group_handle(struct iommu_domain *domain, + struct iommu_group *group); +int iommu_replace_group_handle(struct iommu_group *group, + struct iommu_domain *new_domain, + struct iommu_attach_handle *handle); #endif /* __LINUX_IOMMU_PRIV_H */ diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 26abe52f3d61b..5ddb21e30a180 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -3484,3 +3484,106 @@ iommu_attach_handle_get(struct iommu_group *group, ioasid_t pasid, unsigned int return handle; } EXPORT_SYMBOL_NS_GPL(iommu_attach_handle_get, IOMMUFD_INTERNAL); + +/** + * iommu_attach_group_handle - Attach an IOMMU domain to an IOMMU group + * @domain: IOMMU domain to attach + * @group: IOMMU group that will be attached + * @handle: attach handle + * + * Returns 0 on success and error code on failure. + * + * This is a variant of iommu_attach_group(). It allows the caller to provide + * an attach handle and use it when the domain is attached. This is currently + * used by IOMMUFD to deliver the I/O page faults. + */ +int iommu_attach_group_handle(struct iommu_domain *domain, + struct iommu_group *group, + struct iommu_attach_handle *handle) +{ + int ret; + + if (handle) + handle->domain = domain; + + mutex_lock(&group->mutex); + ret = xa_insert(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + if (ret) + goto err_unlock; + + ret = __iommu_attach_group(domain, group); + if (ret) + goto err_erase; + mutex_unlock(&group->mutex); + + return 0; +err_erase: + xa_erase(&group->pasid_array, IOMMU_NO_PASID); +err_unlock: + mutex_unlock(&group->mutex); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_attach_group_handle, IOMMUFD_INTERNAL); + +/** + * iommu_detach_group_handle - Detach an IOMMU domain from an IOMMU group + * @domain: IOMMU domain to attach + * @group: IOMMU group that will be attached + * + * Detach the specified IOMMU domain from the specified IOMMU group. + * It must be used in conjunction with iommu_attach_group_handle(). + */ +void iommu_detach_group_handle(struct iommu_domain *domain, + struct iommu_group *group) +{ + mutex_lock(&group->mutex); + __iommu_group_set_core_domain(group); + xa_erase(&group->pasid_array, IOMMU_NO_PASID); + mutex_unlock(&group->mutex); +} +EXPORT_SYMBOL_NS_GPL(iommu_detach_group_handle, IOMMUFD_INTERNAL); + +/** + * iommu_replace_group_handle - replace the domain that a group is attached to + * @group: IOMMU group that will be attached to the new domain + * @new_domain: new IOMMU domain to replace with + * @handle: attach handle + * + * This is a variant of iommu_group_replace_domain(). It allows the caller to + * provide an attach handle for the new domain and use it when the domain is + * attached. 
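+ * The pasid_array slot is reserved before the domain is switched, so
+ * publishing the handle with xa_store() afterwards cannot fail and the
+ * group is never left attached to @new_domain without its handle.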
+ */ +int iommu_replace_group_handle(struct iommu_group *group, + struct iommu_domain *new_domain, + struct iommu_attach_handle *handle) +{ + void *curr; + int ret; + + if (!new_domain) + return -EINVAL; + + mutex_lock(&group->mutex); + if (handle) { + ret = xa_reserve(&group->pasid_array, IOMMU_NO_PASID, GFP_KERNEL); + if (ret) + goto err_unlock; + } + + ret = __iommu_group_set_domain(group, new_domain); + if (ret) + goto err_release; + + curr = xa_store(&group->pasid_array, IOMMU_NO_PASID, handle, GFP_KERNEL); + WARN_ON(xa_is_err(curr)); + + mutex_unlock(&group->mutex); + + return 0; +err_release: + xa_release(&group->pasid_array, IOMMU_NO_PASID); +err_unlock: + mutex_unlock(&group->mutex); + return ret; +} +EXPORT_SYMBOL_NS_GPL(iommu_replace_group_handle, IOMMUFD_INTERNAL); From 567f711a8515f575654d49a845294453c29ae4fe Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Jun 2024 16:55:35 +0800 Subject: [PATCH 209/352] iommu: Add iommu_paging_domain_alloc() interface Commit <17de3f5fdd35> ("iommu: Retire bus ops") removes iommu ops from bus. The iommu subsystem no longer relies on bus for operations. So the bus parameter in iommu_domain_alloc() is no longer relevant. Add a new interface named iommu_paging_domain_alloc(), which explicitly indicates the allocation of a paging domain for DMA managed by a kernel driver. The new interface takes a device pointer as its parameter, that better aligns with the current iommu subsystem. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Vasant Hegde Link: https://lore.kernel.org/r/20240610085555.88197-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon (cherry picked from commit a27bf2743cb80d3b36b5b43e8e2e702412c41668) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommu.c | 20 ++++++++++++++++++++ include/linux/iommu.h | 6 ++++++ 2 files changed, 26 insertions(+) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 5ddb21e30a180..6579bda375652 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2013,6 +2013,10 @@ static int __iommu_domain_alloc_dev(struct device *dev, void *data) return 0; } +/* + * The iommu ops in bus has been retired. Do not use this interface in + * new drivers. + */ struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) { const struct iommu_ops *ops = NULL; @@ -2029,6 +2033,22 @@ struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus) } EXPORT_SYMBOL_GPL(iommu_domain_alloc); +/** + * iommu_paging_domain_alloc() - Allocate a paging domain + * @dev: device for which the domain is allocated + * + * Allocate a paging domain which will be managed by a kernel driver. Return + * allocated domain if successful, or a ERR pointer for failure. 
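+ * Note that, unlike iommu_domain_alloc(), failure is signalled with an
+ * ERR_PTR() rather than NULL, so callers must test the result with
+ * IS_ERR(), as the iommufd and vfio conversions that follow do.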
+ */ +struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) +{ + if (!dev_has_iommu(dev)) + return ERR_PTR(-ENODEV); + + return __iommu_domain_alloc(dev_iommu_ops(dev), dev, IOMMU_DOMAIN_UNMANAGED); +} +EXPORT_SYMBOL_GPL(iommu_paging_domain_alloc); + void iommu_domain_free(struct iommu_domain *domain) { if (domain->type == IOMMU_DOMAIN_SVA) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 5c0c91afc1ab5..010d19279f90c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -770,6 +770,7 @@ extern bool iommu_present(const struct bus_type *bus); extern bool device_iommu_capable(struct device *dev, enum iommu_cap cap); extern bool iommu_group_has_isolated_msi(struct iommu_group *group); extern struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus); +struct iommu_domain *iommu_paging_domain_alloc(struct device *dev); extern void iommu_domain_free(struct iommu_domain *domain); extern int iommu_attach_device(struct iommu_domain *domain, struct device *dev); @@ -1082,6 +1083,11 @@ static inline struct iommu_domain *iommu_domain_alloc(const struct bus_type *bus return NULL; } +static inline struct iommu_domain *iommu_paging_domain_alloc(struct device *dev) +{ + return ERR_PTR(-ENODEV); +} + static inline void iommu_domain_free(struct iommu_domain *domain) { } From 3fe42ae3705d78cb3de3fab8c4d11d234ab01e27 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Jun 2024 16:55:36 +0800 Subject: [PATCH 210/352] iommufd: Use iommu_paging_domain_alloc() If the iommu driver doesn't implement its domain_alloc_user callback, iommufd_hwpt_paging_alloc() rolls back to allocate an iommu paging domain. Replace iommu_domain_alloc() with iommu_paging_domain_alloc() to pass the device pointer along the path. Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240610085555.88197-3-baolu.lu@linux.intel.com Signed-off-by: Will Deacon (cherry picked from commit 26a581606fab44ff76b394f0ba44cd19c6ec0a6e) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/hw_pagetable.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 6d5b2fffeea05..9020da52c10f4 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -140,9 +140,10 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, } hwpt->domain->owner = ops; } else { - hwpt->domain = iommu_domain_alloc(idev->dev->bus); - if (!hwpt->domain) { - rc = -ENOMEM; + hwpt->domain = iommu_paging_domain_alloc(idev->dev); + if (IS_ERR(hwpt->domain)) { + rc = PTR_ERR(hwpt->domain); + hwpt->domain = NULL; goto out_abort; } } From 43c38fa66faf82052afa1051f4307e7e30c83844 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Mon, 10 Jun 2024 16:55:37 +0800 Subject: [PATCH 211/352] vfio/type1: Use iommu_paging_domain_alloc() Replace iommu_domain_alloc() with iommu_paging_domain_alloc(). Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240610085555.88197-4-baolu.lu@linux.intel.com Signed-off-by: Will Deacon (cherry picked from commit 60ffc45017229ee8288ba139ee12c5ebf07c6f6a) Signed-off-by: Matthew R.
Ochs --- drivers/vfio/vfio_iommu_type1.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index b5c15fe8f9fcf..49e1c52aa5a82 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -2135,7 +2135,7 @@ static int vfio_iommu_domain_alloc(struct device *dev, void *data) { struct iommu_domain **domain = data; - *domain = iommu_domain_alloc(dev->bus); + *domain = iommu_paging_domain_alloc(dev); return 1; /* Don't iterate */ } @@ -2192,11 +2192,12 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, * us a representative device for the IOMMU API call. We don't actually * want to iterate beyond the first device (if any). */ - ret = -EIO; iommu_group_for_each_dev(iommu_group, &domain->domain, vfio_iommu_domain_alloc); - if (!domain->domain) + if (IS_ERR(domain->domain)) { + ret = PTR_ERR(domain->domain); goto out_free_domain; + } if (iommu->nesting) { ret = iommu_enable_nesting(domain->domain); From 0c33c9508a91f7bf9b2ee53665e4f448fc2f5ec4 Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Fri, 7 Jun 2024 11:54:15 +0100 Subject: [PATCH 212/352] iommu/of: Support ats-supported device-tree property Device-tree declares whether a PCI root-complex supports ATS by setting the "ats-supported" property. Copy this flag into device fwspec to let IOMMU drivers quickly check if they can enable ATS for a device. Tested-by: Ketan Patil Reviewed-by: Jason Gunthorpe Reviewed-by: Liviu Dudau Reviewed-by: Robin Murphy Reviewed-by: Nicolin Chen Signed-off-by: Jean-Philippe Brucker Link: https://lore.kernel.org/r/20240607105415.2501934-4-jean-philippe@linaro.org Signed-off-by: Will Deacon (cherry picked from commit 86e02a88bedc1072beb5445d408e379674b0b7f3) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/of_iommu.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/iommu/of_iommu.c b/drivers/iommu/of_iommu.c index 3afe0b48a48db..082b94c2b3291 100644 --- a/drivers/iommu/of_iommu.c +++ b/drivers/iommu/of_iommu.c @@ -105,6 +105,14 @@ static int of_iommu_configure_device(struct device_node *master_np, of_iommu_configure_dev(master_np, dev); } +static void of_pci_check_device_ats(struct device *dev, struct device_node *np) +{ + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + + if (fwspec && of_property_read_bool(np, "ats-supported")) + fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; +} + /* * Returns: * 0 on success, an iommu was configured @@ -147,6 +155,7 @@ int of_iommu_configure(struct device *dev, struct device_node *master_np, pci_request_acs(); err = pci_for_each_dma_alias(to_pci_dev(dev), of_pci_iommu_init, &info); + of_pci_check_device_ats(dev, master_np); } else { err = of_iommu_configure_device(master_np, dev, id); } From b30303db314c2221b0453ad2edf552f334ea2929 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:39 +0800 Subject: [PATCH 213/352] iommufd: Add fault and response message definitions iommu_hwpt_pgfaults represent fault messages that the userspace can retrieve. Multiple iommu_hwpt_pgfaults might be put in an iopf group, with the IOMMU_PGFAULT_FLAGS_LAST_PAGE flag set only for the last iommu_hwpt_pgfault. An iommu_hwpt_page_response is a response message that the userspace should send to the kernel after finishing handling a group of fault messages. The @dev_id, @pasid, and @grpid fields in the message identify an outstanding iopf group for a device. 
The @cookie field, which matches the cookie field of the last fault in the group, will be used by the kernel to look up the pending message. Link: https://lore.kernel.org/r/20240702063444.105814-6-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit c714f15860fcca02fe0fd7c3f1f1fc35b1768ac1) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- include/uapi/linux/iommufd.h | 83 ++++++++++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 1dfeaa2e649ee..4d89ed97b533f 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -692,4 +692,87 @@ struct iommu_hwpt_invalidate { __u32 __reserved; }; #define IOMMU_HWPT_INVALIDATE _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_INVALIDATE) + +/** + * enum iommu_hwpt_pgfault_flags - flags for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_FLAGS_PASID_VALID: The pasid field of the fault data is + * valid. + * @IOMMU_PGFAULT_FLAGS_LAST_PAGE: It's the last fault of a fault group. + */ +enum iommu_hwpt_pgfault_flags { + IOMMU_PGFAULT_FLAGS_PASID_VALID = (1 << 0), + IOMMU_PGFAULT_FLAGS_LAST_PAGE = (1 << 1), +}; + +/** + * enum iommu_hwpt_pgfault_perm - perm bits for struct iommu_hwpt_pgfault + * @IOMMU_PGFAULT_PERM_READ: request for read permission + * @IOMMU_PGFAULT_PERM_WRITE: request for write permission + * @IOMMU_PGFAULT_PERM_EXEC: (PCIE 10.4.1) request with a PASID that has the + * Execute Requested bit set in PASID TLP Prefix. + * @IOMMU_PGFAULT_PERM_PRIV: (PCIE 10.4.1) request with a PASID that has the + * Privileged Mode Requested bit set in PASID TLP + * Prefix. + */ +enum iommu_hwpt_pgfault_perm { + IOMMU_PGFAULT_PERM_READ = (1 << 0), + IOMMU_PGFAULT_PERM_WRITE = (1 << 1), + IOMMU_PGFAULT_PERM_EXEC = (1 << 2), + IOMMU_PGFAULT_PERM_PRIV = (1 << 3), +}; + +/** + * struct iommu_hwpt_pgfault - iommu page fault data + * @flags: Combination of enum iommu_hwpt_pgfault_flags + * @dev_id: id of the originated device + * @pasid: Process Address Space ID + * @grpid: Page Request Group Index + * @perm: Combination of enum iommu_hwpt_pgfault_perm + * @addr: Fault address + * @length: a hint of how much data the requestor is expecting to fetch. For + * example, if the PRI initiator knows it is going to do a 10MB + * transfer, it could fill in 10MB and the OS could pre-fault in + * 10MB of IOVA. It's default to 0 if there's no such hint. + * @cookie: kernel-managed cookie identifying a group of fault messages. The + * cookie number encoded in the last page fault of the group should + * be echoed back in the response message. + */ +struct iommu_hwpt_pgfault { + __u32 flags; + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + __u32 length; + __u32 cookie; +}; + +/** + * enum iommufd_page_response_code - Return status of fault handlers + * @IOMMUFD_PAGE_RESP_SUCCESS: Fault has been handled and the page tables + * populated, retry the access. This is the + * "Success" defined in PCI 10.4.2.1. + * @IOMMUFD_PAGE_RESP_INVALID: Could not handle this fault, don't retry the + * access. This is the "Invalid Request" in PCI + * 10.4.2.1. + * @IOMMUFD_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from + * this device if possible. This is the "Response + * Failure" in PCI 10.4.2.1. 
+ */ +enum iommufd_page_response_code { + IOMMUFD_PAGE_RESP_SUCCESS = 0, + IOMMUFD_PAGE_RESP_INVALID, + IOMMUFD_PAGE_RESP_FAILURE, +}; + +/** + * struct iommu_hwpt_page_response - IOMMU page fault response + * @cookie: The kernel-managed cookie reported in the fault message. + * @code: One of response code in enum iommufd_page_response_code. + */ +struct iommu_hwpt_page_response { + __u32 cookie; + __u32 code; +}; #endif From 29d1b7542795fa41df41d3e02c3ff238a8c192f9 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:40 +0800 Subject: [PATCH 214/352] iommufd: Add iommufd fault object An iommufd fault object provides an interface for delivering I/O page faults to user space. These objects are created and destroyed by user space, and they can be associated with or dissociated from hardware page table objects during page table allocation or destruction. User space interacts with the fault object through a file interface. This interface offers a straightforward and efficient way for user space to handle page faults. It allows user space to read fault messages sequentially and respond to them by writing to the same file. The file interface supports reading messages in poll mode, so it's recommended that user space applications use io_uring to enhance read and write efficiency. A fault object can be associated with any iopf-capable iommufd_hw_pgtable during the pgtable's allocation. All I/O page faults triggered by devices when accessing the I/O addresses of an iommufd_hw_pgtable are routed through the fault object to user space. Similarly, user space's responses to these page faults are routed back to the iommu device driver through the same fault object. Link: https://lore.kernel.org/r/20240702063444.105814-7-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit 07838f7fd529c8a6de44b601d4b7057e6c8d36ed) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/io-pgfault.c | 2 + drivers/iommu/iommufd/Makefile | 1 + drivers/iommu/iommufd/fault.c | 226 ++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 30 ++++ drivers/iommu/iommufd/main.c | 6 + include/linux/iommu.h | 4 + include/uapi/linux/iommufd.h | 18 ++ 7 files changed, 287 insertions(+) create mode 100644 drivers/iommu/iommufd/fault.c diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index 7c9011992d3f0..cd679c13752e0 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -110,6 +110,8 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, list_add(&group->pending_node, &iopf_param->faults); mutex_unlock(&iopf_param->lock); + group->fault_count = list_count_nodes(&group->faults); + return group; } diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 34b446146961c..cf4605962bea6 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ device.o \ + fault.o \ hw_pagetable.o \ io_pagetable.o \ ioas.o \ diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c new file mode 100644 index 0000000000000..68ff94671d489 --- /dev/null +++ b/drivers/iommu/iommufd/fault.c @@ -0,0 +1,226 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2024 Intel Corporation + */ +#define pr_fmt(fmt) "iommufd: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../iommu-priv.h" +#include "iommufd_private.h" + +void iommufd_fault_destroy(struct iommufd_object *obj) +{ + struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); + struct iopf_group *group, *next; + + /* + * The iommufd object's reference count is zero at this point. + * We can be confident that no other threads are currently + * accessing this pointer. Therefore, acquiring the mutex here + * is unnecessary. 
+ */ + list_for_each_entry_safe(group, next, &fault->deliver, node) { + list_del(&group->node); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } +} + +static void iommufd_compose_fault_message(struct iommu_fault *fault, + struct iommu_hwpt_pgfault *hwpt_fault, + struct iommufd_device *idev, + u32 cookie) +{ + hwpt_fault->flags = fault->prm.flags; + hwpt_fault->dev_id = idev->obj.id; + hwpt_fault->pasid = fault->prm.pasid; + hwpt_fault->grpid = fault->prm.grpid; + hwpt_fault->perm = fault->prm.perm; + hwpt_fault->addr = fault->prm.addr; + hwpt_fault->length = 0; + hwpt_fault->cookie = cookie; +} + +static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + size_t fault_size = sizeof(struct iommu_hwpt_pgfault); + struct iommufd_fault *fault = filep->private_data; + struct iommu_hwpt_pgfault data; + struct iommufd_device *idev; + struct iopf_group *group; + struct iopf_fault *iopf; + size_t done = 0; + int rc = 0; + + if (*ppos || count % fault_size) + return -ESPIPE; + + mutex_lock(&fault->mutex); + while (!list_empty(&fault->deliver) && count > done) { + group = list_first_entry(&fault->deliver, + struct iopf_group, node); + + if (group->fault_count * fault_size > count - done) + break; + + rc = xa_alloc(&fault->response, &group->cookie, group, + xa_limit_32b, GFP_KERNEL); + if (rc) + break; + + idev = to_iommufd_handle(group->attach_handle)->idev; + list_for_each_entry(iopf, &group->faults, list) { + iommufd_compose_fault_message(&iopf->fault, + &data, idev, + group->cookie); + if (copy_to_user(buf + done, &data, fault_size)) { + xa_erase(&fault->response, group->cookie); + rc = -EFAULT; + break; + } + done += fault_size; + } + + list_del(&group->node); + } + mutex_unlock(&fault->mutex); + + return done == 0 ? rc : done; +} + +static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf, + size_t count, loff_t *ppos) +{ + size_t response_size = sizeof(struct iommu_hwpt_page_response); + struct iommufd_fault *fault = filep->private_data; + struct iommu_hwpt_page_response response; + struct iopf_group *group; + size_t done = 0; + int rc = 0; + + if (*ppos || count % response_size) + return -ESPIPE; + + mutex_lock(&fault->mutex); + while (count > done) { + rc = copy_from_user(&response, buf + done, response_size); + if (rc) + break; + + group = xa_erase(&fault->response, response.cookie); + if (!group) { + rc = -EINVAL; + break; + } + + iopf_group_response(group, response.code); + iopf_free_group(group); + done += response_size; + } + mutex_unlock(&fault->mutex); + + return done == 0 ? 
rc : done; +} + +static __poll_t iommufd_fault_fops_poll(struct file *filep, + struct poll_table_struct *wait) +{ + struct iommufd_fault *fault = filep->private_data; + __poll_t pollflags = EPOLLOUT; + + poll_wait(filep, &fault->wait_queue, wait); + mutex_lock(&fault->mutex); + if (!list_empty(&fault->deliver)) + pollflags |= EPOLLIN | EPOLLRDNORM; + mutex_unlock(&fault->mutex); + + return pollflags; +} + +static int iommufd_fault_fops_release(struct inode *inode, struct file *filep) +{ + struct iommufd_fault *fault = filep->private_data; + + refcount_dec(&fault->obj.users); + iommufd_ctx_put(fault->ictx); + return 0; +} + +static const struct file_operations iommufd_fault_fops = { + .owner = THIS_MODULE, + .open = nonseekable_open, + .read = iommufd_fault_fops_read, + .write = iommufd_fault_fops_write, + .poll = iommufd_fault_fops_poll, + .release = iommufd_fault_fops_release, + .llseek = no_llseek, +}; + +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_fault_alloc *cmd = ucmd->cmd; + struct iommufd_fault *fault; + struct file *filep; + int fdno; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + fault = iommufd_object_alloc(ucmd->ictx, fault, IOMMUFD_OBJ_FAULT); + if (IS_ERR(fault)) + return PTR_ERR(fault); + + fault->ictx = ucmd->ictx; + INIT_LIST_HEAD(&fault->deliver); + xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); + mutex_init(&fault->mutex); + init_waitqueue_head(&fault->wait_queue); + + filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops, + fault, O_RDWR); + if (IS_ERR(filep)) { + rc = PTR_ERR(filep); + goto out_abort; + } + + refcount_inc(&fault->obj.users); + iommufd_ctx_get(fault->ictx); + fault->filep = filep; + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + rc = fdno; + goto out_fput; + } + + cmd->out_fault_id = fault->obj.id; + cmd->out_fault_fd = fdno; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put_fdno; + iommufd_object_finalize(ucmd->ictx, &fault->obj); + + fd_install(fdno, fault->filep); + + return 0; +out_put_fdno: + put_unused_fd(fdno); +out_fput: + fput(filep); + refcount_dec(&fault->obj.users); + iommufd_ctx_put(fault->ictx); +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj); + + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 991f864d1f9bc..c8a4519f14056 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -128,6 +128,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_HWPT_NESTED, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, + IOMMUFD_OBJ_FAULT, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -426,6 +427,35 @@ void iopt_remove_access(struct io_pagetable *iopt, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); +/* + * An iommufd_fault object represents an interface to deliver I/O page faults + * to the user space. These objects are created/destroyed by the user space and + * associated with hardware page table objects during page-table allocation. + */ +struct iommufd_fault { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct file *filep; + + /* The lists of outstanding faults protected by below mutex. */ + struct mutex mutex; + struct list_head deliver; + struct xarray response; + + struct wait_queue_head wait_queue; +}; + +struct iommufd_attach_handle { + struct iommu_attach_handle handle; + struct iommufd_device *idev; +}; + +/* Convert an iommu attach handle to iommufd handle. 
*/ +#define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) + +int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); +void iommufd_fault_destroy(struct iommufd_object *obj); + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 39b32932c61ee..83bbd7c5d1608 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -319,6 +319,7 @@ static int iommufd_option(struct iommufd_ucmd *ucmd) union ucmd_buffer { struct iommu_destroy destroy; + struct iommu_fault_alloc fault; struct iommu_hw_info info; struct iommu_hwpt_alloc hwpt; struct iommu_hwpt_get_dirty_bitmap get_dirty_bitmap; @@ -355,6 +356,8 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), + IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc, + out_fault_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, @@ -513,6 +516,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_hwpt_nested_destroy, .abort = iommufd_hwpt_nested_abort, }, + [IOMMUFD_OBJ_FAULT] = { + .destroy = iommufd_fault_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 010d19279f90c..85ad9d2fa8f6d 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -125,12 +125,16 @@ struct iopf_fault { struct iopf_group { struct iopf_fault last_fault; struct list_head faults; + size_t fault_count; /* list node for iommu_fault_param::faults */ struct list_head pending_node; struct work_struct work; struct iommu_attach_handle *attach_handle; /* The device's fault data parameter. */ struct iommu_fault_param *fault_param; + /* Used by handler provider to hook the group on its own lists. */ + struct list_head node; + u32 cookie; }; /** diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 4d89ed97b533f..70b8a38fcd464 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -50,6 +50,7 @@ enum { IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, IOMMUFD_CMD_HWPT_INVALIDATE, + IOMMUFD_CMD_FAULT_QUEUE_ALLOC, }; /** @@ -775,4 +776,21 @@ struct iommu_hwpt_page_response { __u32 cookie; __u32 code; }; + +/** + * struct iommu_fault_alloc - ioctl(IOMMU_FAULT_QUEUE_ALLOC) + * @size: sizeof(struct iommu_fault_alloc) + * @flags: Must be 0 + * @out_fault_id: The ID of the new FAULT + * @out_fault_fd: The fd of the new FAULT + * + * Explicitly allocate a fault handling object. + */ +struct iommu_fault_alloc { + __u32 size; + __u32 flags; + __u32 out_fault_id; + __u32 out_fault_fd; +}; +#define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC) #endif From 3e288bf2fa15d627b8386d171ae383f4c4be2a97 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:41 +0800 Subject: [PATCH 215/352] iommufd: Fault-capable hwpt attach/detach/replace Add iopf-capable hw page table attach/detach/replace helpers. The pointer to iommufd_device is stored in the domain attachment handle, so that it can be echo'ed back in the iopf_group. 
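As a rough sketch (illustrative only, not part of this patch; the helper
name is made up), recovering that stored pointer from a fault group is a
container_of() walk over the handle installed at attach time:

  /* Hypothetical helper: recover the iommufd_device stashed in the
   * attach handle when the fault-capable hwpt was attached.
   */
  static struct iommufd_device *iopf_group_to_idev(struct iopf_group *group)
  {
  	struct iommufd_attach_handle *handle =
  		to_iommufd_handle(group->attach_handle);

  	return handle->idev;
  }

This is the same lookup the fault read path performs before composing a
fault message for user space.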
The iopf-capable hw page tables can only be attached to devices that support the IOMMU_DEV_FEAT_IOPF feature. On the first attachment of an iopf-capable hw_pagetable to the device, the IOPF feature is enabled on the device. Similarly, after the last iopf-capable hwpt is detached from the device, the IOPF feature is disabled on the device. The current implementation allows a replacement between iopf-capable and non-iopf-capable hw page tables. This matches the nested translation use case, where a parent domain is attached by default and can then be replaced with a nested user domain with iopf support. Link: https://lore.kernel.org/r/20240702063444.105814-8-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit b7d8833677baad8c80ed1aac8c396d687e64a376) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/device.c | 7 +- drivers/iommu/iommufd/fault.c | 190 ++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 41 +++++ 3 files changed, 235 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 873630c111c1f..9a7ec5997c61c 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -215,6 +215,7 @@ struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, refcount_inc(&idev->obj.users); /* igroup refcount moves into iommufd_device */ idev->igroup = igroup; + mutex_init(&idev->iopf_lock); /* * If the caller fails after this success it must call @@ -376,7 +377,7 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, * attachment. */ if (list_empty(&idev->igroup->device_list)) { - rc = iommu_attach_group(hwpt->domain, idev->igroup->group); + rc = iommufd_hwpt_attach_device(hwpt, idev); if (rc) goto err_unresv; idev->igroup->hwpt = hwpt; @@ -402,7 +403,7 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) mutex_lock(&idev->igroup->lock); list_del(&idev->group_item); if (list_empty(&idev->igroup->device_list)) { - iommu_detach_group(hwpt->domain, idev->igroup->group); + iommufd_hwpt_detach_device(hwpt, idev); idev->igroup->hwpt = NULL; } if (hwpt_is_paging(hwpt)) @@ -497,7 +498,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, goto err_unlock; } - rc = iommu_group_replace_domain(igroup->group, hwpt->domain); + rc = iommufd_hwpt_replace_device(idev, hwpt, old_hwpt); if (rc) goto err_unresv; diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 68ff94671d489..4934ae5726383 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -15,6 +16,195 @@ #include "../iommu-priv.h" #include "iommufd_private.h" +static int iommufd_fault_iopf_enable(struct iommufd_device *idev) +{ + struct device *dev = idev->dev; + int ret; + + /* + * Once we turn on PCI/PRI support for VF, the response failure code + * should not be forwarded to the hardware due to PRI being a shared + * resource between PF and VFs. There is no coordination for this + * shared capability. This waits for a vPRI reset to recover. + */ + if (dev_is_pci(dev) && to_pci_dev(dev)->is_virtfn) + return -EINVAL; + + mutex_lock(&idev->iopf_lock); + /* Device iopf has already been on. 
*/ + if (++idev->iopf_enabled > 1) { + mutex_unlock(&idev->iopf_lock); + return 0; + } + + ret = iommu_dev_enable_feature(dev, IOMMU_DEV_FEAT_IOPF); + if (ret) + --idev->iopf_enabled; + mutex_unlock(&idev->iopf_lock); + + return ret; +} + +static void iommufd_fault_iopf_disable(struct iommufd_device *idev) +{ + mutex_lock(&idev->iopf_lock); + if (!WARN_ON(idev->iopf_enabled == 0)) { + if (--idev->iopf_enabled == 0) + iommu_dev_disable_feature(idev->dev, IOMMU_DEV_FEAT_IOPF); + } + mutex_unlock(&idev->iopf_lock); +} + +static int __fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + struct iommufd_attach_handle *handle; + int ret; + + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->idev = idev; + ret = iommu_attach_group_handle(hwpt->domain, idev->igroup->group, + &handle->handle); + if (ret) + kfree(handle); + + return ret; +} + +int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + int ret; + + if (!hwpt->fault) + return -EINVAL; + + ret = iommufd_fault_iopf_enable(idev); + if (ret) + return ret; + + ret = __fault_domain_attach_dev(hwpt, idev); + if (ret) + iommufd_fault_iopf_disable(idev); + + return ret; +} + +static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle) +{ + struct iommufd_fault *fault = hwpt->fault; + struct iopf_group *group, *next; + unsigned long index; + + if (!fault) + return; + + mutex_lock(&fault->mutex); + list_for_each_entry_safe(group, next, &fault->deliver, node) { + if (group->attach_handle != &handle->handle) + continue; + list_del(&group->node); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + + xa_for_each(&fault->response, index, group) { + if (group->attach_handle != &handle->handle) + continue; + xa_erase(&fault->response, index); + iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); + iopf_free_group(group); + } + mutex_unlock(&fault->mutex); +} + +static struct iommufd_attach_handle * +iommufd_device_get_attach_handle(struct iommufd_device *idev) +{ + struct iommu_attach_handle *handle; + + handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); + if (!handle) + return NULL; + + return to_iommufd_handle(handle); +} + +void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + struct iommufd_attach_handle *handle; + + handle = iommufd_device_get_attach_handle(idev); + iommu_detach_group_handle(hwpt->domain, idev->igroup->group); + iommufd_auto_response_faults(hwpt, handle); + iommufd_fault_iopf_disable(idev); + kfree(handle); +} + +static int __fault_domain_replace_dev(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) +{ + struct iommufd_attach_handle *handle, *curr = NULL; + int ret; + + if (old->fault) + curr = iommufd_device_get_attach_handle(idev); + + if (hwpt->fault) { + handle = kzalloc(sizeof(*handle), GFP_KERNEL); + if (!handle) + return -ENOMEM; + + handle->handle.domain = hwpt->domain; + handle->idev = idev; + ret = iommu_replace_group_handle(idev->igroup->group, + hwpt->domain, &handle->handle); + } else { + ret = iommu_replace_group_handle(idev->igroup->group, + hwpt->domain, NULL); + } + + if (!ret && curr) { + iommufd_auto_response_faults(old, curr); + kfree(curr); + } + + return ret; +} + +int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, + struct iommufd_hw_pagetable 
*hwpt, + struct iommufd_hw_pagetable *old) +{ + bool iopf_off = !hwpt->fault && old->fault; + bool iopf_on = hwpt->fault && !old->fault; + int ret; + + if (iopf_on) { + ret = iommufd_fault_iopf_enable(idev); + if (ret) + return ret; + } + + ret = __fault_domain_replace_dev(idev, hwpt, old); + if (ret) { + if (iopf_on) + iommufd_fault_iopf_disable(idev); + return ret; + } + + if (iopf_off) + iommufd_fault_iopf_disable(idev); + + return 0; +} + void iommufd_fault_destroy(struct iommufd_object *obj) { struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index c8a4519f14056..aa4c26c87cb9f 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -11,6 +11,7 @@ #include #include #include +#include "../iommu-priv.h" struct iommu_domain; struct iommu_group; @@ -293,6 +294,7 @@ int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; + struct iommufd_fault *fault; }; struct iommufd_hwpt_paging { @@ -396,6 +398,9 @@ struct iommufd_device { /* always the physical device */ struct device *dev; bool enforce_cache_coherency; + /* protect iopf_enabled counter */ + struct mutex iopf_lock; + unsigned int iopf_enabled; }; static inline struct iommufd_device * @@ -456,6 +461,42 @@ struct iommufd_attach_handle { int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); void iommufd_fault_destroy(struct iommufd_object *obj); +int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); +void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); +int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old); + +static inline int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + if (hwpt->fault) + return iommufd_fault_domain_attach_dev(hwpt, idev); + + return iommu_attach_group(hwpt->domain, idev->igroup->group); +} + +static inline void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) +{ + if (hwpt->fault) + iommufd_fault_domain_detach_dev(hwpt, idev); + + iommu_detach_group(hwpt->domain, idev->igroup->group); +} + +static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) +{ + if (old->fault || hwpt->fault) + return iommufd_fault_domain_replace_dev(idev, hwpt, old); + + return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); +} + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); From 48c69e68541635210083a63a276c34f99b2ffc99 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:42 +0800 Subject: [PATCH 216/352] iommufd: Associate fault object with iommufd_hw_pgtable When allocating a user iommufd_hw_pagetable, the user space is allowed to associate a fault object with the hw_pagetable by specifying the fault object ID in the page table allocation data and setting the IOMMU_HWPT_FAULT_ID_VALID flag bit. On a successful return of hwpt allocation, the user can retrieve and respond to page faults by reading and writing the file interface of the fault object. 
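For illustration, given a fault object allocated as below and then passed
to IOMMU_HWPT_ALLOC via the fault_id field with IOMMU_HWPT_FAULT_ID_VALID
set, servicing a fault reduces to a read()/write() pair on the returned
fd. This is a minimal user space sketch, not part of the patch; error
handling and the usual iommufd/device setup are omitted, and it assumes a
fault is already pending (a real consumer would poll() first):

  #include <linux/iommufd.h>
  #include <sys/ioctl.h>
  #include <unistd.h>

  /* Sketch: allocate a fault object; returns the fault fd and stores
   * the object ID, which is what IOMMU_HWPT_ALLOC's fault_id takes.
   */
  static int alloc_fault_queue(int iommufd, __u32 *fault_id)
  {
  	struct iommu_fault_alloc cmd = { .size = sizeof(cmd) };

  	if (ioctl(iommufd, IOMMU_FAULT_QUEUE_ALLOC, &cmd))
  		return -1;
  	*fault_id = cmd.out_fault_id;
  	return cmd.out_fault_fd;
  }

  /* Sketch: pull one fault off the fault fd and answer it. read()
   * only completes whole iommu_hwpt_pgfault records.
   */
  static int service_one_fault(int fault_fd)
  {
  	struct iommu_hwpt_pgfault fault;
  	struct iommu_hwpt_page_response resp = {
  		.code = IOMMUFD_PAGE_RESP_SUCCESS,
  	};

  	if (read(fault_fd, &fault, sizeof(fault)) != sizeof(fault))
  		return -1;

  	/* ... fix up the mapping for fault.addr here ... */

  	/* echo the cookie back to retire the request */
  	resp.cookie = fault.cookie;
  	if (write(fault_fd, &resp, sizeof(resp)) != sizeof(resp))
  		return -1;
  	return 0;
  }

The selftests added later in this series exercise exactly this flow
against the mock device.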
Once a fault object has been associated with a hwpt, the hwpt is iopf-capable, indicated by hwpt->fault is non NULL. Attaching, detaching, or replacing an iopf-capable hwpt to an RID or PASID will differ from those that are not iopf-capable. Link: https://lore.kernel.org/r/20240702063444.105814-9-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit 34765cbc679c59ea5d952d738d2d16bf4aadc497) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/fault.c | 17 +++++++++++ drivers/iommu/iommufd/hw_pagetable.c | 38 +++++++++++++++++++------ drivers/iommu/iommufd/iommufd_private.h | 9 ++++++ include/uapi/linux/iommufd.h | 8 ++++++ 4 files changed, 64 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 4934ae5726383..54d6cd20a6730 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -414,3 +414,20 @@ int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) return rc; } + +int iommufd_fault_iopf_handler(struct iopf_group *group) +{ + struct iommufd_hw_pagetable *hwpt; + struct iommufd_fault *fault; + + hwpt = group->attach_handle->domain->fault_data; + fault = hwpt->fault; + + mutex_lock(&fault->mutex); + list_add_tail(&group->node, &fault->deliver); + mutex_unlock(&fault->mutex); + + wake_up_interruptible(&fault->wait_queue); + + return 0; +} diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 9020da52c10f4..5ea1e6e79dff8 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -8,6 +8,15 @@ #include "../iommu-priv.h" #include "iommufd_private.h" +static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt) +{ + if (hwpt->domain) + iommu_domain_free(hwpt->domain); + + if (hwpt->fault) + refcount_dec(&hwpt->fault->obj.users); +} + void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) { struct iommufd_hwpt_paging *hwpt_paging = @@ -22,9 +31,7 @@ void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) hwpt_paging->common.domain); } - if (hwpt_paging->common.domain) - iommu_domain_free(hwpt_paging->common.domain); - + __iommufd_hwpt_destroy(&hwpt_paging->common); refcount_dec(&hwpt_paging->ioas->obj.users); } @@ -49,9 +56,7 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) struct iommufd_hwpt_nested *hwpt_nested = container_of(obj, struct iommufd_hwpt_nested, common.obj); - if (hwpt_nested->common.domain) - iommu_domain_free(hwpt_nested->common.domain); - + __iommufd_hwpt_destroy(&hwpt_nested->common); refcount_dec(&hwpt_nested->parent->common.obj.users); } @@ -217,7 +222,8 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, struct iommufd_hw_pagetable *hwpt; int rc; - if (flags || !user_data->len || !ops->domain_alloc_user) + if ((flags & ~IOMMU_HWPT_FAULT_ID_VALID) || + !user_data->len || !ops->domain_alloc_user) return ERR_PTR(-EOPNOTSUPP); if (parent->auto_domain || !parent->nest_parent) return ERR_PTR(-EINVAL); @@ -231,7 +237,8 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, refcount_inc(&parent->common.obj.users); hwpt_nested->parent = parent; - hwpt->domain = ops->domain_alloc_user(idev->dev, flags, + hwpt->domain = ops->domain_alloc_user(idev->dev, + flags & ~IOMMU_HWPT_FAULT_ID_VALID, parent->common.domain, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); @@ -312,6 +319,21 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd 
*ucmd) goto out_put_pt; } + if (cmd->flags & IOMMU_HWPT_FAULT_ID_VALID) { + struct iommufd_fault *fault; + + fault = iommufd_get_fault(ucmd, cmd->fault_id); + if (IS_ERR(fault)) { + rc = PTR_ERR(fault); + goto out_hwpt; + } + hwpt->fault = fault; + hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; + hwpt->domain->fault_data = hwpt; + refcount_inc(&fault->obj.users); + iommufd_put_object(ucmd->ictx, &fault->obj); + } + cmd->out_hwpt_id = hwpt->obj.id; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index aa4c26c87cb9f..92efe30a8f0d0 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -458,8 +458,17 @@ struct iommufd_attach_handle { /* Convert an iommu attach handle to iommufd handle. */ #define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) +static inline struct iommufd_fault * +iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_FAULT), + struct iommufd_fault, obj); +} + int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); void iommufd_fault_destroy(struct iommufd_object *obj); +int iommufd_fault_iopf_handler(struct iopf_group *group); int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev); diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 70b8a38fcd464..ede2b464a7619 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -357,10 +357,13 @@ struct iommu_vfio_ioas { * the parent HWPT in a nesting configuration. * @IOMMU_HWPT_ALLOC_DIRTY_TRACKING: Dirty tracking support for device IOMMU is * enforced on device attachment + * @IOMMU_HWPT_FAULT_ID_VALID: The fault_id field of hwpt allocation data is + * valid. */ enum iommufd_hwpt_alloc_flags { IOMMU_HWPT_ALLOC_NEST_PARENT = 1 << 0, IOMMU_HWPT_ALLOC_DIRTY_TRACKING = 1 << 1, + IOMMU_HWPT_FAULT_ID_VALID = 1 << 2, }; /** @@ -412,6 +415,9 @@ enum iommu_hwpt_data_type { * @data_type: One of enum iommu_hwpt_data_type * @data_len: Length of the type specific data * @data_uptr: User pointer to the type specific data + * @fault_id: The ID of IOMMUFD_FAULT object. Valid only if flags field of + * IOMMU_HWPT_FAULT_ID_VALID is set. + * @__reserved2: Padding to 64-bit alignment. Must be 0. * * Explicitly allocate a hardware page table object. This is the same object * type that is returned by iommufd_device_attach() and represents the @@ -442,6 +448,8 @@ struct iommu_hwpt_alloc { __u32 data_type; __u32 data_len; __aligned_u64 data_uptr; + __u32 fault_id; + __u32 __reserved2; }; #define IOMMU_HWPT_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_HWPT_ALLOC) From e2a6082d1735e61f5430f5fd18b94a288775b0ba Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:43 +0800 Subject: [PATCH 217/352] iommufd/selftest: Add IOPF support for mock device Extend the selftest mock device to support generating and responding to an IOPF. Also add an ioctl interface to userspace applications to trigger the IOPF on the mock device. This would allow userspace applications to test the IOMMUFD's handling of IOPFs without having to rely on any real hardware. Link: https://lore.kernel.org/r/20240702063444.105814-10-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit ddee19971081b42615d62f4fdada21274708ed4d) Signed-off-by: Matthew R. 
Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/iommufd_test.h | 8 ++++ drivers/iommu/iommufd/selftest.c | 64 ++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index e854d3f672051..acbbba1c66716 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -22,6 +22,7 @@ enum { IOMMU_TEST_OP_MOCK_DOMAIN_FLAGS, IOMMU_TEST_OP_DIRTY, IOMMU_TEST_OP_MD_CHECK_IOTLB, + IOMMU_TEST_OP_TRIGGER_IOPF, }; enum { @@ -127,6 +128,13 @@ struct iommu_test_cmd { __u32 id; __u32 iotlb; } check_iotlb; + struct { + __u32 dev_id; + __u32 pasid; + __u32 grpid; + __u32 perm; + __u64 addr; + } trigger_iopf; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 7a70a3e0fee6a..f95e32e291333 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -504,6 +504,8 @@ static bool mock_domain_capable(struct device *dev, enum iommu_cap cap) return false; } +static struct iopf_queue *mock_iommu_iopf_queue; + static struct iommu_device mock_iommu_device = { }; @@ -514,6 +516,29 @@ static struct iommu_device *mock_probe_device(struct device *dev) return &mock_iommu_device; } +static void mock_domain_page_response(struct device *dev, struct iopf_fault *evt, + struct iommu_page_response *msg) +{ +} + +static int mock_dev_enable_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat != IOMMU_DEV_FEAT_IOPF || !mock_iommu_iopf_queue) + return -ENODEV; + + return iopf_queue_add_device(mock_iommu_iopf_queue, dev); +} + +static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features feat) +{ + if (feat != IOMMU_DEV_FEAT_IOPF || !mock_iommu_iopf_queue) + return -ENODEV; + + iopf_queue_remove_device(mock_iommu_iopf_queue, dev); + + return 0; +} + static const struct iommu_ops mock_ops = { /* * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type() @@ -529,6 +554,10 @@ static const struct iommu_ops mock_ops = { .capable = mock_domain_capable, .device_group = generic_device_group, .probe_device = mock_probe_device, + .page_response = mock_domain_page_response, + .dev_enable_feat = mock_dev_enable_feat, + .dev_disable_feat = mock_dev_disable_feat, + .user_pasid_table = true, .default_domain_ops = &(struct iommu_domain_ops){ .free = mock_domain_free, @@ -1375,6 +1404,31 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, return rc; } +static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd, + struct iommu_test_cmd *cmd) +{ + struct iopf_fault event = { }; + struct iommufd_device *idev; + + idev = iommufd_get_device(ucmd, cmd->trigger_iopf.dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + event.fault.prm.flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE; + if (cmd->trigger_iopf.pasid != IOMMU_NO_PASID) + event.fault.prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + event.fault.type = IOMMU_FAULT_PAGE_REQ; + event.fault.prm.addr = cmd->trigger_iopf.addr; + event.fault.prm.pasid = cmd->trigger_iopf.pasid; + event.fault.prm.grpid = cmd->trigger_iopf.grpid; + event.fault.prm.perm = cmd->trigger_iopf.perm; + + iommu_report_device_fault(idev->dev, &event); + iommufd_put_object(ucmd->ictx, &idev->obj); + + return 0; +} + void iommufd_selftest_destroy(struct iommufd_object *obj) { struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj); @@ -1450,6 +1504,8 @@ int iommufd_test(struct 
iommufd_ucmd *ucmd) cmd->dirty.page_size, u64_to_user_ptr(cmd->dirty.uptr), cmd->dirty.flags); + case IOMMU_TEST_OP_TRIGGER_IOPF: + return iommufd_test_trigger_iopf(ucmd, cmd); default: return -EOPNOTSUPP; } @@ -1491,6 +1547,9 @@ int __init iommufd_test_init(void) &iommufd_mock_bus_type.nb); if (rc) goto err_sysfs; + + mock_iommu_iopf_queue = iopf_queue_alloc("mock-iopfq"); + return 0; err_sysfs: @@ -1506,6 +1565,11 @@ int __init iommufd_test_init(void) void iommufd_test_exit(void) { + if (mock_iommu_iopf_queue) { + iopf_queue_free(mock_iommu_iopf_queue); + mock_iommu_iopf_queue = NULL; + } + iommu_device_sysfs_remove(&mock_iommu_device); iommu_device_unregister_bus(&mock_iommu_device, &iommufd_mock_bus_type.bus, From 6ca31cc9b0905ddef16a8a5a746a109121ef07bc Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Tue, 2 Jul 2024 14:34:44 +0800 Subject: [PATCH 218/352] iommufd/selftest: Add coverage for IOPF test Extend the selftest tool to add coverage of testing IOPF handling. This would include the following tests: - Allocating and destroying an iommufd fault object. - Allocating and destroying an IOPF-capable HWPT. - Attaching/detaching/replacing an IOPF-capable HWPT on a device. - Triggering an IOPF on the mock device. - Retrieving and responding to the IOPF through the file interface. Link: https://lore.kernel.org/r/20240702063444.105814-11-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit d1211768b62d02e27b46a3ff78f739c4776a0f03) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- tools/testing/selftests/iommu/iommufd.c | 22 +++++ .../selftests/iommu/iommufd_fail_nth.c | 2 +- tools/testing/selftests/iommu/iommufd_utils.h | 86 +++++++++++++++++-- 3 files changed, 104 insertions(+), 6 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 5f7d5a5ba89b0..6343f4053bd46 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -279,6 +279,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) uint32_t parent_hwpt_id = 0; uint32_t parent_hwpt_id_not_work = 0; uint32_t test_hwpt_id = 0; + uint32_t iopf_hwpt_id; + uint32_t fault_id; + uint32_t fault_fd; if (self->device_id) { /* Negative tests */ @@ -326,6 +329,7 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) sizeof(data)); /* Allocate two nested hwpts sharing one common parent hwpt */ + test_ioctl_fault_alloc(&fault_id, &fault_fd); test_cmd_hwpt_alloc_nested(self->device_id, parent_hwpt_id, 0, &nested_hwpt_id[0], IOMMU_HWPT_DATA_SELFTEST, &data, @@ -334,6 +338,14 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) &nested_hwpt_id[1], IOMMU_HWPT_DATA_SELFTEST, &data, sizeof(data)); + test_err_hwpt_alloc_iopf(ENOENT, self->device_id, parent_hwpt_id, + UINT32_MAX, IOMMU_HWPT_FAULT_ID_VALID, + &iopf_hwpt_id, IOMMU_HWPT_DATA_SELFTEST, + &data, sizeof(data)); + test_cmd_hwpt_alloc_iopf(self->device_id, parent_hwpt_id, fault_id, + IOMMU_HWPT_FAULT_ID_VALID, &iopf_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[0], IOMMU_TEST_IOTLB_DEFAULT); test_cmd_hwpt_check_iotlb_all(nested_hwpt_id[1], @@ -504,14 +516,24 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) _test_ioctl_destroy(self->fd, nested_hwpt_id[1])); test_ioctl_destroy(nested_hwpt_id[0]); + /* Switch from nested_hwpt_id[1] to iopf_hwpt_id */ + test_cmd_mock_domain_replace(self->stdev_id, iopf_hwpt_id); + EXPECT_ERRNO(EBUSY, + 
_test_ioctl_destroy(self->fd, iopf_hwpt_id)); + /* Trigger an IOPF on the device */ + test_cmd_trigger_iopf(self->device_id, fault_fd); + /* Detach from nested_hwpt_id[1] and destroy it */ test_cmd_mock_domain_replace(self->stdev_id, parent_hwpt_id); test_ioctl_destroy(nested_hwpt_id[1]); + test_ioctl_destroy(iopf_hwpt_id); /* Detach from the parent hw_pagetable and destroy it */ test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); test_ioctl_destroy(parent_hwpt_id); test_ioctl_destroy(parent_hwpt_id_not_work); + close(fault_fd); + test_ioctl_destroy(fault_id); } else { test_err_hwpt_alloc(ENOENT, self->device_id, self->ioas_id, 0, &parent_hwpt_id); diff --git a/tools/testing/selftests/iommu/iommufd_fail_nth.c b/tools/testing/selftests/iommu/iommufd_fail_nth.c index f590417cd67a9..c5d5e69452b01 100644 --- a/tools/testing/selftests/iommu/iommufd_fail_nth.c +++ b/tools/testing/selftests/iommu/iommufd_fail_nth.c @@ -615,7 +615,7 @@ TEST_FAIL_NTH(basic_fail_nth, device) if (_test_cmd_get_hw_info(self->fd, idev_id, &info, sizeof(info), NULL)) return -1; - if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, &hwpt_id, + if (_test_cmd_hwpt_alloc(self->fd, idev_id, ioas_id, 0, 0, &hwpt_id, IOMMU_HWPT_DATA_NONE, 0, 0)) return -1; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index c612fbf0195ba..40f6f14ce136f 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -155,7 +155,7 @@ static int _test_cmd_mock_domain_replace(int fd, __u32 stdev_id, __u32 pt_id, EXPECT_ERRNO(_errno, _test_cmd_mock_domain_replace(self->fd, stdev_id, \ pt_id, NULL)) -static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, +static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_id, __u32 flags, __u32 *hwpt_id, __u32 data_type, void *data, size_t data_len) { @@ -167,6 +167,7 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, .data_type = data_type, .data_len = data_len, .data_uptr = (uint64_t)data, + .fault_id = ft_id, }; int ret; @@ -179,24 +180,36 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, } #define test_cmd_hwpt_alloc(device_id, pt_id, flags, hwpt_id) \ - ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \ 0)) #define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \ - self->fd, device_id, pt_id, flags, \ + self->fd, device_id, pt_id, 0, flags, \ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, 0)) #define test_cmd_hwpt_alloc_nested(device_id, pt_id, flags, hwpt_id, \ data_type, data, data_len) \ - ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, data_type, data, data_len)) #define test_err_hwpt_alloc_nested(_errno, device_id, pt_id, flags, hwpt_id, \ data_type, data, data_len) \ EXPECT_ERRNO(_errno, \ - _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, flags, \ + _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, data_type, data, data_len)) +#define test_cmd_hwpt_alloc_iopf(device_id, pt_id, fault_id, flags, hwpt_id, \ + data_type, data, data_len) \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, fault_id, \ + flags, hwpt_id, data_type, data, \ + data_len)) +#define 
test_err_hwpt_alloc_iopf(_errno, device_id, pt_id, fault_id, flags, \ + hwpt_id, data_type, data, data_len) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, fault_id, \ + flags, hwpt_id, data_type, data, \ + data_len)) + #define test_cmd_hwpt_check_iotlb(hwpt_id, iotlb_id, expected) \ ({ \ struct iommu_test_cmd test_cmd = { \ @@ -686,3 +699,66 @@ static int _test_cmd_get_hw_info(int fd, __u32 device_id, void *data, #define test_cmd_get_hw_capabilities(device_id, caps, mask) \ ASSERT_EQ(0, _test_cmd_get_hw_info(self->fd, device_id, NULL, 0, &caps)) + +static int _test_ioctl_fault_alloc(int fd, __u32 *fault_id, __u32 *fault_fd) +{ + struct iommu_fault_alloc cmd = { + .size = sizeof(cmd), + }; + int ret; + + ret = ioctl(fd, IOMMU_FAULT_QUEUE_ALLOC, &cmd); + if (ret) + return ret; + *fault_id = cmd.out_fault_id; + *fault_fd = cmd.out_fault_fd; + return 0; +} + +#define test_ioctl_fault_alloc(fault_id, fault_fd) \ + ({ \ + ASSERT_EQ(0, _test_ioctl_fault_alloc(self->fd, fault_id, \ + fault_fd)); \ + ASSERT_NE(0, *(fault_id)); \ + ASSERT_NE(0, *(fault_fd)); \ + }) + +static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd) +{ + struct iommu_test_cmd trigger_iopf_cmd = { + .size = sizeof(trigger_iopf_cmd), + .op = IOMMU_TEST_OP_TRIGGER_IOPF, + .trigger_iopf = { + .dev_id = device_id, + .pasid = 0x1, + .grpid = 0x2, + .perm = IOMMU_PGFAULT_PERM_READ | IOMMU_PGFAULT_PERM_WRITE, + .addr = 0xdeadbeaf, + }, + }; + struct iommu_hwpt_page_response response = { + .code = IOMMUFD_PAGE_RESP_SUCCESS, + }; + struct iommu_hwpt_pgfault fault = {}; + ssize_t bytes; + int ret; + + ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_IOPF), &trigger_iopf_cmd); + if (ret) + return ret; + + bytes = read(fault_fd, &fault, sizeof(fault)); + if (bytes <= 0) + return -EIO; + + response.cookie = fault.cookie; + + bytes = write(fault_fd, &response, sizeof(response)); + if (bytes <= 0) + return -EIO; + + return 0; +} + +#define test_cmd_trigger_iopf(device_id, fault_fd) \ + ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, fault_fd)) From e47734610df760ebd5d42e3bba1f5a777f2e3ab1 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 28 Jun 2024 13:11:11 -0300 Subject: [PATCH 219/352] iommufd: Require drivers to supply the cache_invalidate_user ops If drivers don't do this then iommufd will oops invalidation ioctls with something like: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Mem abort info: ESR = 0x0000000086000004 EC = 0x21: IABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault user pgtable: 4k pages, 48-bit VAs, pgdp=0000000101059000 [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 0000000086000004 [#1] PREEMPT SMP Modules linked in: CPU: 2 PID: 371 Comm: qemu-system-aar Not tainted 6.8.0-rc7-gde77230ac23a #9 Hardware name: linux,dummy-virt (DT) pstate: 81400809 (Nzcv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=-c) pc : 0x0 lr : iommufd_hwpt_invalidate+0xa4/0x204 sp : ffff800080f3bcc0 x29: ffff800080f3bcf0 x28: ffff0000c369b300 x27: 0000000000000000 x26: 0000000000000000 x25: 0000000000000000 x24: 0000000000000000 x23: 0000000000000000 x22: 00000000c1e334a0 x21: ffff0000c1e334a0 x20: ffff800080f3bd38 x19: ffff800080f3bd58 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000ffff8240d6d8 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : 
0000001000000002 x7 : 0000fffeac1ec950 x6 : 0000000000000000 x5 : ffff800080f3bd78 x4 : 0000000000000003 x3 : 0000000000000002 x2 : 0000000000000000 x1 : ffff800080f3bcc8 x0 : ffff0000c6034d80 Call trace: 0x0 iommufd_fops_ioctl+0x154/0x274 __arm64_sys_ioctl+0xac/0xf0 invoke_syscall+0x48/0x110 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x34/0xb4 el0t_64_sync_handler+0x120/0x12c el0t_64_sync+0x190/0x194 All existing drivers implement this op for nesting, this is mostly a bisection aid. Fixes: 8c6eabae3807 ("iommufd: Add IOMMU_HWPT_INVALIDATE") Link: https://lore.kernel.org/r/0-v1-e153859bd707+61-iommufd_check_ops_jgg@nvidia.com Reviewed-by: Nicolin Chen Reviewed-by: Yi Liu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit a11dda723c6493bb1853bbc61c093377f96e2d47) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/hw_pagetable.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 5ea1e6e79dff8..aefde4443671e 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -247,7 +247,8 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, } hwpt->domain->owner = ops; - if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED)) { + if (WARN_ON_ONCE(hwpt->domain->type != IOMMU_DOMAIN_NESTED || + !hwpt->domain->ops->cache_invalidate_user)) { rc = -EINVAL; goto out_abort; } From ad7849ef79fc0b1d3b795e0d193bbb35dd4fdb84 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 9 Jul 2024 17:41:50 -0700 Subject: [PATCH 220/352] vfio/pci: Init the count variable in collecting hot-reset devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The count variable is used without initialization, it results in mistakes in the device counting and crashes the userspace if the get hot reset info path is triggered. Fixes: f6944d4a0b87 ("vfio/pci: Collect hot-reset devices to local buffer") Link: https://bugzilla.kernel.org/show_bug.cgi?id=219010 Reported-by: Žilvinas Žaltiena Cc: Beld Zhang Signed-off-by: Yi Liu Reviewed-by: Kevin Tian Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240710004150.319105-1-yi.l.liu@intel.com Signed-off-by: Alex Williamson (cherry picked from commit 5a88a3f67e37e39f933b38ebb4985ba5822e9eca) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/vfio/pci/vfio_pci_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index 80cae87fff36e..7741ac9825278 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -1260,7 +1260,7 @@ static int vfio_pci_ioctl_get_pci_hot_reset_info( struct vfio_pci_hot_reset_info hdr; struct vfio_pci_fill_info fill = {}; bool slot = false; - int ret, count; + int ret, count = 0; if (copy_from_user(&hdr, arg, minsz)) return -EFAULT; From 87c8a25e2069f93d2bff74920af36286932fd471 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 10 Jul 2024 16:33:39 +0800 Subject: [PATCH 221/352] iommufd: Remove IOMMUFD_PAGE_RESP_FAILURE The response code of IOMMUFD_PAGE_RESP_FAILURE was defined to be equivalent to the "Response Failure" in PCI spec, section 10.4.2.1. 
This response code indicates that one or more pages within the associated request group have encountered or caused an unrecoverable error. Therefore, this response disables the PRI at the function. Modern I/O virtualization technologies, like SR-IOV, share PRI among the assignable device units. Therefore, a response failure on one unit might cause I/O failure on other units. Remove this response code so that user space can only respond with SUCCESS or INVALID. The VMM is recommended to emulate a failure response as a PRI reset, or PRI disable and changing to a non-PRI domain. Fixes: c714f15860fc ("iommufd: Add fault and response message definitions") Link: https://lore.kernel.org/r/20240710083341.44617-2-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit 861f96a785149a0062cce6578e0fa7cb95435a7e) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- include/uapi/linux/iommufd.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index ede2b464a7619..e31385b75d0be 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -765,14 +765,10 @@ struct iommu_hwpt_pgfault { * @IOMMUFD_PAGE_RESP_INVALID: Could not handle this fault, don't retry the * access. This is the "Invalid Request" in PCI * 10.4.2.1. - * @IOMMUFD_PAGE_RESP_FAILURE: General error. Drop all subsequent faults from - * this device if possible. This is the "Response - * Failure" in PCI 10.4.2.1. */ enum iommufd_page_response_code { IOMMUFD_PAGE_RESP_SUCCESS = 0, IOMMUFD_PAGE_RESP_INVALID, - IOMMUFD_PAGE_RESP_FAILURE, }; /** From 285c412767a819029acac7b2e70bc527b1b64e85 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Wed, 10 Jul 2024 16:33:40 +0800 Subject: [PATCH 222/352] iommufd: Add check on user response code The response code from user space is only allowed to be SUCCESS or INVALID. All other values are treated by the device as a response code of Response Failure according to PCI spec, section 10.4.2.1. This response disables the Page Request Interface for the Function. Add a check in iommufd_fault_fops_write() to avoid invalid response code. Fixes: 07838f7fd529 ("iommufd: Add iommufd fault object") Link: https://lore.kernel.org/r/20240710083341.44617-3-baolu.lu@linux.intel.com Signed-off-by: Lu Baolu Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit d73cf5ff743b5a8de6fa20651baba5bd56ba98a3) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
--- drivers/iommu/iommufd/fault.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 54d6cd20a6730..9c142cefa2d2c 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -305,6 +305,16 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b if (rc) break; + static_assert((int)IOMMUFD_PAGE_RESP_SUCCESS == + (int)IOMMU_PAGE_RESP_SUCCESS); + static_assert((int)IOMMUFD_PAGE_RESP_INVALID == + (int)IOMMU_PAGE_RESP_INVALID); + if (response.code != IOMMUFD_PAGE_RESP_SUCCESS && + response.code != IOMMUFD_PAGE_RESP_INVALID) { + rc = -EINVAL; + break; + } + group = xa_erase(&fault->response, response.cookie); if (!group) { rc = -EINVAL; From 1cd1c2e28f030c1f3dc195f6f1ebad77b064403c Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Fri, 12 Jul 2024 10:58:19 +0800 Subject: [PATCH 223/352] iommufd: Fix error pointer checking The Smatch static checker reported the warning below: drivers/iommu/iommufd/fault.c:131 iommufd_device_get_attach_handle() warn: 'handle' is an error pointer or valid Fix it by checking 'handle' with IS_ERR(). Fixes: b7d8833677ba ("iommufd: Fault-capable hwpt attach/detach/replace") Link: https://lore.kernel.org/r/20240712025819.63147-1-baolu.lu@linux.intel.com Reported-by: Dan Carpenter Closes: https://lore.kernel.org/linux-iommu/8bb4f37a-4514-4dea-aabb-7380be303895@stanley.mountain/ Signed-off-by: Lu Baolu Signed-off-by: Jason Gunthorpe (cherry picked from commit 595572aae3d0c3bf295ea759b74b948e7493a9ff) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index 9c142cefa2d2c..a643d5c7c535f 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -128,7 +128,7 @@ iommufd_device_get_attach_handle(struct iommufd_device *idev) struct iommu_attach_handle *handle; handle = iommu_attach_handle_get(idev->igroup->group, IOMMU_NO_PASID, 0); - if (!handle) + if (IS_ERR(handle)) return NULL; return to_iommufd_handle(handle); From e5418b20afc2a47462136cdf4719ef9276b62fee Mon Sep 17 00:00:00 2001 From: Shameer Kolothum Date: Fri, 12 Jul 2024 12:31:32 +0100 Subject: [PATCH 224/352] iommu: Move IOMMU_DIRTY_NO_CLEAR define Fix the compile issue when CONFIG_IOMMU_API is not set. Fixes: 4fe88fd8b4ae ("iommu/io-pgtable-arm: Add read_and_clear_dirty() support") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202407121602.HL9ih1it-lkp@intel.com/ Signed-off-by: Shameer Kolothum Reviewed-by: Joao Martins Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20240712113132.45100-1-shameerali.kolothum.thodi@huawei.com Signed-off-by: Will Deacon (cherry picked from commit 9b2bc6b9a264b863a2273c02db5ee9e214e0a526) Signed-off-by: Matthew R. Ochs
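The rule behind the small move below, restated with hypothetical names: a constant that is needed on both sides of a config guard has to live outside the #ifdef, or the config-off build stops compiling.

    /* Visible in every configuration. */
    #define DIRTY_NO_CLEAR (1 << 0)

    #ifdef CONFIG_FEATURE
    int read_and_clear_dirty(unsigned int flags);
    #else
    static inline int read_and_clear_dirty(unsigned int flags)
    {
            return -1;      /* feature compiled out */
    }
    #endif

    /* This caller builds in both configurations only because the
     * define sits outside the guard. */
    static inline int peek_dirty(void)
    {
            return read_and_clear_dirty(DIRTY_NO_CLEAR);
    }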
--- include/linux/iommu.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 85ad9d2fa8f6d..b1c94e691afd2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -322,6 +322,9 @@ enum iommu_dev_features { #define IOMMU_PASID_INVALID (-1U) typedef unsigned int ioasid_t; +/* Read but do not clear any dirty bits */ +#define IOMMU_DIRTY_NO_CLEAR (1 << 0) + #ifdef CONFIG_IOMMU_API /** @@ -358,9 +361,6 @@ struct iommu_dirty_bitmap { struct iommu_iotlb_gather *gather; }; -/* Read but do not clear any dirty bits */ -#define IOMMU_DIRTY_NO_CLEAR (1 << 0) - /** * struct iommu_dirty_ops - domain specific dirty tracking operations * @set_dirty_tracking: Enable or Disable dirty tracking on the iommu domain From 4b9a02c0ddfe73a518ad973bfce5e336610a485e Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 11 Jul 2024 21:11:03 -0300 Subject: [PATCH 225/352] iommufd: Put constants for all the uAPI enums Relying on position in the enum makes it subtly harder when doing merge resolutions or backporting, as it is easy to grab a patch and not notice that it is a uAPI change with a differently ordered enum. This may become a bigger problem in coming cycles when iommu_hwpt_invalidate_data_type and other per-driver enums have patches flowing through different trees. So let's start including constants for all the uAPI enums to make this safer. No functional change. Link: https://lore.kernel.org/r/0-v1-2c06ec044924+133-iommufd_uapi_const_jgg@nvidia.com Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Tested-by: Yi Liu Signed-off-by: Jason Gunthorpe (cherry picked from commit 136a8066676e593cd29627219467fc222c8f3b04) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs
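The convention carries over to any uAPI enum; an illustrative sketch (example values only): once every constant is spelled out, a reordered or partially applied backport changes visible source text instead of silently renumbering the ABI.

    /* Every value is pinned; inserting a new entry cannot shift the rest. */
    enum example_cmd {
            EXAMPLE_CMD_DESTROY = 0x80,
            EXAMPLE_CMD_ALLOC   = 0x81,
            EXAMPLE_CMD_MAP     = 0x85,     /* gaps stay stable, too */
    };

    /* Optional belt-and-braces guard against accidental edits. */
    _Static_assert(EXAMPLE_CMD_ALLOC == 0x81, "uAPI value must not change");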
--- include/uapi/linux/iommufd.h | 40 ++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index e31385b75d0be..4dde745cfb7e2 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -37,20 +37,20 @@ enum { IOMMUFD_CMD_BASE = 0x80, IOMMUFD_CMD_DESTROY = IOMMUFD_CMD_BASE, - IOMMUFD_CMD_IOAS_ALLOC, - IOMMUFD_CMD_IOAS_ALLOW_IOVAS, - IOMMUFD_CMD_IOAS_COPY, - IOMMUFD_CMD_IOAS_IOVA_RANGES, - IOMMUFD_CMD_IOAS_MAP, - IOMMUFD_CMD_IOAS_UNMAP, - IOMMUFD_CMD_OPTION, - IOMMUFD_CMD_VFIO_IOAS, - IOMMUFD_CMD_HWPT_ALLOC, - IOMMUFD_CMD_GET_HW_INFO, - IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING, - IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP, - IOMMUFD_CMD_HWPT_INVALIDATE, - IOMMUFD_CMD_FAULT_QUEUE_ALLOC, + IOMMUFD_CMD_IOAS_ALLOC = 0x81, + IOMMUFD_CMD_IOAS_ALLOW_IOVAS = 0x82, + IOMMUFD_CMD_IOAS_COPY = 0x83, + IOMMUFD_CMD_IOAS_IOVA_RANGES = 0x84, + IOMMUFD_CMD_IOAS_MAP = 0x85, + IOMMUFD_CMD_IOAS_UNMAP = 0x86, + IOMMUFD_CMD_OPTION = 0x87, + IOMMUFD_CMD_VFIO_IOAS = 0x88, + IOMMUFD_CMD_HWPT_ALLOC = 0x89, + IOMMUFD_CMD_GET_HW_INFO = 0x8a, + IOMMUFD_CMD_HWPT_SET_DIRTY_TRACKING = 0x8b, + IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, + IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, + IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, }; /** @@ -400,8 +400,8 @@ struct iommu_hwpt_vtd_s1 { * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table */ enum iommu_hwpt_data_type { - IOMMU_HWPT_DATA_NONE, - IOMMU_HWPT_DATA_VTD_S1, + IOMMU_HWPT_DATA_NONE = 0, + IOMMU_HWPT_DATA_VTD_S1 = 1, }; /** @@ -491,8 +491,8 @@ struct iommu_hw_info_vtd { * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type */ enum iommu_hw_info_type { - IOMMU_HW_INFO_TYPE_NONE, - IOMMU_HW_INFO_TYPE_INTEL_VTD, + IOMMU_HW_INFO_TYPE_NONE = 0, + IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, }; /** @@ -629,7 +629,7 @@ struct iommu_hwpt_get_dirty_bitmap { * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1 */ enum iommu_hwpt_invalidate_data_type { - IOMMU_HWPT_INVALIDATE_DATA_VTD_S1, + IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0, }; /** @@ -768,7 +768,7 @@ struct iommu_hwpt_pgfault { */ enum iommufd_page_response_code { IOMMUFD_PAGE_RESP_SUCCESS = 0, - IOMMUFD_PAGE_RESP_INVALID, + IOMMUFD_PAGE_RESP_INVALID = 1, }; /** From d022c3a33e475e695fb53ba652a58b81ed8176a3 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 17 Jul 2024 22:01:30 -0700 Subject: [PATCH 226/352] iommufd/device: Fix hwpt at err_unresv in iommufd_device_do_replace() The rewind routine should remove the reserved iovas added to the new hwpt. Fixes: 89db31635c87 ("iommufd: Derive iommufd_hwpt_paging from iommufd_hw_pagetable") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/r/20240718050130.1956804-1-nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit 950aeefb34923fe3c28ade35fe05f24e2c5b1d55) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 9a7ec5997c61c..3214a4c17c6b3 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -526,7 +526,7 @@ iommufd_device_do_replace(struct iommufd_device *idev, err_unresv: if (hwpt_is_paging(hwpt)) iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(old_hwpt)); + to_hwpt_paging(hwpt)); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); From 7f9f3501b0fab7eb7cb0b4d0ec3ad8e4d5c7abcd Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 2 Aug 2024 17:32:02 -0700 Subject: [PATCH 227/352] iommufd: Reorder include files Reorder include files to alphabetic order to simplify maintenance, and separate local headers and global headers with a blank line. No functional change intended. Link: https://patch.msgid.link/r/7524b037cc05afe19db3c18f863253e1d1554fa2.1722644866.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe (cherry picked from commit 1d4684fbe88dc28e2bf79f5e94a432f0469d2dac) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/device.c | 4 ++-- drivers/iommu/iommufd/fault.c | 4 ++-- drivers/iommu/iommufd/io_pagetable.c | 8 ++++---- drivers/iommu/iommufd/io_pagetable.h | 2 +- drivers/iommu/iommufd/ioas.c | 2 +- drivers/iommu/iommufd/iommufd_private.h | 9 +++++---- drivers/iommu/iommufd/iommufd_test.h | 2 +- drivers/iommu/iommufd/iova_bitmap.c | 2 +- drivers/iommu/iommufd/main.c | 8 ++++---- drivers/iommu/iommufd/pages.c | 10 +++++----- drivers/iommu/iommufd/selftest.c | 9 +++++---- include/linux/iommufd.h | 4 ++-- include/uapi/linux/iommufd.h | 2 +- 13 files changed, 34 insertions(+), 32 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 3214a4c17c6b3..895f2a59fde13 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only /* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ +#include #include #include -#include #include -#include "../iommu-priv.h" +#include "../iommu-priv.h" #include "io_pagetable.h" #include "iommufd_private.h" diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index a643d5c7c535f..df03411c87289 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -3,14 +3,14 @@ */ #define pr_fmt(fmt) "iommufd: " fmt +#include #include #include +#include #include #include -#include #include #include -#include #include #include "../iommu-priv.h" diff --git a/drivers/iommu/iommufd/io_pagetable.c b/drivers/iommu/iommufd/io_pagetable.c index 05fd9d3abf1b8..bbbc8a044bcf7 100644 --- a/drivers/iommu/iommufd/io_pagetable.c +++ b/drivers/iommu/iommufd/io_pagetable.c @@ -8,17 +8,17 @@ * The datastructure uses the iopt_pages to optimize the storage of the PFNs * between the domains and xarray. 
*/ +#include +#include +#include #include #include -#include #include -#include #include -#include #include -#include "io_pagetable.h" #include "double_span.h" +#include "io_pagetable.h" struct iopt_pages_list { struct iopt_pages *pages; diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h index 0ec3509b7e339..c61d74471684e 100644 --- a/drivers/iommu/iommufd/io_pagetable.h +++ b/drivers/iommu/iommufd/io_pagetable.h @@ -6,8 +6,8 @@ #define __IO_PAGETABLE_H #include -#include #include +#include #include #include "iommufd_private.h" diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c index 7422482765481..82428e44a837c 100644 --- a/drivers/iommu/iommufd/ioas.c +++ b/drivers/iommu/iommufd/ioas.c @@ -3,8 +3,8 @@ * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES */ #include -#include #include +#include #include #include "io_pagetable.h" diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 92efe30a8f0d0..017e50574f3b2 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -4,13 +4,14 @@ #ifndef __IOMMUFD_PRIVATE_H #define __IOMMUFD_PRIVATE_H -#include -#include -#include -#include #include #include +#include +#include +#include +#include #include + #include "../iommu-priv.h" struct iommu_domain; diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index acbbba1c66716..f4bc23a92f9a2 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -4,8 +4,8 @@ #ifndef _UAPI_IOMMUFD_TEST_H #define _UAPI_IOMMUFD_TEST_H -#include #include +#include enum { IOMMU_TEST_OP_ADD_RESERVED = 1, diff --git a/drivers/iommu/iommufd/iova_bitmap.c b/drivers/iommu/iommufd/iova_bitmap.c index b9e964b1ad5cc..d90b9e253412f 100644 --- a/drivers/iommu/iommufd/iova_bitmap.c +++ b/drivers/iommu/iommufd/iova_bitmap.c @@ -3,10 +3,10 @@ * Copyright (c) 2022, Oracle and/or its affiliates. * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved */ +#include #include #include #include -#include #define BITS_PER_PAGE (PAGE_SIZE * BITS_PER_BYTE) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 83bbd7c5d1608..b5f5d27ee9634 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -8,15 +8,15 @@ */ #define pr_fmt(fmt) "iommufd: " fmt +#include #include #include -#include -#include +#include #include +#include #include -#include +#include #include -#include #include "io_pagetable.h" #include "iommufd_private.h" diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 117f644a0c5b7..93d806c9c0731 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -45,16 +45,16 @@ * last_iova + 1 can overflow. An iopt_pages index will always be much less than * ULONG_MAX so last_index + 1 cannot overflow. 
*/ +#include +#include +#include #include #include -#include #include -#include -#include -#include -#include "io_pagetable.h" #include "double_span.h" +#include "io_pagetable.h" #ifndef CONFIG_IOMMUFD_TEST #define TEMP_MEMORY_LIMIT 65536 diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index f95e32e291333..04293b20e20c9 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -3,13 +3,14 @@ * * Kernel side components to support tools/testing/selftests/iommu */ -#include -#include -#include -#include #include +#include #include +#include +#include #include +#include +#include #include #include "../iommu-priv.h" diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index ffc3a949f8374..c2f2f6b9148e2 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -6,9 +6,9 @@ #ifndef __LINUX_IOMMUFD_H #define __LINUX_IOMMUFD_H -#include -#include #include +#include +#include struct device; struct iommufd_device; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 4dde745cfb7e2..72010f71c5e47 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -4,8 +4,8 @@ #ifndef _UAPI_IOMMUFD_H #define _UAPI_IOMMUFD_H -#include #include +#include #define IOMMUFD_TYPE (';') From b177a97029dd8d67e3725fa94d44ca4584f1218f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 29 Aug 2024 15:34:30 -0700 Subject: [PATCH 228/352] iommu/arm-smmu-v3: Issue a batch of commands to the same cmdq The driver calls the arm_smmu_get_cmdq() helper in several places, and it's fine to do so since the helper always returns the single SMMU CMDQ. However, with the NVIDIA CMDQV extension or SMMU ECMDQ, there can be multiple cmdqs in the system to select one from. Either case requires a batch of commands to be issued to the same cmdq. Thus, a cmdq has to be decided in the higher-level callers. Add a cmdq pointer to the arm_smmu_cmdq_batch structure, and decide the cmdq when initializing the batch. Pass its pointer down to the bottom function. Update __arm_smmu_cmdq_issue_cmd() accordingly for single command issuers. Suggested-by: Jason Gunthorpe Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/2cbf5ddefb6ea611e48d67c642271bd24421eb21.1724970714.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 56ae8866f3b408836c5f6cafbe6102f6e97911ba) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 48 +++++++++++++-------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 + 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 09011f6b94ff2..aa7e044986804 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -592,11 +592,11 @@ static void arm_smmu_cmdq_poll_valid_map(struct arm_smmu_cmdq *cmdq, /* Wait for the command queue to become non-full */ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, struct arm_smmu_ll_queue *llq) { unsigned long flags; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); int ret = 0; /* @@ -627,11 +627,11 @@ static int arm_smmu_cmdq_poll_until_not_full(struct arm_smmu_device *smmu, * Must be called with the cmdq lock held in some capacity.
*/ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, struct arm_smmu_ll_queue *llq) { int ret = 0; struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); u32 *cmd = (u32 *)(Q_ENT(&cmdq->q, llq->prod)); queue_poll_init(smmu, &qp); @@ -651,10 +651,10 @@ static int __arm_smmu_cmdq_poll_until_msi(struct arm_smmu_device *smmu, * Must be called with the cmdq lock held in some capacity. */ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, struct arm_smmu_ll_queue *llq) { struct arm_smmu_queue_poll qp; - struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); u32 prod = llq->prod; int ret = 0; @@ -701,12 +701,13 @@ static int __arm_smmu_cmdq_poll_until_consumed(struct arm_smmu_device *smmu, } static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, struct arm_smmu_ll_queue *llq) { if (smmu->options & ARM_SMMU_OPT_MSIPOLL) - return __arm_smmu_cmdq_poll_until_msi(smmu, llq); + return __arm_smmu_cmdq_poll_until_msi(smmu, cmdq, llq); - return __arm_smmu_cmdq_poll_until_consumed(smmu, llq); + return __arm_smmu_cmdq_poll_until_consumed(smmu, cmdq, llq); } static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, @@ -743,13 +744,13 @@ static void arm_smmu_cmdq_write_entries(struct arm_smmu_cmdq *cmdq, u64 *cmds, * CPU will appear before any of the commands from the other CPU. */ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq, u64 *cmds, int n, bool sync) { u64 cmd_sync[CMDQ_ENT_DWORDS]; u32 prod; unsigned long flags; bool owner; - struct arm_smmu_cmdq *cmdq = arm_smmu_get_cmdq(smmu); struct arm_smmu_ll_queue llq, head; int ret = 0; @@ -763,7 +764,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, while (!queue_has_space(&llq, n + sync)) { local_irq_restore(flags); - if (arm_smmu_cmdq_poll_until_not_full(smmu, &llq)) + if (arm_smmu_cmdq_poll_until_not_full(smmu, cmdq, &llq)) dev_err_ratelimited(smmu->dev, "CMDQ timeout\n"); local_irq_save(flags); } @@ -839,7 +840,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, /* 5. 
If we are inserting a CMD_SYNC, we must wait for it to complete */ if (sync) { llq.prod = queue_inc_prod_n(&llq, n); - ret = arm_smmu_cmdq_poll_until_sync(smmu, &llq); + ret = arm_smmu_cmdq_poll_until_sync(smmu, cmdq, &llq); if (ret) { dev_err_ratelimited(smmu->dev, "CMD_SYNC timeout at 0x%08x [hwprod 0x%08x, hwcons 0x%08x]\n", @@ -874,7 +875,8 @@ static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, return -EINVAL; } - return arm_smmu_cmdq_issue_cmdlist(smmu, cmd, 1, sync); + return arm_smmu_cmdq_issue_cmdlist( + smmu, arm_smmu_get_cmdq(smmu), cmd, 1, sync); } static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, @@ -889,6 +891,13 @@ static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu, return __arm_smmu_cmdq_issue_cmd(smmu, ent, true); } +static void arm_smmu_cmdq_batch_init(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_batch *cmds) +{ + cmds->num = 0; + cmds->cmdq = arm_smmu_get_cmdq(smmu); +} + static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, struct arm_smmu_cmdq_batch *cmds, struct arm_smmu_cmdq_ent *cmd) @@ -897,13 +906,15 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, if (cmds->num == CMDQ_BATCH_ENTRIES - 1 && (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) { - arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true); - cmds->num = 0; + arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds, + cmds->num, true); + arm_smmu_cmdq_batch_init(smmu, cmds); } if (cmds->num == CMDQ_BATCH_ENTRIES) { - arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, false); - cmds->num = 0; + arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds, + cmds->num, false); + arm_smmu_cmdq_batch_init(smmu, cmds); } index = cmds->num * CMDQ_ENT_DWORDS; @@ -919,7 +930,8 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, static int arm_smmu_cmdq_batch_submit(struct arm_smmu_device *smmu, struct arm_smmu_cmdq_batch *cmds) { - return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmds, cmds->num, true); + return arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds, + cmds->num, true); } static void arm_smmu_page_response(struct device *dev, struct iopf_fault *unused, @@ -1170,7 +1182,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master, }, }; - cmds.num = 0; + arm_smmu_cmdq_batch_init(smmu, &cmds); for (i = 0; i < master->num_streams; i++) { cmd.cfgi.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); @@ -2021,7 +2033,7 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master, arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd); - cmds.num = 0; + arm_smmu_cmdq_batch_init(master->smmu, &cmds); for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd); @@ -2059,7 +2071,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, if (!atomic_read(&smmu_domain->nr_ats_masters)) return 0; - cmds.num = 0; + arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds); spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master_domain, &smmu_domain->devices, @@ -2141,7 +2153,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, num_pages++; } - cmds.num = 0; + arm_smmu_cmdq_batch_init(smmu, &cmds); while (iova < end) { if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 14bca41a981b4..c1454e9758c48 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -570,6 +570,7 @@ struct arm_smmu_cmdq { struct arm_smmu_cmdq_batch { u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS]; + struct arm_smmu_cmdq *cmdq; int num; }; From 2a6d904ea91cc53b29cbeadc1b42b71c45d43b80 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 29 Aug 2024 15:34:31 -0700 Subject: [PATCH 229/352] iommu/arm-smmu-v3: Pass in cmdq pointer to arm_smmu_cmdq_build_sync_cmd The CMDQV extension on the NVIDIA Tegra241 SoC only supports CS_NONE in the CS field of CMD_SYNC, unlike the standard SMMU CMDQ. Pass in the cmdq pointer directly, so the function can identify a different cmdq implementation. Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/723288287997b6dfbcd2a904d2c11e9b23f82250.1724970714.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 2ea1f0120f900b2643afc71cc6bf5bab52df27d8) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index aa7e044986804..675ebfb5852b4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -352,8 +352,9 @@ static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu) } static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, - struct arm_smmu_queue *q, u32 prod) + struct arm_smmu_cmdq *cmdq, u32 prod) { + struct arm_smmu_queue *q = &cmdq->q; struct arm_smmu_cmdq_ent ent = { .opcode = CMDQ_OP_CMD_SYNC, }; @@ -371,7 +372,7 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, } static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, - struct arm_smmu_queue *q) + struct arm_smmu_cmdq *cmdq) { static const char * const cerror_str[] = { [CMDQ_ERR_CERROR_NONE_IDX] = "No error", @@ -379,6 +380,7 @@ static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, [CMDQ_ERR_CERROR_ABT_IDX] = "Abort on command fetch", [CMDQ_ERR_CERROR_ATC_INV_IDX] = "ATC invalidate timeout", }; + struct arm_smmu_queue *q = &cmdq->q; int i; u64 cmd[CMDQ_ENT_DWORDS]; @@ -427,7 +429,7 @@ static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu) { - __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq.q); + __arm_smmu_cmdq_skip_err(smmu, &smmu->cmdq); } /* @@ -790,7 +792,7 @@ static int arm_smmu_cmdq_issue_cmdlist(struct arm_smmu_device *smmu, arm_smmu_cmdq_write_entries(cmdq, cmds, llq.prod, n); if (sync) { prod = queue_inc_prod_n(&llq, n); - arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, &cmdq->q, prod); + arm_smmu_cmdq_build_sync_cmd(cmd_sync, smmu, cmdq, prod); queue_write(Q_ENT(&cmdq->q, prod), cmd_sync, CMDQ_ENT_DWORDS); /* From 1dd35638620e5f6fb9f72d2b94c2db03db681baa Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 29 Aug 2024 15:34:32 -0700 Subject: [PATCH 230/352] iommu/arm-smmu-v3: Pass in cmdq pointer to arm_smmu_cmdq_init So that this function can be used by cmdqs other than &smmu->cmdq. Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/e11a3c0bde172c9652c2946f12bc2ceed4c3a355.1724970714.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit e736c895c45bfcf9a9c675022e51fcabbb33e748) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs
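The effect of making the queue an explicit parameter can be reduced to a few lines (simplified types, not the driver's structures): the helper no longer reaches for the device's default queue, so a secondary queue, such as a Tegra241 VCMDQ later in this series, can share the same init path.

    struct queue { unsigned int nents; };
    struct dev { struct queue default_q; };

    /* The caller now names the queue instead of the helper assuming one. */
    static int queue_init(struct dev *d, struct queue *q)
    {
            (void)d;                /* device-wide settings would go here */
            q->nents = 1U << 10;
            return 0;
    }

    int init_queues(struct dev *d, struct queue *secondary)
    {
            int ret = queue_init(d, &d->default_q);      /* default path */

            return ret ? ret : queue_init(d, secondary); /* new reuse */
    }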
--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 675ebfb5852b4..0b87df6b6937f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3564,9 +3564,9 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, return 0; } -static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu) +static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq) { - struct arm_smmu_cmdq *cmdq = &smmu->cmdq; unsigned int nents = 1 << cmdq->q.llq.max_n_shift; atomic_set(&cmdq->owner_prod, 0); @@ -3591,7 +3591,7 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu) if (ret) return ret; - ret = arm_smmu_cmdq_init(smmu); + ret = arm_smmu_cmdq_init(smmu, &smmu->cmdq); if (ret) return ret; From 28ca6b4cef8c9f8a3e2c9ee36e4c926383dc859a Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 29 Aug 2024 15:34:33 -0700 Subject: [PATCH 231/352] iommu/arm-smmu-v3: Make symbols public for CONFIG_TEGRA241_CMDQV The symbols __arm_smmu_cmdq_skip_err(), arm_smmu_init_one_queue(), and arm_smmu_cmdq_init() need to be used by the tegra241-cmdqv compilation unit in a following patch. Remove the static qualifiers and put the prototypes in the header. Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/c4f2aa5f5f40a2e7c68b132c6d3171d6403de57a.1724970714.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit a7a08b857a32d2f17fb9aba42e2c30d816ce5f1c) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 18 ++++++++---------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 9 +++++++++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0b87df6b6937f..0346887497a4f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -371,8 +371,8 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, arm_smmu_cmdq_build_cmd(cmd, &ent); } -static void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq *cmdq) +void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq) { static const char * const cerror_str[] = { [CMDQ_ERR_CERROR_NONE_IDX] = "No error", @@ -3521,12 +3521,10 @@ static struct iommu_dirty_ops arm_smmu_dirty_ops = { }; /* Probing and initialisation functions */ -static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, - struct arm_smmu_queue *q, - void __iomem *page, - unsigned long prod_off, - unsigned long cons_off, - size_t dwords, const char *name) +int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, + struct arm_smmu_queue *q, void __iomem *page, + unsigned long prod_off, unsigned long cons_off, + size_t dwords, const char *name) { size_t qsz; @@ -3564,8 +3562,8 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, return 0; } -static int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq *cmdq) +int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq) { unsigned int nents = 1 << cmdq->q.llq.max_n_shift; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index c1454e9758c48..ce76357b6fc2a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -816,6 +816,15 @@ void arm_smmu_tlb_inv_range_asid(unsigned long iova, size_t size, int asid, int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, unsigned long iova, size_t size); +void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq); +int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, + struct arm_smmu_queue *q, void __iomem *page, + unsigned long prod_off, unsigned long cons_off, + size_t dwords, const char *name); +int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq); + #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); bool arm_smmu_master_sva_supported(struct arm_smmu_master *master); From c618beb404e00789546a73a0f06a982b92a2a7a0 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 29 Aug 2024 15:34:34 -0700 Subject: [PATCH 232/352] iommu/arm-smmu-v3: Add ARM_SMMU_OPT_TEGRA241_CMDQV The CMDQV extension in NVIDIA Tegra241 SoC only supports CS_NONE in the CS field of CMD_SYNC. Add a new SMMU option to accommodate that. Suggested-by: Will Deacon Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/a3cb9bb2429fbae4a59f7ef517614d226763d717.1724970714.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit b935a5b1c670c0a167f1263df5647b1b5b06e806) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 +++++++++++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 0346887497a4f..144882c578229 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -351,6 +351,15 @@ static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu) return &smmu->cmdq; } +static bool arm_smmu_cmdq_needs_busy_polling(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq *cmdq) +{ + if (cmdq == &smmu->cmdq) + return false; + + return smmu->options & ARM_SMMU_OPT_TEGRA241_CMDQV; +} + static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq, u32 prod) { @@ -369,6 +378,8 @@ static void arm_smmu_cmdq_build_sync_cmd(u64 *cmd, struct arm_smmu_device *smmu, } arm_smmu_cmdq_build_cmd(cmd, &ent); + if (arm_smmu_cmdq_needs_busy_polling(smmu, cmdq)) + u64p_replace_bits(cmd, CMDQ_SYNC_0_CS_NONE, CMDQ_SYNC_0_CS); } void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, @@ -423,6 +434,8 @@ void __arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu, /* Convert the erroneous command into a CMD_SYNC */ arm_smmu_cmdq_build_cmd(cmd, &cmd_sync); + if (arm_smmu_cmdq_needs_busy_polling(smmu, cmdq)) + u64p_replace_bits(cmd, CMDQ_SYNC_0_CS_NONE, CMDQ_SYNC_0_CS); queue_write(Q_ENT(q, cons), cmd, q->ent_dwords); } @@ -706,7 +719,8 @@ static int arm_smmu_cmdq_poll_until_sync(struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq, struct arm_smmu_ll_queue *llq) { - if (smmu->options & ARM_SMMU_OPT_MSIPOLL) + if (smmu->options & ARM_SMMU_OPT_MSIPOLL && + !arm_smmu_cmdq_needs_busy_polling(smmu, cmdq)) return __arm_smmu_cmdq_poll_until_msi(smmu, cmdq, llq); return __arm_smmu_cmdq_poll_until_consumed(smmu, cmdq, llq); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index ce76357b6fc2a..ebd9db4789ff3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -665,6 +665,7 @@ struct arm_smmu_device { #define ARM_SMMU_OPT_PAGE0_REGS_ONLY (1 << 1) #define ARM_SMMU_OPT_MSIPOLL (1 << 2) #define ARM_SMMU_OPT_CMDQ_FORCE_SYNC (1 << 3) +#define ARM_SMMU_OPT_TEGRA241_CMDQV (1 << 4) u32 options; struct arm_smmu_cmdq cmdq; From 1f63a6120e3e65dd0a5ff92a4c0736276b975711 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 29 Aug 2024 15:34:35 -0700 Subject: [PATCH 233/352] iommu/arm-smmu-v3: Add acpi_smmu_iort_probe_model for impl For model-specific implementation, repurpose acpi_smmu_get_options() into a wider acpi_smmu_iort_probe_model(). A new model can be added to the list in this new function. Suggested-by: Will Deacon Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/79716299829aeab2e55b8c7932f2634b209bb4d5.1724970714.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 6f3f9ff43d005571a8d70d4a562ed7c4150e324c) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs
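With the probe routine in place, wiring up a future model becomes a one-case change. A sketch of the shape it would take inside acpi_smmu_iort_probe_model() (the model ID and option flag below are hypothetical, not part of this series):

    switch (iort_smmu->model) {
    case ACPI_IORT_SMMU_V3_CAVIUM_CN99XX:
            smmu->options |= ARM_SMMU_OPT_PAGE0_REGS_ONLY;
            break;
    case ACPI_IORT_SMMU_V3_SOME_NEW_MODEL:            /* hypothetical ID */
            smmu->options |= ARM_SMMU_OPT_SOME_NEW_QUIRK; /* hypothetical */
            break;
    }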
--- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 144882c578229..c92254bef2918 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -4340,18 +4340,25 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) } #ifdef CONFIG_ACPI -static void acpi_smmu_get_options(u32 model, struct arm_smmu_device *smmu) +static int acpi_smmu_iort_probe_model(struct acpi_iort_node *node, + struct arm_smmu_device *smmu) { - switch (model) { + struct acpi_iort_smmu_v3 *iort_smmu = + (struct acpi_iort_smmu_v3 *)node->node_data; + + switch (iort_smmu->model) { case ACPI_IORT_SMMU_V3_CAVIUM_CN99XX: smmu->options |= ARM_SMMU_OPT_PAGE0_REGS_ONLY; break; case ACPI_IORT_SMMU_V3_HISILICON_HI161X: smmu->options |= ARM_SMMU_OPT_SKIP_PREFETCH; break; + case ACPI_IORT_SMMU_V3_GENERIC: + break; } dev_notice(smmu->dev, "option mask 0x%x\n", smmu->options); + return 0; } static int arm_smmu_device_acpi_probe(struct platform_device *pdev, @@ -4366,8 +4373,6 @@ static int arm_smmu_device_acpi_probe(struct platform_device *pdev, /* Retrieve SMMUv3 specific data */ iort_smmu = (struct acpi_iort_smmu_v3 *)node->node_data; - acpi_smmu_get_options(iort_smmu->model, smmu); - if (iort_smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) smmu->features |= ARM_SMMU_FEAT_COHERENCY; @@ -4379,7 +4384,7 @@ static int arm_smmu_device_acpi_probe(struct platform_device *pdev, smmu->features |= ARM_SMMU_FEAT_HA; } - return 0; + return acpi_smmu_iort_probe_model(node, smmu); } #else static inline int arm_smmu_device_acpi_probe(struct platform_device *pdev, From 6487e30a039a63c0f515eeb2c1318a5841143120 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 29 Aug 2024 15:34:36 -0700 Subject: [PATCH 234/352] iommu/arm-smmu-v3: Add struct arm_smmu_impl_ops Mimicking the arm-smmu (v2) driver, introduce a struct arm_smmu_impl_ops to accommodate impl routines. Suggested-by: Will Deacon Signed-off-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/8fe9f3805568aabf771fc6706c116459016bf62d.1724970714.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 6de80d619203c672e5c011e8715bd965d27b69cf) Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 51 ++++++++++++++++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 10 ++++ 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index c92254bef2918..7a0b1a884ecb3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -348,7 +348,12 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu) { - return &smmu->cmdq; + struct arm_smmu_cmdq *cmdq = NULL; + + if (smmu->impl_ops && smmu->impl_ops->get_secondary_cmdq) + cmdq = smmu->impl_ops->get_secondary_cmdq(smmu); + + return cmdq ?: &smmu->cmdq; } static bool arm_smmu_cmdq_needs_busy_polling(struct arm_smmu_device *smmu, @@ -4051,6 +4056,14 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu) return ret; } + if (smmu->impl_ops && smmu->impl_ops->device_reset) { + ret = smmu->impl_ops->device_reset(smmu); + if (ret) { + dev_err(smmu->dev, "failed to reset impl\n"); + return ret; + } + } + return 0; } @@ -4465,6 +4478,38 @@ static void arm_smmu_rmr_install_bypass_ste(struct arm_smmu_device *smmu) iort_put_rmr_sids(dev_fwnode(smmu->dev), &rmr_list); } +static void arm_smmu_impl_remove(void *data) +{ + struct arm_smmu_device *smmu = data; + + if (smmu->impl_ops && smmu->impl_ops->device_remove) + smmu->impl_ops->device_remove(smmu); +} + +/* + * Probe all the compiled in implementations. Each one checks to see if it + * matches this HW and if so returns a devm_krealloc'd arm_smmu_device which + * replaces the callers. Otherwise the original is returned or ERR_PTR. + */ +static struct arm_smmu_device *arm_smmu_impl_probe(struct arm_smmu_device *smmu) +{ + struct arm_smmu_device *new_smmu = ERR_PTR(-ENODEV); + int ret; + + /* Add impl probe */ + + if (new_smmu == ERR_PTR(-ENODEV)) + return smmu; + if (IS_ERR(new_smmu)) + return new_smmu; + + ret = devm_add_action_or_reset(new_smmu->dev, arm_smmu_impl_remove, + new_smmu); + if (ret) + return ERR_PTR(ret); + return new_smmu; +} + static int arm_smmu_device_probe(struct platform_device *pdev) { int irq, ret; @@ -4486,6 +4531,10 @@ static int arm_smmu_device_probe(struct platform_device *pdev) if (ret) return ret; + smmu = arm_smmu_impl_probe(smmu); + if (IS_ERR(smmu)) + return PTR_ERR(smmu); + /* Base address */ res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (!res) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index ebd9db4789ff3..c2d8649922805 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -14,6 +14,8 @@ #include #include +struct arm_smmu_device; + /* MMIO registers */ #define ARM_SMMU_IDR0 0x0 #define IDR0_ST_LVL GENMASK(28, 27) @@ -630,9 +632,17 @@ struct arm_smmu_strtab_cfg { u32 strtab_base_cfg; }; +struct arm_smmu_impl_ops { + int (*device_reset)(struct arm_smmu_device *smmu); + void (*device_remove)(struct arm_smmu_device *smmu); + struct arm_smmu_cmdq *(*get_secondary_cmdq)(struct arm_smmu_device *smmu); +}; + /* An SMMUv3 instance */ struct arm_smmu_device { struct device *dev; + const struct arm_smmu_impl_ops *impl_ops; + void __iomem *base; void __iomem *page1; From 86123d71ef3973557a832392b785e51a9a1aeac7 Mon Sep 17 00:00:00 2001 From: Nate Watterson Date: Thu, 29 Aug 2024 15:34:37 -0700 Subject: [PATCH 235/352] iommu/arm-smmu-v3: Add in-kernel 
support for NVIDIA Tegra241 (Grace) CMDQV NVIDIA's Tegra241 SoC has CMDQ-Virtualization (CMDQV) hardware, extending the standard ARM SMMU v3 IP to support multiple VCMDQs with virtualization capabilities. In terms of command queues, they are very similar to standard SMMU CMDQs (or ECMDQs), but only support CS_NONE in the CS field of CMD_SYNC. Add a new tegra241-cmdqv driver, and insert its structure pointer into the existing arm_smmu_device, and then add related function calls in the SMMUv3 driver to interact with the CMDQV driver. In the CMDQV driver, add a minimal part for the in-kernel support: reserve VINTF0 for in-kernel use, assign some of the VCMDQs to VINTF0, and select one VCMDQ based on the current CPU ID to execute supported commands. This multi-queue design for in-kernel use gives some limited improvements: up to 20% reduction of invalidation time was measured by a multi-threaded DMA unmap benchmark, compared to a single queue. The other part of the CMDQV driver will be user-space support that allows a hypervisor running on the host OS to talk to the driver for virtualization use cases, allowing VMs to use VCMDQs without trapping, i.e. no VM Exits. This is designed based on IOMMUFD, and its RFC series is also under review. It will provide a guest OS with a bigger improvement: 70% to 90% reductions of TLB invalidation time were measured by DMA unmap tests running in a guest, compared to the nested SMMU CMDQ (with trapping). As the initial version, the CMDQV driver only supports ACPI configurations. Signed-off-by: Nate Watterson Reviewed-by: Jason Gunthorpe Co-developed-by: Nicolin Chen Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/dce50490b2c10b7254fb36aa73ed7ffd812b283a.1724970714.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 918eb5c856f6ce4cf93b4b38e4b5e156905c5943) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- MAINTAINERS | 1 + drivers/iommu/Kconfig | 11 + drivers/iommu/arm/arm-smmu-v3/Makefile | 1 + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 33 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 11 + .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 858 ++++++++++++++++++ 6 files changed, 914 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c diff --git a/MAINTAINERS b/MAINTAINERS index 268d88f7423e7..fe0cd73a1a1ef 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21610,6 +21610,7 @@ M: Thierry Reding R: Krishna Reddy L: linux-tegra@vger.kernel.org S: Supported +F: drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c F: drivers/iommu/arm/arm-smmu/arm-smmu-nvidia.c F: drivers/iommu/tegra* diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig index ea413284af28e..89a6800fe0fa1 100644 --- a/drivers/iommu/Kconfig +++ b/drivers/iommu/Kconfig @@ -424,6 +424,17 @@ config ARM_SMMU_V3_KUNIT_TEST Enable this option to unit-test arm-smmu-v3 driver functions. If unsure, say N. + +config TEGRA241_CMDQV + bool "NVIDIA Tegra241 CMDQ-V extension support for ARM SMMUv3" + depends on ACPI + help + Support for NVIDIA CMDQ-Virtualization extension for ARM SMMUv3. The + CMDQ-V extension is similar to v3.3 ECMDQ for multi command queues + support, except with virtualization capabilities. + + Say Y here if your system is NVIDIA Tegra241 (Grace) or it has the same + CMDQ-V extension.
endif config S390_IOMMU diff --git a/drivers/iommu/arm/arm-smmu-v3/Makefile b/drivers/iommu/arm/arm-smmu-v3/Makefile index 355173d1441d2..dc98c88b48c82 100644 --- a/drivers/iommu/arm/arm-smmu-v3/Makefile +++ b/drivers/iommu/arm/arm-smmu-v3/Makefile @@ -2,5 +2,6 @@ obj-$(CONFIG_ARM_SMMU_V3) += arm_smmu_v3.o arm_smmu_v3-y := arm-smmu-v3.o arm_smmu_v3-$(CONFIG_ARM_SMMU_V3_SVA) += arm-smmu-v3-sva.o +arm_smmu_v3-$(CONFIG_TEGRA241_CMDQV) += tegra241-cmdqv.o obj-$(CONFIG_ARM_SMMU_V3_KUNIT_TEST) += arm-smmu-v3-test.o diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 7a0b1a884ecb3..196078c813be9 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -4353,6 +4353,31 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) } #ifdef CONFIG_ACPI +#ifdef CONFIG_TEGRA241_CMDQV +static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node, + struct arm_smmu_device *smmu) +{ + const char *uid = kasprintf(GFP_KERNEL, "%u", node->identifier); + struct acpi_device *adev; + + /* Look for an NVDA200C node whose _UID matches the SMMU node ID */ + adev = acpi_dev_get_first_match_dev("NVDA200C", uid, -1); + if (adev) { + /* Tegra241 CMDQV driver is responsible for put_device() */ + smmu->impl_dev = &adev->dev; + smmu->options |= ARM_SMMU_OPT_TEGRA241_CMDQV; + dev_info(smmu->dev, "found companion CMDQV device: %s\n", + dev_name(smmu->impl_dev)); + } + kfree(uid); +} +#else +static void acpi_smmu_dsdt_probe_tegra241_cmdqv(struct acpi_iort_node *node, + struct arm_smmu_device *smmu) +{ +} +#endif + static int acpi_smmu_iort_probe_model(struct acpi_iort_node *node, struct arm_smmu_device *smmu) { @@ -4367,6 +4392,11 @@ static int acpi_smmu_iort_probe_model(struct acpi_iort_node *node, smmu->options |= ARM_SMMU_OPT_SKIP_PREFETCH; break; case ACPI_IORT_SMMU_V3_GENERIC: + /* + * Tegra241 implementation stores its SMMU options and impl_dev + * in DSDT. Thus, go through the ACPI tables unconditionally. 
+ */ + acpi_smmu_dsdt_probe_tegra241_cmdqv(node, smmu); break; } @@ -4496,7 +4526,8 @@ static struct arm_smmu_device *arm_smmu_impl_probe(struct arm_smmu_device *smmu) struct arm_smmu_device *new_smmu = ERR_PTR(-ENODEV); int ret; - /* Add impl probe */ + if (smmu->impl_dev && (smmu->options & ARM_SMMU_OPT_TEGRA241_CMDQV)) + new_smmu = tegra241_cmdqv_probe(smmu); if (new_smmu == ERR_PTR(-ENODEV)) return smmu; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index c2d8649922805..d820da90a09a4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -641,6 +641,7 @@ struct arm_smmu_impl_ops { /* An SMMUv3 instance */ struct arm_smmu_device { struct device *dev; + struct device *impl_dev; const struct arm_smmu_impl_ops *impl_ops; void __iomem *base; @@ -887,4 +888,14 @@ static inline void arm_smmu_sva_remove_dev_pasid(struct iommu_domain *domain, { } #endif /* CONFIG_ARM_SMMU_V3_SVA */ + +#ifdef CONFIG_TEGRA241_CMDQV +struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu); +#else /* CONFIG_TEGRA241_CMDQV */ +static inline struct arm_smmu_device * +tegra241_cmdqv_probe(struct arm_smmu_device *smmu) +{ + return ERR_PTR(-ENODEV); +} +#endif /* CONFIG_TEGRA241_CMDQV */ #endif /* _ARM_SMMU_V3_H */ diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c new file mode 100644 index 0000000000000..5ac3032ee6dd2 --- /dev/null +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -0,0 +1,858 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (C) 2021-2024 NVIDIA CORPORATION & AFFILIATES. */ + +#define dev_fmt(fmt) "tegra241_cmdqv: " fmt + +#include +#include +#include +#include +#include +#include + +#include + +#include "arm-smmu-v3.h" + +/* CMDQV register page base and size defines */ +#define TEGRA241_CMDQV_CONFIG_BASE (0) +#define TEGRA241_CMDQV_CONFIG_SIZE (SZ_64K) +#define TEGRA241_VCMDQ_PAGE0_BASE (TEGRA241_CMDQV_CONFIG_BASE + SZ_64K) +#define TEGRA241_VCMDQ_PAGE1_BASE (TEGRA241_VCMDQ_PAGE0_BASE + SZ_64K) +#define TEGRA241_VINTF_PAGE_BASE (TEGRA241_VCMDQ_PAGE1_BASE + SZ_64K) + +/* CMDQV global base regs */ +#define TEGRA241_CMDQV_CONFIG 0x0000 +#define CMDQV_EN BIT(0) + +#define TEGRA241_CMDQV_PARAM 0x0004 +#define CMDQV_NUM_VINTF_LOG2 GENMASK(11, 8) +#define CMDQV_NUM_VCMDQ_LOG2 GENMASK(7, 4) + +#define TEGRA241_CMDQV_STATUS 0x0008 +#define CMDQV_ENABLED BIT(0) + +#define TEGRA241_CMDQV_VINTF_ERR_MAP 0x0014 +#define TEGRA241_CMDQV_VINTF_INT_MASK 0x001C +#define TEGRA241_CMDQV_CMDQ_ERR_MAP(m) (0x0024 + 0x4*(m)) + +#define TEGRA241_CMDQV_CMDQ_ALLOC(q) (0x0200 + 0x4*(q)) +#define CMDQV_CMDQ_ALLOC_VINTF GENMASK(20, 15) +#define CMDQV_CMDQ_ALLOC_LVCMDQ GENMASK(7, 1) +#define CMDQV_CMDQ_ALLOCATED BIT(0) + +/* VINTF base regs */ +#define TEGRA241_VINTF(v) (0x1000 + 0x100*(v)) + +#define TEGRA241_VINTF_CONFIG 0x0000 +#define VINTF_HYP_OWN BIT(17) +#define VINTF_VMID GENMASK(16, 1) +#define VINTF_EN BIT(0) + +#define TEGRA241_VINTF_STATUS 0x0004 +#define VINTF_STATUS GENMASK(3, 1) +#define VINTF_ENABLED BIT(0) + +#define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \ + (0x00C0 + 0x8*(m)) +#define LVCMDQ_ERR_MAP_NUM_64 2 + +/* VCMDQ base regs */ +/* -- PAGE0 -- */ +#define TEGRA241_VCMDQ_PAGE0(q) (TEGRA241_VCMDQ_PAGE0_BASE + 0x80*(q)) + +#define TEGRA241_VCMDQ_CONS 0x00000 +#define VCMDQ_CONS_ERR GENMASK(30, 24) + +#define TEGRA241_VCMDQ_PROD 0x00004 + +#define TEGRA241_VCMDQ_CONFIG 0x00008 +#define VCMDQ_EN BIT(0) + +#define 
TEGRA241_VCMDQ_STATUS 0x0000C +#define VCMDQ_ENABLED BIT(0) + +#define TEGRA241_VCMDQ_GERROR 0x00010 +#define TEGRA241_VCMDQ_GERRORN 0x00014 + +/* -- PAGE1 -- */ +#define TEGRA241_VCMDQ_PAGE1(q) (TEGRA241_VCMDQ_PAGE1_BASE + 0x80*(q)) +#define VCMDQ_ADDR GENMASK(47, 5) +#define VCMDQ_LOG2SIZE GENMASK(4, 0) +#define VCMDQ_LOG2SIZE_MAX 19 + +#define TEGRA241_VCMDQ_BASE 0x00000 +#define TEGRA241_VCMDQ_CONS_INDX_BASE 0x00008 + +/* VINTF logical-VCMDQ pages */ +#define TEGRA241_VINTFi_PAGE0(i) (TEGRA241_VINTF_PAGE_BASE + SZ_128K*(i)) +#define TEGRA241_VINTFi_PAGE1(i) (TEGRA241_VINTFi_PAGE0(i) + SZ_64K) +#define TEGRA241_VINTFi_LVCMDQ_PAGE0(i, q) \ + (TEGRA241_VINTFi_PAGE0(i) + 0x80*(q)) +#define TEGRA241_VINTFi_LVCMDQ_PAGE1(i, q) \ + (TEGRA241_VINTFi_PAGE1(i) + 0x80*(q)) + +/* MMIO helpers */ +#define REG_CMDQV(_cmdqv, _regname) \ + ((_cmdqv)->base + TEGRA241_CMDQV_##_regname) +#define REG_VINTF(_vintf, _regname) \ + ((_vintf)->base + TEGRA241_VINTF_##_regname) +#define REG_VCMDQ_PAGE0(_vcmdq, _regname) \ + ((_vcmdq)->page0 + TEGRA241_VCMDQ_##_regname) +#define REG_VCMDQ_PAGE1(_vcmdq, _regname) \ + ((_vcmdq)->page1 + TEGRA241_VCMDQ_##_regname) + + +static bool disable_cmdqv; +module_param(disable_cmdqv, bool, 0444); +MODULE_PARM_DESC(disable_cmdqv, + "This allows to disable CMDQV HW and use default SMMU internal CMDQ."); + +static bool bypass_vcmdq; +module_param(bypass_vcmdq, bool, 0444); +MODULE_PARM_DESC(bypass_vcmdq, + "This allows to bypass VCMDQ for debugging use or perf comparison."); + +/** + * struct tegra241_vcmdq - Virtual Command Queue + * @idx: Global index in the CMDQV + * @lidx: Local index in the VINTF + * @enabled: Enable status + * @cmdqv: Parent CMDQV pointer + * @vintf: Parent VINTF pointer + * @cmdq: Command Queue struct + * @page0: MMIO Page0 base address + * @page1: MMIO Page1 base address + */ +struct tegra241_vcmdq { + u16 idx; + u16 lidx; + + bool enabled; + + struct tegra241_cmdqv *cmdqv; + struct tegra241_vintf *vintf; + struct arm_smmu_cmdq cmdq; + + void __iomem *page0; + void __iomem *page1; +}; + +/** + * struct tegra241_vintf - Virtual Interface + * @idx: Global index in the CMDQV + * @enabled: Enable status + * @cmdqv: Parent CMDQV pointer + * @lvcmdqs: List of logical VCMDQ pointers + * @base: MMIO base address + */ +struct tegra241_vintf { + u16 idx; + + bool enabled; + + struct tegra241_cmdqv *cmdqv; + struct tegra241_vcmdq **lvcmdqs; + + void __iomem *base; +}; + +/** + * struct tegra241_cmdqv - CMDQ-V for SMMUv3 + * @smmu: SMMUv3 device + * @dev: CMDQV device + * @base: MMIO base address + * @irq: IRQ number + * @num_vintfs: Total number of VINTFs + * @num_vcmdqs: Total number of VCMDQs + * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF + * @vintf_ids: VINTF id allocator + * @vintfs: List of VINTFs + */ +struct tegra241_cmdqv { + struct arm_smmu_device smmu; + struct device *dev; + + void __iomem *base; + int irq; + + /* CMDQV Hardware Params */ + u16 num_vintfs; + u16 num_vcmdqs; + u16 num_lvcmdqs_per_vintf; + + struct ida vintf_ids; + + struct tegra241_vintf **vintfs; +}; + +/* Config and Polling Helpers */ + +static inline int tegra241_cmdqv_write_config(struct tegra241_cmdqv *cmdqv, + void __iomem *addr_config, + void __iomem *addr_status, + u32 regval, const char *header, + bool *out_enabled) +{ + bool en = regval & BIT(0); + int ret; + + writel(regval, addr_config); + ret = readl_poll_timeout(addr_status, regval, + en ? 
regval & BIT(0) : !(regval & BIT(0)), + 1, ARM_SMMU_POLL_TIMEOUT_US); + if (ret) + dev_err(cmdqv->dev, "%sfailed to %sable, STATUS=0x%08X\n", + header, en ? "en" : "dis", regval); + if (out_enabled) + WRITE_ONCE(*out_enabled, regval & BIT(0)); + return ret; +} + +static inline int cmdqv_write_config(struct tegra241_cmdqv *cmdqv, u32 regval) +{ + return tegra241_cmdqv_write_config(cmdqv, + REG_CMDQV(cmdqv, CONFIG), + REG_CMDQV(cmdqv, STATUS), + regval, "CMDQV: ", NULL); +} + +static inline int vintf_write_config(struct tegra241_vintf *vintf, u32 regval) +{ + char header[16]; + + snprintf(header, 16, "VINTF%u: ", vintf->idx); + return tegra241_cmdqv_write_config(vintf->cmdqv, + REG_VINTF(vintf, CONFIG), + REG_VINTF(vintf, STATUS), + regval, header, &vintf->enabled); +} + +static inline char *lvcmdq_error_header(struct tegra241_vcmdq *vcmdq, + char *header, int hlen) +{ + WARN_ON(hlen < 32); + if (WARN_ON(!vcmdq->vintf)) + return ""; + snprintf(header, hlen, "VINTF%u: VCMDQ%u/LVCMDQ%u: ", + vcmdq->vintf->idx, vcmdq->idx, vcmdq->lidx); + return header; +} + +static inline int vcmdq_write_config(struct tegra241_vcmdq *vcmdq, u32 regval) +{ + char header[32], *h = lvcmdq_error_header(vcmdq, header, 32); + + return tegra241_cmdqv_write_config(vcmdq->cmdqv, + REG_VCMDQ_PAGE0(vcmdq, CONFIG), + REG_VCMDQ_PAGE0(vcmdq, STATUS), + regval, h, &vcmdq->enabled); +} + +/* ISR Functions */ + +static void tegra241_vintf0_handle_error(struct tegra241_vintf *vintf) +{ + int i; + + for (i = 0; i < LVCMDQ_ERR_MAP_NUM_64; i++) { + u64 map = readq_relaxed(REG_VINTF(vintf, LVCMDQ_ERR_MAP_64(i))); + + while (map) { + unsigned long lidx = __ffs64(map); + struct tegra241_vcmdq *vcmdq = vintf->lvcmdqs[lidx]; + u32 gerror = readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)); + + __arm_smmu_cmdq_skip_err(&vintf->cmdqv->smmu, &vcmdq->cmdq); + writel(gerror, REG_VCMDQ_PAGE0(vcmdq, GERRORN)); + map &= ~BIT_ULL(lidx); + } + } +} + +static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid) +{ + struct tegra241_cmdqv *cmdqv = (struct tegra241_cmdqv *)devid; + void __iomem *reg_vintf_map = REG_CMDQV(cmdqv, VINTF_ERR_MAP); + char err_str[256]; + u64 vintf_map; + + /* Use readl_relaxed() as register addresses are not 64-bit aligned */ + vintf_map = (u64)readl_relaxed(reg_vintf_map + 0x4) << 32 | + (u64)readl_relaxed(reg_vintf_map); + + snprintf(err_str, sizeof(err_str), + "vintf_map: %016llx, vcmdq_map %08x:%08x:%08x:%08x", vintf_map, + readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(3))), + readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(2))), + readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(1))), + readl_relaxed(REG_CMDQV(cmdqv, CMDQ_ERR_MAP(0)))); + + dev_warn(cmdqv->dev, "unexpected error reported. %s\n", err_str); + + /* Handle VINTF0 and its LVCMDQs */ + if (vintf_map & BIT_ULL(0)) { + tegra241_vintf0_handle_error(cmdqv->vintfs[0]); + vintf_map &= ~BIT_ULL(0); + } + + return IRQ_HANDLED; +} + +/* Command Queue Function */ + +static struct arm_smmu_cmdq * +tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + struct tegra241_vintf *vintf = cmdqv->vintfs[0]; + struct tegra241_vcmdq *vcmdq; + u16 lidx; + + if (READ_ONCE(bypass_vcmdq)) + return NULL; + + /* Use SMMU CMDQ if VINTF0 is uninitialized */ + if (!READ_ONCE(vintf->enabled)) + return NULL; + + /* + * Select a LVCMDQ to use. 
Here we use a temporary solution to
+	 * balance out traffic on cmdq issuing: each cmdq has its own
+	 * lock; if all CPUs issue cmdlists using the same cmdq, only
+	 * one CPU at a time can enter the process, while the others
+	 * will be spinning at the same lock.
+	 */
+	lidx = smp_processor_id() % cmdqv->num_lvcmdqs_per_vintf;
+	vcmdq = vintf->lvcmdqs[lidx];
+	if (!vcmdq || !READ_ONCE(vcmdq->enabled))
+		return NULL;
+	return &vcmdq->cmdq;
+}
+
+/* HW Reset Functions */
+
+static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
+{
+	char header[32], *h = lvcmdq_error_header(vcmdq, header, 32);
+	u32 gerrorn, gerror;
+
+	if (vcmdq_write_config(vcmdq, 0)) {
+		dev_err(vcmdq->cmdqv->dev,
+			"%sGERRORN=0x%X, GERROR=0x%X, CONS=0x%X\n", h,
+			readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERRORN)),
+			readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)),
+			readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS)));
+	}
+	writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, PROD));
+	writel_relaxed(0, REG_VCMDQ_PAGE0(vcmdq, CONS));
+	writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, BASE));
+	writeq_relaxed(0, REG_VCMDQ_PAGE1(vcmdq, CONS_INDX_BASE));
+
+	gerrorn = readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERRORN));
+	gerror = readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR));
+	if (gerror != gerrorn) {
+		dev_warn(vcmdq->cmdqv->dev,
+			 "%suncleared error detected, resetting\n", h);
+		writel(gerror, REG_VCMDQ_PAGE0(vcmdq, GERRORN));
+	}
+
+	dev_dbg(vcmdq->cmdqv->dev, "%sdeinited\n", h);
+}
+
+static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
+{
+	char header[32], *h = lvcmdq_error_header(vcmdq, header, 32);
+	int ret;
+
+	/* Reset VCMDQ */
+	tegra241_vcmdq_hw_deinit(vcmdq);
+
+	/* Configure and enable VCMDQ */
+	writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE));
+
+	ret = vcmdq_write_config(vcmdq, VCMDQ_EN);
+	if (ret) {
+		dev_err(vcmdq->cmdqv->dev,
+			"%sGERRORN=0x%X, GERROR=0x%X, CONS=0x%X\n", h,
+			readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERRORN)),
+			readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, GERROR)),
+			readl_relaxed(REG_VCMDQ_PAGE0(vcmdq, CONS)));
+		return ret;
+	}
+
+	dev_dbg(vcmdq->cmdqv->dev, "%sinited\n", h);
+	return 0;
+}
+
+static void tegra241_vintf_hw_deinit(struct tegra241_vintf *vintf)
+{
+	u16 lidx;
+
+	for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++)
+		if (vintf->lvcmdqs && vintf->lvcmdqs[lidx])
+			tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]);
+	vintf_write_config(vintf, 0);
+}
+
+static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own)
+{
+	u32 regval;
+	u16 lidx;
+	int ret;
+
+	/* Reset VINTF */
+	tegra241_vintf_hw_deinit(vintf);
+
+	/* Configure and enable VINTF */
+	regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own);
+	writel(regval, REG_VINTF(vintf, CONFIG));
+
+	ret = vintf_write_config(vintf, regval | VINTF_EN);
+	if (ret)
+		return ret;
+
+	for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) {
+		if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) {
+			ret = tegra241_vcmdq_hw_init(vintf->lvcmdqs[lidx]);
+			if (ret) {
+				tegra241_vintf_hw_deinit(vintf);
+				return ret;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static int tegra241_cmdqv_hw_reset(struct arm_smmu_device *smmu)
+{
+	struct tegra241_cmdqv *cmdqv =
+		container_of(smmu, struct tegra241_cmdqv, smmu);
+	u16 qidx, lidx, idx;
+	u32 regval;
+	int ret;
+
+	/* Reset CMDQV */
+	regval = readl_relaxed(REG_CMDQV(cmdqv, CONFIG));
+	ret = cmdqv_write_config(cmdqv, regval & ~CMDQV_EN);
+	if (ret)
+		return ret;
+	ret = cmdqv_write_config(cmdqv, regval | CMDQV_EN);
+	if (ret)
+		return ret;
+
+	/* Assign preallocated global VCMDQs to each
VINTF as LVCMDQs */ + for (idx = 0, qidx = 0; idx < cmdqv->num_vintfs; idx++) { + for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) { + regval = FIELD_PREP(CMDQV_CMDQ_ALLOC_VINTF, idx); + regval |= FIELD_PREP(CMDQV_CMDQ_ALLOC_LVCMDQ, lidx); + regval |= CMDQV_CMDQ_ALLOCATED; + writel_relaxed(regval, + REG_CMDQV(cmdqv, CMDQ_ALLOC(qidx++))); + } + } + + return tegra241_vintf_hw_init(cmdqv->vintfs[0], true); +} + +/* VCMDQ Resource Helpers */ + +static void tegra241_vcmdq_free_smmu_cmdq(struct tegra241_vcmdq *vcmdq) +{ + struct arm_smmu_queue *q = &vcmdq->cmdq.q; + size_t nents = 1 << q->llq.max_n_shift; + size_t qsz = nents << CMDQ_ENT_SZ_SHIFT; + + if (!q->base) + return; + dmam_free_coherent(vcmdq->cmdqv->smmu.dev, qsz, q->base, q->base_dma); +} + +static int tegra241_vcmdq_alloc_smmu_cmdq(struct tegra241_vcmdq *vcmdq) +{ + struct arm_smmu_device *smmu = &vcmdq->cmdqv->smmu; + struct arm_smmu_cmdq *cmdq = &vcmdq->cmdq; + struct arm_smmu_queue *q = &cmdq->q; + char name[16]; + int ret; + + snprintf(name, 16, "vcmdq%u", vcmdq->idx); + + q->llq.max_n_shift = VCMDQ_LOG2SIZE_MAX; + + /* Use the common helper to init the VCMDQ, and then... */ + ret = arm_smmu_init_one_queue(smmu, q, vcmdq->page0, + TEGRA241_VCMDQ_PROD, TEGRA241_VCMDQ_CONS, + CMDQ_ENT_DWORDS, name); + if (ret) + return ret; + + /* ...override q_base to write VCMDQ_BASE registers */ + q->q_base = q->base_dma & VCMDQ_ADDR; + q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift); + + return arm_smmu_cmdq_init(smmu, cmdq); +} + +/* VINTF Logical VCMDQ Resource Helpers */ + +static void tegra241_vintf_deinit_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) +{ + vintf->lvcmdqs[lidx] = NULL; +} + +static int tegra241_vintf_init_lvcmdq(struct tegra241_vintf *vintf, u16 lidx, + struct tegra241_vcmdq *vcmdq) +{ + struct tegra241_cmdqv *cmdqv = vintf->cmdqv; + u16 idx = vintf->idx; + + vcmdq->idx = idx * cmdqv->num_lvcmdqs_per_vintf + lidx; + vcmdq->lidx = lidx; + vcmdq->cmdqv = cmdqv; + vcmdq->vintf = vintf; + vcmdq->page0 = cmdqv->base + TEGRA241_VINTFi_LVCMDQ_PAGE0(idx, lidx); + vcmdq->page1 = cmdqv->base + TEGRA241_VINTFi_LVCMDQ_PAGE1(idx, lidx); + + vintf->lvcmdqs[lidx] = vcmdq; + return 0; +} + +static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) +{ + struct tegra241_vcmdq *vcmdq = vintf->lvcmdqs[lidx]; + char header[32]; + + tegra241_vcmdq_free_smmu_cmdq(vcmdq); + tegra241_vintf_deinit_lvcmdq(vintf, lidx); + + dev_dbg(vintf->cmdqv->dev, + "%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 32)); + kfree(vcmdq); +} + +static struct tegra241_vcmdq * +tegra241_vintf_alloc_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) +{ + struct tegra241_cmdqv *cmdqv = vintf->cmdqv; + struct tegra241_vcmdq *vcmdq; + char header[32]; + int ret; + + vcmdq = kzalloc(sizeof(*vcmdq), GFP_KERNEL); + if (!vcmdq) + return ERR_PTR(-ENOMEM); + + ret = tegra241_vintf_init_lvcmdq(vintf, lidx, vcmdq); + if (ret) + goto free_vcmdq; + + /* Build an arm_smmu_cmdq for each LVCMDQ */ + ret = tegra241_vcmdq_alloc_smmu_cmdq(vcmdq); + if (ret) + goto deinit_lvcmdq; + + dev_dbg(cmdqv->dev, + "%sallocated\n", lvcmdq_error_header(vcmdq, header, 32)); + return vcmdq; + +deinit_lvcmdq: + tegra241_vintf_deinit_lvcmdq(vintf, lidx); +free_vcmdq: + kfree(vcmdq); + return ERR_PTR(ret); +} + +/* VINTF Resource Helpers */ + +static void tegra241_cmdqv_deinit_vintf(struct tegra241_cmdqv *cmdqv, u16 idx) +{ + kfree(cmdqv->vintfs[idx]->lvcmdqs); + ida_free(&cmdqv->vintf_ids, idx); + cmdqv->vintfs[idx] = NULL; +} + +static int 
tegra241_cmdqv_init_vintf(struct tegra241_cmdqv *cmdqv, u16 max_idx, + struct tegra241_vintf *vintf) +{ + + u16 idx; + int ret; + + ret = ida_alloc_max(&cmdqv->vintf_ids, max_idx, GFP_KERNEL); + if (ret < 0) + return ret; + idx = ret; + + vintf->idx = idx; + vintf->cmdqv = cmdqv; + vintf->base = cmdqv->base + TEGRA241_VINTF(idx); + + vintf->lvcmdqs = kcalloc(cmdqv->num_lvcmdqs_per_vintf, + sizeof(*vintf->lvcmdqs), GFP_KERNEL); + if (!vintf->lvcmdqs) { + ida_free(&cmdqv->vintf_ids, idx); + return -ENOMEM; + } + + cmdqv->vintfs[idx] = vintf; + return ret; +} + +/* Remove Helpers */ + +static void tegra241_vintf_remove_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) +{ + tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]); + tegra241_vintf_free_lvcmdq(vintf, lidx); +} + +static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx) +{ + struct tegra241_vintf *vintf = cmdqv->vintfs[idx]; + u16 lidx; + + /* Remove LVCMDQ resources */ + for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) + if (vintf->lvcmdqs[lidx]) + tegra241_vintf_remove_lvcmdq(vintf, lidx); + + /* Remove VINTF resources */ + tegra241_vintf_hw_deinit(vintf); + + dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx); + tegra241_cmdqv_deinit_vintf(cmdqv, idx); + kfree(vintf); +} + +static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + u16 idx; + + /* Remove VINTF resources */ + for (idx = 0; idx < cmdqv->num_vintfs; idx++) { + if (cmdqv->vintfs[idx]) { + /* Only vintf0 should remain at this stage */ + WARN_ON(idx > 0); + tegra241_cmdqv_remove_vintf(cmdqv, idx); + } + } + + /* Remove cmdqv resources */ + ida_destroy(&cmdqv->vintf_ids); + + if (cmdqv->irq > 0) + free_irq(cmdqv->irq, cmdqv); + iounmap(cmdqv->base); + kfree(cmdqv->vintfs); + put_device(cmdqv->dev); /* smmu->impl_dev */ +} + +static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = { + .get_secondary_cmdq = tegra241_cmdqv_get_cmdq, + .device_reset = tegra241_cmdqv_hw_reset, + .device_remove = tegra241_cmdqv_remove, +}; + +/* Probe Functions */ + +static int tegra241_cmdqv_acpi_is_memory(struct acpi_resource *res, void *data) +{ + struct resource_win win; + + return !acpi_dev_resource_address_space(res, &win); +} + +static int tegra241_cmdqv_acpi_get_irqs(struct acpi_resource *ares, void *data) +{ + struct resource r; + int *irq = data; + + if (*irq <= 0 && acpi_dev_resource_interrupt(ares, 0, &r)) + *irq = r.start; + return 1; /* No need to add resource to the list */ +} + +static struct resource * +tegra241_cmdqv_find_acpi_resource(struct device *dev, int *irq) +{ + struct acpi_device *adev = to_acpi_device(dev); + struct list_head resource_list; + struct resource_entry *rentry; + struct resource *res = NULL; + int ret; + + INIT_LIST_HEAD(&resource_list); + ret = acpi_dev_get_resources(adev, &resource_list, + tegra241_cmdqv_acpi_is_memory, NULL); + if (ret < 0) { + dev_err(dev, "failed to get memory resource: %d\n", ret); + return NULL; + } + + rentry = list_first_entry_or_null(&resource_list, + struct resource_entry, node); + if (!rentry) { + dev_err(dev, "failed to get memory resource entry\n"); + goto free_list; + } + + /* Caller must free the res */ + res = kzalloc(sizeof(*res), GFP_KERNEL); + if (!res) + goto free_list; + + *res = *rentry->res; + + acpi_dev_free_resource_list(&resource_list); + + INIT_LIST_HEAD(&resource_list); + + if (irq) + ret = acpi_dev_get_resources(adev, &resource_list, + tegra241_cmdqv_acpi_get_irqs, irq); + if 
(ret < 0 || !irq || *irq <= 0) + dev_warn(dev, "no interrupt. errors will not be reported\n"); + +free_list: + acpi_dev_free_resource_list(&resource_list); + return res; +} + +struct dentry *cmdqv_debugfs_dir; + +static struct arm_smmu_device * +__tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, + int irq) +{ + static struct arm_smmu_device *new_smmu; + struct tegra241_cmdqv *cmdqv = NULL; + struct tegra241_vintf *vintf; + void __iomem *base; + u32 regval; + int lidx; + int ret; + + static_assert(offsetof(struct tegra241_cmdqv, smmu) == 0); + + base = ioremap(res->start, resource_size(res)); + if (IS_ERR(base)) { + dev_err(smmu->dev, "failed to ioremap: %ld\n", PTR_ERR(base)); + goto iounmap; + } + + regval = readl(base + TEGRA241_CMDQV_CONFIG); + if (disable_cmdqv) { + dev_info(smmu->dev, "Detected disable_cmdqv=true\n"); + writel(regval & ~CMDQV_EN, base + TEGRA241_CMDQV_CONFIG); + goto iounmap; + } + + cmdqv = devm_krealloc(smmu->dev, smmu, sizeof(*cmdqv), GFP_KERNEL); + if (!cmdqv) + goto iounmap; + new_smmu = &cmdqv->smmu; + + cmdqv->irq = irq; + cmdqv->base = base; + cmdqv->dev = smmu->impl_dev; + + if (cmdqv->irq > 0) { + ret = request_irq(irq, tegra241_cmdqv_isr, 0, "tegra241-cmdqv", + cmdqv); + if (ret) { + dev_err(cmdqv->dev, "failed to request irq (%d): %d\n", + cmdqv->irq, ret); + goto iounmap; + } + } + + regval = readl_relaxed(REG_CMDQV(cmdqv, PARAM)); + cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval); + cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval); + cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs; + + cmdqv->vintfs = + kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL); + if (!cmdqv->vintfs) + goto free_irq; + + ida_init(&cmdqv->vintf_ids); + + vintf = kzalloc(sizeof(*vintf), GFP_KERNEL); + if (!vintf) + goto destroy_ids; + + /* Init VINTF0 for in-kernel use */ + ret = tegra241_cmdqv_init_vintf(cmdqv, 0, vintf); + if (ret) { + dev_err(cmdqv->dev, "failed to init vintf0: %d\n", ret); + goto free_vintf; + } + + /* Preallocate logical VCMDQs to VINTF0 */ + for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) { + struct tegra241_vcmdq *vcmdq; + + vcmdq = tegra241_vintf_alloc_lvcmdq(vintf, lidx); + if (IS_ERR(vcmdq)) + goto free_lvcmdq; + } + +#ifdef CONFIG_IOMMU_DEBUGFS + if (!cmdqv_debugfs_dir) { + cmdqv_debugfs_dir = + debugfs_create_dir("tegra241_cmdqv", iommu_debugfs_dir); + debugfs_create_bool("bypass_vcmdq", 0644, cmdqv_debugfs_dir, + &bypass_vcmdq); + } +#endif + + new_smmu->impl_ops = &tegra241_cmdqv_impl_ops; + + return new_smmu; + +free_lvcmdq: + for (lidx--; lidx >= 0; lidx--) + tegra241_vintf_free_lvcmdq(vintf, lidx); + tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx); +free_vintf: + kfree(vintf); +destroy_ids: + ida_destroy(&cmdqv->vintf_ids); + kfree(cmdqv->vintfs); +free_irq: + if (cmdqv->irq > 0) + free_irq(cmdqv->irq, cmdqv); +iounmap: + iounmap(base); + return NULL; +} + +struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu) +{ + struct arm_smmu_device *new_smmu; + struct resource *res = NULL; + int irq; + + if (!smmu->dev->of_node) + res = tegra241_cmdqv_find_acpi_resource(smmu->impl_dev, &irq); + if (!res) + goto out_fallback; + + new_smmu = __tegra241_cmdqv_probe(smmu, res, irq); + kfree(res); + + if (new_smmu) + return new_smmu; + +out_fallback: + dev_info(smmu->impl_dev, "Falling back to standard SMMU CMDQ\n"); + smmu->options &= ~ARM_SMMU_OPT_TEGRA241_CMDQV; + put_device(smmu->impl_dev); + return ERR_PTR(-ENODEV); +} From 
0ca9667668f5d39e32c9be562d31c0751e89d888 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Thu, 29 Aug 2024 15:34:38 -0700
Subject: [PATCH 236/352] iommu/arm-smmu-v3: Start a new batch if new command is not supported

The VCMDQ in the tegra241-cmdqv driver has a guest mode that supports
only a few invalidation commands. A batch is initialized with a cmdq, so
it has to confirm whether a new command is supported or not.

Add a supports_cmd function pointer to the cmdq structure, where the
vcmdq driver should hook a command scan function. Add an inline helper
too so it can be used by both sides. If a new command is not supported,
simply issue the existing batch and re-init it as a new batch.

Signed-off-by: Nicolin Chen
Link: https://lore.kernel.org/r/aafb24b881504f18c5d0c7c15f2134e40ad2c486.1724970714.git.nicolinc@nvidia.com
Signed-off-by: Will Deacon
(cherry picked from commit f59e854907128ec3d4a82b7fc4efe9be8da2e78e)
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 6 ++++--
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 7 +++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 196078c813be9..e3b33aaa3b22f 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -923,10 +923,12 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_batch *cmds,
 				    struct arm_smmu_cmdq_ent *cmd)
 {
+	bool unsupported_cmd = !arm_smmu_cmdq_supports_cmd(cmds->cmdq, cmd);
+	bool force_sync = (cmds->num == CMDQ_BATCH_ENTRIES - 1) &&
+			  (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC);
 	int index;
 
-	if (cmds->num == CMDQ_BATCH_ENTRIES - 1 &&
-	    (smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC)) {
+	if (force_sync || unsupported_cmd) {
 		arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds,
 					    cmds->num, true);
 		arm_smmu_cmdq_batch_init(smmu, cmds);
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index d820da90a09a4..d4e5a22408e3d 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -568,8 +568,15 @@ struct arm_smmu_cmdq {
 	atomic_long_t *valid_map;
 	atomic_t owner_prod;
 	atomic_t lock;
+	bool (*supports_cmd)(struct arm_smmu_cmdq_ent *ent);
 };
 
+static inline bool arm_smmu_cmdq_supports_cmd(struct arm_smmu_cmdq *cmdq,
+					      struct arm_smmu_cmdq_ent *ent)
+{
+	return cmdq->supports_cmd ? cmdq->supports_cmd(ent) : true;
+}
+
 struct arm_smmu_cmdq_batch {
 	u64 cmds[CMDQ_BATCH_ENTRIES * CMDQ_ENT_DWORDS];
 	struct arm_smmu_cmdq *cmdq;

From ac1d4be5c54f0b299981462a39230c8c4de01909 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Thu, 29 Aug 2024 15:34:39 -0700
Subject: [PATCH 237/352] iommu/tegra241-cmdqv: Limit CMDs for VCMDQs of a guest owned VINTF

When VCMDQs are assigned to a VINTF owned by a guest (HYP_OWN bit unset),
only TLB and ATC invalidation commands are supported by the VCMDQ HW. So,
implement the new cmdq->supports_cmd op to scan the input cmd and make
sure that it is supported by the selected queue.

Note that the guest VM shouldn't have the HYP_OWN bit set, regardless of
whether the guest kernel driver writes it or not; i.e. the hypervisor
running in the host OS should wire this bit to zero when trapping a write
access to this VINTF_CONFIG register from a guest kernel.
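
For illustration only, a minimal sketch of the kind of VMM-side trap
handler that last paragraph assumes; the vmm_vintf type, the hw_config
field and the function name are hypothetical and not part of this series:

	/* Hypothetical VMM-side handler for a trapped guest write to
	 * VINTF_CONFIG: clear HYP_OWN before forwarding the value, so a
	 * guest can never claim ownership of the VINTF. */
	static void vmm_emulate_vintf_config_write(struct vmm_vintf *vintf,
						   u32 guest_regval)
	{
		writel(guest_regval & ~VINTF_HYP_OWN, vintf->hw_config);
	}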
Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Link: https://lore.kernel.org/r/8160292337059b91271045800e5c62f7295e2c24.1724970714.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit a9d40285bdefef700ebc7551ef79d2f3e4559e73) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 28 ++++++++------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 +- .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 34 ++++++++++++++++++- 3 files changed, 51 insertions(+), 14 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e3b33aaa3b22f..2e1245b48fcb2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -346,12 +346,13 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) return 0; } -static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu) +static struct arm_smmu_cmdq *arm_smmu_get_cmdq(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) { struct arm_smmu_cmdq *cmdq = NULL; if (smmu->impl_ops && smmu->impl_ops->get_secondary_cmdq) - cmdq = smmu->impl_ops->get_secondary_cmdq(smmu); + cmdq = smmu->impl_ops->get_secondary_cmdq(smmu, ent); return cmdq ?: &smmu->cmdq; } @@ -897,7 +898,7 @@ static int __arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, } return arm_smmu_cmdq_issue_cmdlist( - smmu, arm_smmu_get_cmdq(smmu), cmd, 1, sync); + smmu, arm_smmu_get_cmdq(smmu, ent), cmd, 1, sync); } static int arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu, @@ -913,10 +914,11 @@ static int arm_smmu_cmdq_issue_cmd_with_sync(struct arm_smmu_device *smmu, } static void arm_smmu_cmdq_batch_init(struct arm_smmu_device *smmu, - struct arm_smmu_cmdq_batch *cmds) + struct arm_smmu_cmdq_batch *cmds, + struct arm_smmu_cmdq_ent *ent) { cmds->num = 0; - cmds->cmdq = arm_smmu_get_cmdq(smmu); + cmds->cmdq = arm_smmu_get_cmdq(smmu, ent); } static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, @@ -931,13 +933,13 @@ static void arm_smmu_cmdq_batch_add(struct arm_smmu_device *smmu, if (force_sync || unsupported_cmd) { arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds, cmds->num, true); - arm_smmu_cmdq_batch_init(smmu, cmds); + arm_smmu_cmdq_batch_init(smmu, cmds, cmd); } if (cmds->num == CMDQ_BATCH_ENTRIES) { arm_smmu_cmdq_issue_cmdlist(smmu, cmds->cmdq, cmds->cmds, cmds->num, false); - arm_smmu_cmdq_batch_init(smmu, cmds); + arm_smmu_cmdq_batch_init(smmu, cmds, cmd); } index = cmds->num * CMDQ_ENT_DWORDS; @@ -1205,7 +1207,7 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master, }, }; - arm_smmu_cmdq_batch_init(smmu, &cmds); + arm_smmu_cmdq_batch_init(smmu, &cmds, &cmd); for (i = 0; i < master->num_streams; i++) { cmd.cfgi.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(smmu, &cmds, &cmd); @@ -2056,7 +2058,7 @@ static int arm_smmu_atc_inv_master(struct arm_smmu_master *master, arm_smmu_atc_inv_to_cmd(ssid, 0, 0, &cmd); - arm_smmu_cmdq_batch_init(master->smmu, &cmds); + arm_smmu_cmdq_batch_init(master->smmu, &cmds, &cmd); for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; arm_smmu_cmdq_batch_add(master->smmu, &cmds, &cmd); @@ -2071,7 +2073,9 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, struct arm_smmu_master_domain *master_domain; int i; unsigned long flags; - struct arm_smmu_cmdq_ent cmd; + struct arm_smmu_cmdq_ent cmd = { + .opcode = 
CMDQ_OP_ATC_INV, + }; struct arm_smmu_cmdq_batch cmds; if (!(smmu_domain->smmu->features & ARM_SMMU_FEAT_ATS)) @@ -2094,7 +2098,7 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, if (!atomic_read(&smmu_domain->nr_ats_masters)) return 0; - arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds); + arm_smmu_cmdq_batch_init(smmu_domain->smmu, &cmds, &cmd); spin_lock_irqsave(&smmu_domain->devices_lock, flags); list_for_each_entry(master_domain, &smmu_domain->devices, @@ -2176,7 +2180,7 @@ static void __arm_smmu_tlb_inv_range(struct arm_smmu_cmdq_ent *cmd, num_pages++; } - arm_smmu_cmdq_batch_init(smmu, &cmds); + arm_smmu_cmdq_batch_init(smmu, &cmds, cmd); while (iova < end) { if (smmu->features & ARM_SMMU_FEAT_RANGE_INV) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index d4e5a22408e3d..003330b239468 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -642,7 +642,8 @@ struct arm_smmu_strtab_cfg { struct arm_smmu_impl_ops { int (*device_reset)(struct arm_smmu_device *smmu); void (*device_remove)(struct arm_smmu_device *smmu); - struct arm_smmu_cmdq *(*get_secondary_cmdq)(struct arm_smmu_device *smmu); + struct arm_smmu_cmdq *(*get_secondary_cmdq)( + struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent); }; /* An SMMUv3 instance */ diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 5ac3032ee6dd2..9eb9d959f3e5d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -142,6 +142,7 @@ struct tegra241_vcmdq { * struct tegra241_vintf - Virtual Interface * @idx: Global index in the CMDQV * @enabled: Enable status + * @hyp_own: Owned by hypervisor (in-kernel) * @cmdqv: Parent CMDQV pointer * @lvcmdqs: List of logical VCMDQ pointers * @base: MMIO base address @@ -150,6 +151,7 @@ struct tegra241_vintf { u16 idx; bool enabled; + bool hyp_own; struct tegra241_cmdqv *cmdqv; struct tegra241_vcmdq **lvcmdqs; @@ -301,8 +303,21 @@ static irqreturn_t tegra241_cmdqv_isr(int irq, void *devid) /* Command Queue Function */ +static bool tegra241_guest_vcmdq_supports_cmd(struct arm_smmu_cmdq_ent *ent) +{ + switch (ent->opcode) { + case CMDQ_OP_TLBI_NH_ASID: + case CMDQ_OP_TLBI_NH_VA: + case CMDQ_OP_ATC_INV: + return true; + default: + return false; + } +} + static struct arm_smmu_cmdq * -tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu) +tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu, + struct arm_smmu_cmdq_ent *ent) { struct tegra241_cmdqv *cmdqv = container_of(smmu, struct tegra241_cmdqv, smmu); @@ -328,6 +343,10 @@ tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu) vcmdq = vintf->lvcmdqs[lidx]; if (!vcmdq || !READ_ONCE(vcmdq->enabled)) return NULL; + + /* Unsupported CMD goes for smmu->cmdq pathway */ + if (!arm_smmu_cmdq_supports_cmd(&vcmdq->cmdq, ent)) + return NULL; return &vcmdq->cmdq; } @@ -406,12 +425,22 @@ static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) tegra241_vintf_hw_deinit(vintf); /* Configure and enable VINTF */ + /* + * Note that HYP_OWN bit is wired to zero when running in guest kernel, + * whether enabling it here or not, as !HYP_OWN cmdq HWs only support a + * restricted set of supported commands. 
+	 */
 	regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own);
 	writel(regval, REG_VINTF(vintf, CONFIG));
 
 	ret = vintf_write_config(vintf, regval | VINTF_EN);
 	if (ret)
 		return ret;
+	/*
+	 * As mentioned above, the HYP_OWN bit is wired to zero for a guest
+	 * kernel, so read it back from the HW to ensure hyp_own reflects it.
+	 */
+	vintf->hyp_own = !!(VINTF_HYP_OWN & readl(REG_VINTF(vintf, CONFIG)));
 
 	for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) {
 		if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) {
@@ -493,6 +522,9 @@ static int tegra241_vcmdq_alloc_smmu_cmdq(struct tegra241_vcmdq *vcmdq)
 	q->q_base = q->base_dma & VCMDQ_ADDR;
 	q->q_base |= FIELD_PREP(VCMDQ_LOG2SIZE, q->llq.max_n_shift);
 
+	if (!vcmdq->vintf->hyp_own)
+		cmdq->supports_cmd = tegra241_guest_vcmdq_supports_cmd;
+
 	return arm_smmu_cmdq_init(smmu, cmdq);
 }

From 4c912045946ad0afa5a9b6883bd29096b5bc70b6 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Sun, 1 Sep 2024 22:57:45 -0700
Subject: [PATCH 238/352] iommu/tegra241-cmdqv: Fix -Wformat-truncation warnings in lvcmdq_error_header
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Kernel test robot reported a few truncation warnings at the snprintf:

drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c: In function ‘tegra241_vintf_free_lvcmdq’:
drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c:239:56: warning: ‘%u’ directive output may be truncated writing between 1 and 5 bytes into a region of size between 3 and 11 [-Wformat-truncation=]
  239 |         snprintf(header, hlen, "VINTF%u: VCMDQ%u/LVCMDQ%u: ",
      |                                                        ^~
drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c:239:32: note: directive argument in the range [0, 65535]
  239 |         snprintf(header, hlen, "VINTF%u: VCMDQ%u/LVCMDQ%u: ",
      |                                ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c:239:9: note: ‘snprintf’ output between 25 and 37 bytes into a destination of size 32
  239 |         snprintf(header, hlen, "VINTF%u: VCMDQ%u/LVCMDQ%u: ",
      |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  240 |                  vcmdq->vintf->idx, vcmdq->idx, vcmdq->lidx);

Fix by bumping up the size of the header to hold more characters.

Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV")
Reported-by: kernel test robot
Closes: https://lore.kernel.org/oe-kbuild-all/202409020406.7ed5uojF-lkp@intel.com/
Signed-off-by: Nicolin Chen
Link: https://lore.kernel.org/r/20240902055745.629456-1-nicolinc@nvidia.com
Signed-off-by: Will Deacon
(cherry picked from commit db184a1ced56dde6bbf8cc4d9b936c9f6a510e28)
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
index 9eb9d959f3e5d..03fd13c21dcc3 100644
--- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
+++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c
@@ -233,7 +233,7 @@ static inline int vintf_write_config(struct tegra241_vintf *vintf, u32 regval)
 static inline char *lvcmdq_error_header(struct tegra241_vcmdq *vcmdq,
 					char *header, int hlen)
 {
-	WARN_ON(hlen < 32);
+	WARN_ON(hlen < 64);
 	if (WARN_ON(!vcmdq->vintf))
 		return "";
 	snprintf(header, hlen, "VINTF%u: VCMDQ%u/LVCMDQ%u: ",
@@ -243,7 +243,7 @@ static inline char *lvcmdq_error_header(struct tegra241_vcmdq *vcmdq,
 
 static inline int vcmdq_write_config(struct tegra241_vcmdq *vcmdq, u32 regval)
 {
-	char header[32], *h = lvcmdq_error_header(vcmdq, header, 32);
+	char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
 
 	return tegra241_cmdqv_write_config(vcmdq->cmdqv,
 					   REG_VCMDQ_PAGE0(vcmdq, CONFIG),
@@ -354,7 +354,7 @@ tegra241_cmdqv_get_cmdq(struct arm_smmu_device *smmu,
 
 static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
 {
-	char header[32], *h = lvcmdq_error_header(vcmdq, header, 32);
+	char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
 	u32 gerrorn, gerror;
 
 	if (vcmdq_write_config(vcmdq, 0)) {
@@ -382,7 +382,7 @@ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq)
 
 static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq)
 {
-	char header[32], *h = lvcmdq_error_header(vcmdq, header, 32);
+	char header[64], *h = lvcmdq_error_header(vcmdq, header, 64);
 	int ret;
 
 	/* Reset VCMDQ */
@@ -555,13 +555,13 @@ static int tegra241_vintf_init_lvcmdq(struct tegra241_vintf *vintf, u16 lidx,
 static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
 {
 	struct tegra241_vcmdq *vcmdq = vintf->lvcmdqs[lidx];
-	char header[32];
+	char header[64];
 
 	tegra241_vcmdq_free_smmu_cmdq(vcmdq);
 	tegra241_vintf_deinit_lvcmdq(vintf, lidx);
 
 	dev_dbg(vintf->cmdqv->dev,
-		"%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 32));
+		"%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64));
 	kfree(vcmdq);
 }
 
@@ -570,7 +570,7 @@ tegra241_vintf_alloc_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
 {
 	struct tegra241_cmdqv *cmdqv = vintf->cmdqv;
 	struct tegra241_vcmdq *vcmdq;
-	char header[32];
+	char header[64];
 	int ret;
 
 	vcmdq = kzalloc(sizeof(*vcmdq), GFP_KERNEL);
@@ -587,7 +587,7 @@ tegra241_vintf_alloc_lvcmdq(struct tegra241_vintf *vintf, u16 lidx)
 		goto deinit_lvcmdq;
 
 	dev_dbg(cmdqv->dev,
-		"%sallocated\n", lvcmdq_error_header(vcmdq, header, 32));
+		"%sallocated\n", lvcmdq_error_header(vcmdq, header, 64));
 	return vcmdq;
 
 deinit_lvcmdq:

From 338dec30c4e2407bec78338662291c3c7ffa9395 Mon Sep 17 00:00:00 2001
From: Dan Carpenter
Date: Wed, 4 Sep 2024 11:02:43 +0300
Subject: [PATCH 239/352] iommu/tegra241-cmdqv: Fix ioremap() error handling in probe()

The ioremap() function doesn't return error pointers; it returns NULL
on error, so update the error handling. Also just return directly
instead of calling iounmap() on the NULL pointer. Calling iounmap(NULL)
doesn't cause a problem on ARM, but on other architectures it can
trigger a warning, so it's a bad habit.
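
For context, a brief sketch of the two kernel error-return conventions
this fix distinguishes; some_errptr_api() is a hypothetical stand-in for
any ERR_PTR()-returning function:

	void __iomem *base = ioremap(start, size);	/* NULL on failure */
	if (!base)
		return -ENOMEM;

	void *obj = some_errptr_api();		/* ERR_PTR() on failure */
	if (IS_ERR(obj))
		return PTR_ERR(obj);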
Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV") Signed-off-by: Dan Carpenter Reviewed-by: Nicolin Chen Link: https://lore.kernel.org/r/5a6c1e9a-0724-41b1-86d4-36335d3768ea@stanley.mountain Signed-off-by: Will Deacon (cherry picked from commit 086a3c40ebd02a4ac38121cf909326407b2883bc) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 03fd13c21dcc3..240b541921771 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -772,9 +772,9 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, static_assert(offsetof(struct tegra241_cmdqv, smmu) == 0); base = ioremap(res->start, resource_size(res)); - if (IS_ERR(base)) { - dev_err(smmu->dev, "failed to ioremap: %ld\n", PTR_ERR(base)); - goto iounmap; + if (!base) { + dev_err(smmu->dev, "failed to ioremap\n"); + return NULL; } regval = readl(base + TEGRA241_CMDQV_CONFIG); From bdec68f5bcfeba15d4d79d20e72902f205477b4f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 4 Sep 2024 19:40:42 -0700 Subject: [PATCH 240/352] iommu/tegra241-cmdqv: Drop static at local variable This is likely a typo. Drop it. Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV") Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/13fd3accb5b7ed6ec11cc6b7435f79f84af9f45f.1725503154.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 2408b81f817ba6c278c5453eb9b43a167f35d471) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 240b541921771..0766dc2789cb3 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -761,8 +761,8 @@ static struct arm_smmu_device * __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, int irq) { - static struct arm_smmu_device *new_smmu; struct tegra241_cmdqv *cmdqv = NULL; + struct arm_smmu_device *new_smmu; struct tegra241_vintf *vintf; void __iomem *base; u32 regval; From 4ea37cdcb3d636633dfd15483385ffaca2bdb9c2 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 4 Sep 2024 19:40:43 -0700 Subject: [PATCH 241/352] iommu/tegra241-cmdqv: Do not allocate vcmdq until dma_set_mask_and_coherent It's observed that, when the first 4GB of system memory was reserved, all VCMDQ allocations failed (even with the smallest qsz in the last attempt): arm-smmu-v3: found companion CMDQV device: NVDA200C:00 arm-smmu-v3: option mask 0x10 arm-smmu-v3: failed to allocate queue (0x8000 bytes) for vcmdq0 acpi NVDA200C:00: tegra241_cmdqv: Falling back to standard SMMU CMDQ arm-smmu-v3: ias 48-bit, oas 48-bit (features 0x001e1fbf) arm-smmu-v3: allocated 524288 entries for cmdq arm-smmu-v3: allocated 524288 entries for evtq arm-smmu-v3: allocated 524288 entries for priq This is because the 4GB reserved memory shifted the entire DMA zone from a lower 32-bit range (on a system without the 4GB carveout) to higher range, while the dev->coherent_dma_mask was set to DMA_BIT_MASK(32) by default. The dma_set_mask_and_coherent() call is done in arm_smmu_device_hw_probe() of the SMMU driver. So any DMA allocation from tegra241_cmdqv_probe() must wait until the coherent_dma_mask is correctly set. Move the vintf/vcmdq structure initialization routine into a different op, "init_structures". Call it at the end of arm_smmu_init_structures(), where standard SMMU queues get allocated. Most of the impl_ops aren't ready until vintf/vcmdq structure are init-ed. So replace the full impl_ops with an init_ops in __tegra241_cmdqv_probe(). And switch to tegra241_cmdqv_impl_ops later in arm_smmu_init_structures(). Note that tegra241_cmdqv_impl_ops does not link to the new init_structures op after this switch, since there is no point in having it once it's done. Fixes: 918eb5c856f6 ("iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV") Reported-by: Matt Ochs Signed-off-by: Nicolin Chen Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/530993c3aafa1b0fc3d879b8119e13c629d12e2b.1725503154.git.nicolinc@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 483e0bd8883a40fd3dd3193997a4014337698d72) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 9 +- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 + .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 83 ++++++++++++------- 3 files changed, 60 insertions(+), 33 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 2e1245b48fcb2..6221b483574a1 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3746,7 +3746,14 @@ static int arm_smmu_init_structures(struct arm_smmu_device *smmu) if (ret) return ret; - return arm_smmu_init_strtab(smmu); + ret = arm_smmu_init_strtab(smmu); + if (ret) + return ret; + + if (smmu->impl_ops && smmu->impl_ops->init_structures) + return smmu->impl_ops->init_structures(smmu); + + return 0; } static int arm_smmu_write_reg_sync(struct arm_smmu_device *smmu, u32 val, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 003330b239468..6fd75028d1443 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -642,6 +642,7 @@ struct arm_smmu_strtab_cfg { struct arm_smmu_impl_ops { int (*device_reset)(struct arm_smmu_device *smmu); void (*device_remove)(struct arm_smmu_device *smmu); + int (*init_structures)(struct arm_smmu_device *smmu); struct arm_smmu_cmdq *(*get_secondary_cmdq)( struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent); }; diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index 0766dc2789cb3..fcd13d301fff6 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -755,18 +755,65 @@ tegra241_cmdqv_find_acpi_resource(struct device *dev, int *irq) return res; } +static int tegra241_cmdqv_init_structures(struct arm_smmu_device *smmu) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + struct tegra241_vintf *vintf; + int lidx; + int ret; + + vintf = kzalloc(sizeof(*vintf), GFP_KERNEL); + if (!vintf) + goto out_fallback; + + /* Init VINTF0 for in-kernel use */ + ret = tegra241_cmdqv_init_vintf(cmdqv, 0, vintf); + if (ret) { + dev_err(cmdqv->dev, "failed to init vintf0: %d\n", ret); + goto free_vintf; + } + + /* Preallocate logical VCMDQs to VINTF0 */ + for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) { + struct tegra241_vcmdq *vcmdq; + + vcmdq = tegra241_vintf_alloc_lvcmdq(vintf, lidx); + if (IS_ERR(vcmdq)) + goto free_lvcmdq; + } + + /* Now, we are ready to run all the impl ops */ + smmu->impl_ops = &tegra241_cmdqv_impl_ops; + return 0; + +free_lvcmdq: + for (lidx--; lidx >= 0; lidx--) + tegra241_vintf_free_lvcmdq(vintf, lidx); + tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx); +free_vintf: + kfree(vintf); +out_fallback: + dev_info(smmu->impl_dev, "Falling back to standard SMMU CMDQ\n"); + smmu->options &= ~ARM_SMMU_OPT_TEGRA241_CMDQV; + tegra241_cmdqv_remove(smmu); + return 0; +} + struct dentry *cmdqv_debugfs_dir; static struct arm_smmu_device * __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, int irq) { + static const struct arm_smmu_impl_ops init_ops = { + .init_structures = tegra241_cmdqv_init_structures, + .device_remove = tegra241_cmdqv_remove, + }; struct tegra241_cmdqv *cmdqv = NULL; struct arm_smmu_device *new_smmu; - struct tegra241_vintf *vintf; void __iomem *base; u32 regval; - int lidx; int ret; static_assert(offsetof(struct tegra241_cmdqv, smmu) == 0); @@ -815,26 +862,6 
@@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, ida_init(&cmdqv->vintf_ids); - vintf = kzalloc(sizeof(*vintf), GFP_KERNEL); - if (!vintf) - goto destroy_ids; - - /* Init VINTF0 for in-kernel use */ - ret = tegra241_cmdqv_init_vintf(cmdqv, 0, vintf); - if (ret) { - dev_err(cmdqv->dev, "failed to init vintf0: %d\n", ret); - goto free_vintf; - } - - /* Preallocate logical VCMDQs to VINTF0 */ - for (lidx = 0; lidx < cmdqv->num_lvcmdqs_per_vintf; lidx++) { - struct tegra241_vcmdq *vcmdq; - - vcmdq = tegra241_vintf_alloc_lvcmdq(vintf, lidx); - if (IS_ERR(vcmdq)) - goto free_lvcmdq; - } - #ifdef CONFIG_IOMMU_DEBUGFS if (!cmdqv_debugfs_dir) { cmdqv_debugfs_dir = @@ -844,19 +871,11 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, } #endif - new_smmu->impl_ops = &tegra241_cmdqv_impl_ops; + /* Provide init-level ops only, until tegra241_cmdqv_init_structures */ + new_smmu->impl_ops = &init_ops; return new_smmu; -free_lvcmdq: - for (lidx--; lidx >= 0; lidx--) - tegra241_vintf_free_lvcmdq(vintf, lidx); - tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx); -free_vintf: - kfree(vintf); -destroy_ids: - ida_destroy(&cmdqv->vintf_ids); - kfree(cmdqv->vintfs); free_irq: if (cmdqv->irq > 0) free_irq(cmdqv->irq, cmdqv); From 94c19774931342230ec11f9d25fb7ae54f5b229b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 6 Aug 2024 20:31:15 -0300 Subject: [PATCH 242/352] iommu/arm-smmu-v3: Use the new rb tree helpers Since v5.12 the rbtree has gained some simplifying helpers aimed at making rb tree users write less convoluted boiler plate code. Instead the caller provides a single comparison function and the helpers generate the prior open-coded stuff. Update smmu->streams to use rb_find_add() and rb_find(). Tested-by: Nicolin Chen Reviewed-by: Mostafa Saleh Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v3-9fef8cdc2ff6+150d1-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit a2bb820e862d61f9ca1499e500915f9f505a2655) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 68 ++++++++++----------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6221b483574a1..3a1d93ad9cc5f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1733,26 +1733,37 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return 0; } +static int arm_smmu_streams_cmp_key(const void *lhs, const struct rb_node *rhs) +{ + struct arm_smmu_stream *stream_rhs = + rb_entry(rhs, struct arm_smmu_stream, node); + const u32 *sid_lhs = lhs; + + if (*sid_lhs < stream_rhs->id) + return -1; + if (*sid_lhs > stream_rhs->id) + return 1; + return 0; +} + +static int arm_smmu_streams_cmp_node(struct rb_node *lhs, + const struct rb_node *rhs) +{ + return arm_smmu_streams_cmp_key( + &rb_entry(lhs, struct arm_smmu_stream, node)->id, rhs); +} + static struct arm_smmu_master * arm_smmu_find_master(struct arm_smmu_device *smmu, u32 sid) { struct rb_node *node; - struct arm_smmu_stream *stream; lockdep_assert_held(&smmu->streams_mutex); - node = smmu->streams.rb_node; - while (node) { - stream = rb_entry(node, struct arm_smmu_stream, node); - if (stream->id < sid) - node = node->rb_right; - else if (stream->id > sid) - node = node->rb_left; - else - return stream->master; - } - - return NULL; + node = rb_find(&sid, &smmu->streams, arm_smmu_streams_cmp_key); + if (!node) + return NULL; + return rb_entry(node, struct arm_smmu_stream, node)->master; } /* IRQ and event handlers */ @@ -3212,8 +3223,6 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu, { int i; int ret = 0; - struct arm_smmu_stream *new_stream, *cur_stream; - struct rb_node **new_node, *parent_node = NULL; struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(master->dev); master->streams = kcalloc(fwspec->num_ids, sizeof(*master->streams), @@ -3224,9 +3233,9 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu, mutex_lock(&smmu->streams_mutex); for (i = 0; i < fwspec->num_ids; i++) { + struct arm_smmu_stream *new_stream = &master->streams[i]; u32 sid = fwspec->ids[i]; - new_stream = &master->streams[i]; new_stream->id = sid; new_stream->master = master; @@ -3235,28 +3244,13 @@ static int arm_smmu_insert_master(struct arm_smmu_device *smmu, break; /* Insert into SID tree */ - new_node = &(smmu->streams.rb_node); - while (*new_node) { - cur_stream = rb_entry(*new_node, struct arm_smmu_stream, - node); - parent_node = *new_node; - if (cur_stream->id > new_stream->id) { - new_node = &((*new_node)->rb_left); - } else if (cur_stream->id < new_stream->id) { - new_node = &((*new_node)->rb_right); - } else { - dev_warn(master->dev, - "stream %u already in tree\n", - cur_stream->id); - ret = -EINVAL; - break; - } - } - if (ret) + if (rb_find_add(&new_stream->node, &smmu->streams, + arm_smmu_streams_cmp_node)) { + dev_warn(master->dev, "stream %u already in tree\n", + sid); + ret = -EINVAL; break; - - rb_link_node(&new_stream->node, parent_node, new_node); - rb_insert_color(&new_stream->node, &smmu->streams); + } } if (ret) { From cb63a15fbe318424a88bebe0c4c4a50328413969 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 6 Sep 2024 12:47:48 -0300 Subject: [PATCH 243/352] iommu/arm-smmu-v3: Add arm_smmu_strtab_l1/2_idx() Don't open code the calculations of the indexes for each level, provide two functions to do that math and call them in all the places. 
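As a worked example of that math: with STRTAB_SPLIT = 8, SID 0x1234 decomposes into L1 index 0x12 (sid >> STRTAB_SPLIT) and L2 index 0x34 (sid & (STRTAB_NUM_L2_STES - 1)).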
Update all the places computing indexes. Calculate the L1 table size directly based on the max required index from the cap. Remove STRTAB_L1_SZ_SHIFT in favour of STRTAB_NUM_L2_STES. Use STRTAB_NUM_L2_STES to replace remaining open coded 1 << STRTAB_SPLIT. Tested-by: Nicolin Chen Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/1-v4-6416877274e1+1af-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit ce410410f1a7db0259ca9282a285fb80fd553b8c) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 45 +++++++++------------ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 14 ++++++- 2 files changed, 32 insertions(+), 27 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 3a1d93ad9cc5f..9af8ce1926f7f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1708,17 +1708,15 @@ static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) { size_t size; - void *strtab; dma_addr_t l2ptr_dma; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; - struct arm_smmu_strtab_l1_desc *desc = &cfg->l1_desc[sid >> STRTAB_SPLIT]; + struct arm_smmu_strtab_l1_desc *desc; + desc = &cfg->l1_desc[arm_smmu_strtab_l1_idx(sid)]; if (desc->l2ptr) return 0; - size = 1 << (STRTAB_SPLIT + ilog2(STRTAB_STE_DWORDS) + 3); - strtab = &cfg->strtab[(sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS]; - + size = STRTAB_NUM_L2_STES * sizeof(struct arm_smmu_ste); desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &l2ptr_dma, GFP_KERNEL); if (!desc->l2ptr) { @@ -1728,8 +1726,9 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return -ENOMEM; } - arm_smmu_init_initial_stes(desc->l2ptr, 1 << STRTAB_SPLIT); - arm_smmu_write_strtab_l1_desc(strtab, l2ptr_dma); + arm_smmu_init_initial_stes(desc->l2ptr, STRTAB_NUM_L2_STES); + arm_smmu_write_strtab_l1_desc(&cfg->strtab[arm_smmu_strtab_l1_idx(sid)], + l2ptr_dma); return 0; } @@ -2488,12 +2487,9 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { - unsigned int idx1, idx2; - /* Two-level walk */ - idx1 = (sid >> STRTAB_SPLIT) * STRTAB_L1_DESC_DWORDS; - idx2 = sid & ((1 << STRTAB_SPLIT) - 1); - return &cfg->l1_desc[idx1].l2ptr[idx2]; + return &cfg->l1_desc[arm_smmu_strtab_l1_idx(sid)] + .l2ptr[arm_smmu_strtab_l2_idx(sid)]; } else { /* Simple linear lookup */ return (struct arm_smmu_ste *)&cfg @@ -3197,12 +3193,9 @@ struct arm_smmu_device *arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid) { - unsigned long limit = smmu->strtab_cfg.num_l1_ents; - if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) - limit *= 1UL << STRTAB_SPLIT; - - return sid < limit; + return arm_smmu_strtab_l1_idx(sid) < smmu->strtab_cfg.num_l1_ents; + return sid < smmu->strtab_cfg.num_l1_ents; } static int arm_smmu_init_sid_strtab(struct arm_smmu_device *smmu, u32 sid) @@ -3639,19 +3632,18 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) { void *strtab; u64 reg; - u32 size, l1size; + u32 l1size; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; + unsigned int last_sid_idx = + arm_smmu_strtab_l1_idx((1 << smmu->sid_bits) - 1); 
 /* Calculate the L1 size, capped to the SIDSIZE. */
-	size = STRTAB_L1_SZ_SHIFT - (ilog2(STRTAB_L1_DESC_DWORDS) + 3);
-	size = min(size, smmu->sid_bits - STRTAB_SPLIT);
-	cfg->num_l1_ents = 1 << size;
-
-	size += STRTAB_SPLIT;
-	if (size < smmu->sid_bits)
+	cfg->num_l1_ents = min(last_sid_idx + 1, STRTAB_MAX_L1_ENTRIES);
+	if (cfg->num_l1_ents <= last_sid_idx)
 		dev_warn(smmu->dev,
 			 "2-level strtab only covers %u/%u bits of SID\n",
-			 size, smmu->sid_bits);
+			 ilog2(cfg->num_l1_ents * STRTAB_NUM_L2_STES),
+			 smmu->sid_bits);
 
 	l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3);
 	strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma,
@@ -3666,7 +3658,8 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu)
 
 	/* Configure strtab_base_cfg for 2 levels */
 	reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_2LVL);
-	reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, size);
+	reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE,
+			  ilog2(cfg->num_l1_ents) + STRTAB_SPLIT);
 	reg |= FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT);
 	cfg->strtab_base_cfg = reg;
 
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 6fd75028d1443..92c92021dd52a 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -204,7 +204,6 @@ struct arm_smmu_device;
  * 2lvl: 128k L1 entries,
  *       256 lazy entries per table (each table covers a PCI bus)
  */
-#define STRTAB_L1_SZ_SHIFT		20
 #define STRTAB_SPLIT			8
 
 #define STRTAB_L1_DESC_DWORDS		1
@@ -217,6 +216,19 @@ struct arm_smmu_ste {
 	__le64 data[STRTAB_STE_DWORDS];
 };
 
+#define STRTAB_NUM_L2_STES		(1 << STRTAB_SPLIT)
+#define STRTAB_MAX_L1_ENTRIES		(1 << 17)
+
+static inline u32 arm_smmu_strtab_l1_idx(u32 sid)
+{
+	return sid / STRTAB_NUM_L2_STES;
+}
+
+static inline u32 arm_smmu_strtab_l2_idx(u32 sid)
+{
+	return sid % STRTAB_NUM_L2_STES;
+}
+
 #define STRTAB_STE_0_V			(1UL << 0)
 #define STRTAB_STE_0_CFG		GENMASK_ULL(3, 1)
 #define STRTAB_STE_0_CFG_ABORT		0

From 2fb0436cb3720065e08b46f54daed806264409a1 Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Fri, 6 Sep 2024 12:47:49 -0300
Subject: [PATCH 244/352] iommu/arm-smmu-v3: Add types for each level of the 2 level stream table

Add types struct arm_smmu_strtab_l1 and l2 to represent the HW layout of
the descriptors, and use them in most places; following patches will get
the remaining places. The size of the l1 and l2 HW allocations is
sizeof(struct arm_smmu_strtab_l1/2).

This provides some more clarity than having raw __le64 *'s and sizes
computed via macros.

Remove STRTAB_L1_DESC_DWORDS.

Tested-by: Nicolin Chen
Reviewed-by: Nicolin Chen
Signed-off-by: Jason Gunthorpe
Link: https://lore.kernel.org/r/2-v4-6416877274e1+1af-smmuv3_tidy_jgg@nvidia.com
Signed-off-by: Will Deacon
(cherry picked from commit abb4f9d323a8d53870cc842d3c5024f71c2d4951)
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 21 +++++++++++---------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 10 ++++++++-- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 9af8ce1926f7f..8e8f1d25c6b61 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1495,7 +1495,8 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) } /* Stream table manipulation functions */ -static void arm_smmu_write_strtab_l1_desc(__le64 *dst, dma_addr_t l2ptr_dma) +static void arm_smmu_write_strtab_l1_desc(struct arm_smmu_strtab_l1 *dst, + dma_addr_t l2ptr_dma) { u64 val = 0; @@ -1503,7 +1504,7 @@ static void arm_smmu_write_strtab_l1_desc(__le64 *dst, dma_addr_t l2ptr_dma) val |= l2ptr_dma & STRTAB_L1_DESC_L2PTR_MASK; /* The HW has 64 bit atomicity with stores to the L2 STE table */ - WRITE_ONCE(*dst, cpu_to_le64(val)); + WRITE_ONCE(dst->l2ptr, cpu_to_le64(val)); } struct arm_smmu_ste_writer { @@ -1707,18 +1708,17 @@ static void arm_smmu_init_initial_stes(struct arm_smmu_ste *strtab, static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) { - size_t size; dma_addr_t l2ptr_dma; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; struct arm_smmu_strtab_l1_desc *desc; + __le64 *dst; desc = &cfg->l1_desc[arm_smmu_strtab_l1_idx(sid)]; if (desc->l2ptr) return 0; - size = STRTAB_NUM_L2_STES * sizeof(struct arm_smmu_ste); - desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, &l2ptr_dma, - GFP_KERNEL); + desc->l2ptr = dmam_alloc_coherent(smmu->dev, sizeof(*desc->l2ptr), + &l2ptr_dma, GFP_KERNEL); if (!desc->l2ptr) { dev_err(smmu->dev, "failed to allocate l2 stream table for SID %u\n", @@ -1726,8 +1726,9 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) return -ENOMEM; } - arm_smmu_init_initial_stes(desc->l2ptr, STRTAB_NUM_L2_STES); - arm_smmu_write_strtab_l1_desc(&cfg->strtab[arm_smmu_strtab_l1_idx(sid)], + arm_smmu_init_initial_stes(desc->l2ptr->stes, STRTAB_NUM_L2_STES); + dst = &cfg->strtab[arm_smmu_strtab_l1_idx(sid)]; + arm_smmu_write_strtab_l1_desc((struct arm_smmu_strtab_l1 *)dst, l2ptr_dma); return 0; } @@ -2489,7 +2490,7 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { /* Two-level walk */ return &cfg->l1_desc[arm_smmu_strtab_l1_idx(sid)] - .l2ptr[arm_smmu_strtab_l2_idx(sid)]; + .l2ptr->stes[arm_smmu_strtab_l2_idx(sid)]; } else { /* Simple linear lookup */ return (struct arm_smmu_ste *)&cfg @@ -3645,7 +3646,7 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) ilog2(cfg->num_l1_ents * STRTAB_NUM_L2_STES), smmu->sid_bits); - l1size = cfg->num_l1_ents * (STRTAB_L1_DESC_DWORDS << 3); + l1size = cfg->num_l1_ents * sizeof(struct arm_smmu_strtab_l1); strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma, GFP_KERNEL); if (!strtab) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 92c92021dd52a..664145fbf92ed 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -206,7 +206,6 @@ struct arm_smmu_device; */ #define STRTAB_SPLIT 8 -#define STRTAB_L1_DESC_DWORDS 1 #define STRTAB_L1_DESC_SPAN GENMASK_ULL(4, 0) #define STRTAB_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 6) @@ -217,6 +216,13 @@ struct arm_smmu_ste { }; #define STRTAB_NUM_L2_STES (1 << STRTAB_SPLIT) +struct 
arm_smmu_strtab_l2 { + struct arm_smmu_ste stes[STRTAB_NUM_L2_STES]; +}; + +struct arm_smmu_strtab_l1 { + __le64 l2ptr; +}; #define STRTAB_MAX_L1_ENTRIES (1 << 17) static inline u32 arm_smmu_strtab_l1_idx(u32 sid) @@ -607,7 +613,7 @@ struct arm_smmu_priq { /* High-level stream table and context descriptor structures */ struct arm_smmu_strtab_l1_desc { - struct arm_smmu_ste *l2ptr; + struct arm_smmu_strtab_l2 *l2ptr; }; struct arm_smmu_ctx_desc { From 6d17daa98553f41dbe39bd0fb66b20b4b93d3b69 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 6 Sep 2024 12:47:50 -0300 Subject: [PATCH 245/352] iommu/arm-smmu-v3: Reorganize struct arm_smmu_strtab_cfg The members here are being used for both the linear and the 2 level case, with the meaning of each item slightly different in the two cases. Split it into a clean union where both cases have their own struct with their own logical names and correct types. Adjust all the users to detect linear/2lvl and use the right sub structure and types consistently. Remove STRTAB_STE_DWORDS by changing the last places to use sizeof(struct arm_smmu_ste). Tested-by: Nicolin Chen Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/3-v4-6416877274e1+1af-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 85196f54743d97b0678e7889df72fdcc58ab2b02) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 78 ++++++++++----------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 22 +++--- 2 files changed, 50 insertions(+), 50 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8e8f1d25c6b61..069a236db3a49 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1710,25 +1710,24 @@ static int arm_smmu_init_l2_strtab(struct arm_smmu_device *smmu, u32 sid) { dma_addr_t l2ptr_dma; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; - struct arm_smmu_strtab_l1_desc *desc; - __le64 *dst; + struct arm_smmu_strtab_l2 **l2table; - desc = &cfg->l1_desc[arm_smmu_strtab_l1_idx(sid)]; - if (desc->l2ptr) + l2table = &cfg->l2.l2ptrs[arm_smmu_strtab_l1_idx(sid)]; + if (*l2table) return 0; - desc->l2ptr = dmam_alloc_coherent(smmu->dev, sizeof(*desc->l2ptr), - &l2ptr_dma, GFP_KERNEL); - if (!desc->l2ptr) { + *l2table = dmam_alloc_coherent(smmu->dev, sizeof(**l2table), + &l2ptr_dma, GFP_KERNEL); + if (!*l2table) { dev_err(smmu->dev, "failed to allocate l2 stream table for SID %u\n", sid); return -ENOMEM; } - arm_smmu_init_initial_stes(desc->l2ptr->stes, STRTAB_NUM_L2_STES); - dst = &cfg->strtab[arm_smmu_strtab_l1_idx(sid)]; - arm_smmu_write_strtab_l1_desc((struct arm_smmu_strtab_l1 *)dst, + arm_smmu_init_initial_stes((*l2table)->stes, + ARRAY_SIZE((*l2table)->stes)); + arm_smmu_write_strtab_l1_desc(&cfg->l2.l1tab[arm_smmu_strtab_l1_idx(sid)], l2ptr_dma); return 0; } @@ -2489,12 +2488,11 @@ arm_smmu_get_step_for_sid(struct arm_smmu_device *smmu, u32 sid) if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { /* Two-level walk */ - return &cfg->l1_desc[arm_smmu_strtab_l1_idx(sid)] - .l2ptr->stes[arm_smmu_strtab_l2_idx(sid)]; + return &cfg->l2.l2ptrs[arm_smmu_strtab_l1_idx(sid)] + ->stes[arm_smmu_strtab_l2_idx(sid)]; } else { /* Simple linear lookup */ - return (struct arm_smmu_ste *)&cfg - ->strtab[sid * STRTAB_STE_DWORDS]; + return &cfg->linear.table[sid]; } } @@ -3195,8 +3193,8 @@ struct arm_smmu_device 
*arm_smmu_get_by_fwnode(struct fwnode_handle *fwnode) static bool arm_smmu_sid_in_range(struct arm_smmu_device *smmu, u32 sid) { if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) - return arm_smmu_strtab_l1_idx(sid) < smmu->strtab_cfg.num_l1_ents; - return sid < smmu->strtab_cfg.num_l1_ents; + return arm_smmu_strtab_l1_idx(sid) < smmu->strtab_cfg.l2.num_l1_ents; + return sid < smmu->strtab_cfg.linear.num_ents; } static int arm_smmu_init_sid_strtab(struct arm_smmu_device *smmu, u32 sid) @@ -3631,7 +3629,6 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu) static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) { - void *strtab; u64 reg; u32 l1size; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; @@ -3639,34 +3636,33 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) arm_smmu_strtab_l1_idx((1 << smmu->sid_bits) - 1); /* Calculate the L1 size, capped to the SIDSIZE. */ - cfg->num_l1_ents = min(last_sid_idx + 1, STRTAB_MAX_L1_ENTRIES); - if (cfg->num_l1_ents <= last_sid_idx) + cfg->l2.num_l1_ents = min(last_sid_idx + 1, STRTAB_MAX_L1_ENTRIES); + if (cfg->l2.num_l1_ents <= last_sid_idx) dev_warn(smmu->dev, "2-level strtab only covers %u/%u bits of SID\n", - ilog2(cfg->num_l1_ents * STRTAB_NUM_L2_STES), + ilog2(cfg->l2.num_l1_ents * STRTAB_NUM_L2_STES), smmu->sid_bits); - l1size = cfg->num_l1_ents * sizeof(struct arm_smmu_strtab_l1); - strtab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->strtab_dma, - GFP_KERNEL); - if (!strtab) { + l1size = cfg->l2.num_l1_ents * sizeof(struct arm_smmu_strtab_l1); + cfg->l2.l1tab = dmam_alloc_coherent(smmu->dev, l1size, &cfg->l2.l1_dma, + GFP_KERNEL); + if (!cfg->l2.l1tab) { dev_err(smmu->dev, "failed to allocate l1 stream table (%u bytes)\n", l1size); return -ENOMEM; } - cfg->strtab = strtab; /* Configure strtab_base_cfg for 2 levels */ reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_2LVL); reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, - ilog2(cfg->num_l1_ents) + STRTAB_SPLIT); + ilog2(cfg->l2.num_l1_ents) + STRTAB_SPLIT); reg |= FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT); cfg->strtab_base_cfg = reg; - cfg->l1_desc = devm_kcalloc(smmu->dev, cfg->num_l1_ents, - sizeof(*cfg->l1_desc), GFP_KERNEL); - if (!cfg->l1_desc) + cfg->l2.l2ptrs = devm_kcalloc(smmu->dev, cfg->l2.num_l1_ents, + sizeof(*cfg->l2.l2ptrs), GFP_KERNEL); + if (!cfg->l2.l2ptrs) return -ENOMEM; return 0; @@ -3674,29 +3670,28 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) { - void *strtab; u64 reg; u32 size; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; - size = (1 << smmu->sid_bits) * (STRTAB_STE_DWORDS << 3); - strtab = dmam_alloc_coherent(smmu->dev, size, &cfg->strtab_dma, - GFP_KERNEL); - if (!strtab) { + size = (1 << smmu->sid_bits) * sizeof(struct arm_smmu_ste); + cfg->linear.table = dmam_alloc_coherent(smmu->dev, size, + &cfg->linear.ste_dma, + GFP_KERNEL); + if (!cfg->linear.table) { dev_err(smmu->dev, "failed to allocate linear stream table (%u bytes)\n", size); return -ENOMEM; } - cfg->strtab = strtab; - cfg->num_l1_ents = 1 << smmu->sid_bits; + cfg->linear.num_ents = 1 << smmu->sid_bits; /* Configure strtab_base_cfg for a linear table covering all SIDs */ reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_LINEAR); reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); cfg->strtab_base_cfg = reg; - arm_smmu_init_initial_stes(strtab, cfg->num_l1_ents); + arm_smmu_init_initial_stes(cfg->linear.table, 
cfg->linear.num_ents); return 0; } @@ -3705,16 +3700,17 @@ static int arm_smmu_init_strtab(struct arm_smmu_device *smmu) u64 reg; int ret; - if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) + if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { ret = arm_smmu_init_strtab_2lvl(smmu); - else + reg = smmu->strtab_cfg.l2.l1_dma & STRTAB_BASE_ADDR_MASK; + } else { ret = arm_smmu_init_strtab_linear(smmu); - + reg = smmu->strtab_cfg.linear.ste_dma & STRTAB_BASE_ADDR_MASK; + } if (ret) return ret; /* Set the strtab base address */ - reg = smmu->strtab_cfg.strtab_dma & STRTAB_BASE_ADDR_MASK; reg |= STRTAB_BASE_RA; smmu->strtab_cfg.strtab_base = reg; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 664145fbf92ed..acab22d44cf38 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -612,10 +612,6 @@ struct arm_smmu_priq { }; /* High-level stream table and context descriptor structures */ -struct arm_smmu_strtab_l1_desc { - struct arm_smmu_strtab_l2 *l2ptr; -}; - struct arm_smmu_ctx_desc { u16 asid; }; @@ -648,11 +644,19 @@ struct arm_smmu_s2_cfg { }; struct arm_smmu_strtab_cfg { - __le64 *strtab; - dma_addr_t strtab_dma; - struct arm_smmu_strtab_l1_desc *l1_desc; - unsigned int num_l1_ents; - + union { + struct { + struct arm_smmu_ste *table; + dma_addr_t ste_dma; + unsigned int num_ents; + } linear; + struct { + struct arm_smmu_strtab_l1 *l1tab; + struct arm_smmu_strtab_l2 **l2ptrs; + dma_addr_t l1_dma; + unsigned int num_l1_ents; + } l2; + }; u64 strtab_base; u32 strtab_base_cfg; }; From abad5e8f0d24bd80ae701aa6d898eb8d48d0c885 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 6 Sep 2024 12:47:51 -0300 Subject: [PATCH 246/352] iommu/arm-smmu-v3: Remove strtab_base/cfg These values can be computed from the other values already stored in the config. Move the calculation to arm_smmu_write_strtab() and do it directly before writing the registers. This moves all the logic to calculate the two registers into one function from three and saves an unimportant 16 bytes from the arm_smmu_device. Suggested-by: Nicolin Chen Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/4-v4-6416877274e1+1af-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 8c153ef95697242b72646d2c4cf6c4b23ccf35a3) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 55 ++++++++++----------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 - 2 files changed, 27 insertions(+), 30 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 069a236db3a49..28eeecbea1706 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3629,7 +3629,6 @@ static int arm_smmu_init_queues(struct arm_smmu_device *smmu) static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) { - u64 reg; u32 l1size; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; unsigned int last_sid_idx = @@ -3653,13 +3652,6 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) return -ENOMEM; } - /* Configure strtab_base_cfg for 2 levels */ - reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_2LVL); - reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, - ilog2(cfg->l2.num_l1_ents) + STRTAB_SPLIT); - reg |= FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT); - cfg->strtab_base_cfg = reg; - cfg->l2.l2ptrs = devm_kcalloc(smmu->dev, cfg->l2.num_l1_ents, sizeof(*cfg->l2.l2ptrs), GFP_KERNEL); if (!cfg->l2.l2ptrs) @@ -3670,7 +3662,6 @@ static int arm_smmu_init_strtab_2lvl(struct arm_smmu_device *smmu) static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) { - u64 reg; u32 size; struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; @@ -3686,34 +3677,21 @@ static int arm_smmu_init_strtab_linear(struct arm_smmu_device *smmu) } cfg->linear.num_ents = 1 << smmu->sid_bits; - /* Configure strtab_base_cfg for a linear table covering all SIDs */ - reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, STRTAB_BASE_CFG_FMT_LINEAR); - reg |= FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); - cfg->strtab_base_cfg = reg; - arm_smmu_init_initial_stes(cfg->linear.table, cfg->linear.num_ents); return 0; } static int arm_smmu_init_strtab(struct arm_smmu_device *smmu) { - u64 reg; int ret; - if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { + if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) ret = arm_smmu_init_strtab_2lvl(smmu); - reg = smmu->strtab_cfg.l2.l1_dma & STRTAB_BASE_ADDR_MASK; - } else { + else ret = arm_smmu_init_strtab_linear(smmu); - reg = smmu->strtab_cfg.linear.ste_dma & STRTAB_BASE_ADDR_MASK; - } if (ret) return ret; - /* Set the strtab base address */ - reg |= STRTAB_BASE_RA; - smmu->strtab_cfg.strtab_base = reg; - ida_init(&smmu->vmid_map); return 0; @@ -3928,6 +3906,30 @@ static int arm_smmu_device_disable(struct arm_smmu_device *smmu) return ret; } +static void arm_smmu_write_strtab(struct arm_smmu_device *smmu) +{ + struct arm_smmu_strtab_cfg *cfg = &smmu->strtab_cfg; + dma_addr_t dma; + u32 reg; + + if (smmu->features & ARM_SMMU_FEAT_2_LVL_STRTAB) { + reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, + STRTAB_BASE_CFG_FMT_2LVL) | + FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, + ilog2(cfg->l2.num_l1_ents) + STRTAB_SPLIT) | + FIELD_PREP(STRTAB_BASE_CFG_SPLIT, STRTAB_SPLIT); + dma = cfg->l2.l1_dma; + } else { + reg = FIELD_PREP(STRTAB_BASE_CFG_FMT, + STRTAB_BASE_CFG_FMT_LINEAR) | + FIELD_PREP(STRTAB_BASE_CFG_LOG2SIZE, smmu->sid_bits); + dma = cfg->linear.ste_dma; + } + writeq_relaxed((dma & STRTAB_BASE_ADDR_MASK) | STRTAB_BASE_RA, + smmu->base + ARM_SMMU_STRTAB_BASE); + writel_relaxed(reg, smmu->base + ARM_SMMU_STRTAB_BASE_CFG); +} + static int arm_smmu_device_reset(struct arm_smmu_device *smmu) { int ret; @@ -3963,10 +3965,7 @@ static int arm_smmu_device_reset(struct arm_smmu_device *smmu) writel_relaxed(reg, smmu->base + 
ARM_SMMU_CR2); /* Stream table */ - writeq_relaxed(smmu->strtab_cfg.strtab_base, - smmu->base + ARM_SMMU_STRTAB_BASE); - writel_relaxed(smmu->strtab_cfg.strtab_base_cfg, - smmu->base + ARM_SMMU_STRTAB_BASE_CFG); + arm_smmu_write_strtab(smmu); /* Command queue */ writeq_relaxed(smmu->cmdq.q.q_base, smmu->base + ARM_SMMU_CMDQ_BASE); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index acab22d44cf38..db5641de0de38 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -657,8 +657,6 @@ struct arm_smmu_strtab_cfg { unsigned int num_l1_ents; } l2; }; - u64 strtab_base; - u32 strtab_base_cfg; }; struct arm_smmu_impl_ops { From 5872222ebea7f00e9630b2a3324fc0574bcda213 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 6 Sep 2024 12:47:52 -0300 Subject: [PATCH 247/352] iommu/arm-smmu-v3: Do not use devm for the cd table allocations The master->cd_table is entirely contained within the struct arm_smmu_master which is guaranteed to be freed by the core code under arm_smmu_release_device(). There is no reason to use devm here, arm_smmu_free_cd_tables() is reliably called to free the CD related memory. Remove it and save some memory. Tested-by: Nicolin Chen Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/5-v4-6416877274e1+1af-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 47b2de35cab2b683f69d03515c2658c2d8515323) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 29 +++++++++------------ 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 28eeecbea1706..4bb4a77478bed 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1221,8 +1221,8 @@ static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu, { size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); - l1_desc->l2ptr = dmam_alloc_coherent(smmu->dev, size, - &l1_desc->l2ptr_dma, GFP_KERNEL); + l1_desc->l2ptr = dma_alloc_coherent(smmu->dev, size, + &l1_desc->l2ptr_dma, GFP_KERNEL); if (!l1_desc->l2ptr) { dev_warn(smmu->dev, "failed to allocate context descriptor table\n"); @@ -1436,17 +1436,17 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->num_l1_ents = DIV_ROUND_UP(max_contexts, CTXDESC_L2_ENTRIES); - cd_table->l1_desc = devm_kcalloc(smmu->dev, cd_table->num_l1_ents, - sizeof(*cd_table->l1_desc), - GFP_KERNEL); + cd_table->l1_desc = kcalloc(cd_table->num_l1_ents, + sizeof(*cd_table->l1_desc), + GFP_KERNEL); if (!cd_table->l1_desc) return -ENOMEM; l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); } - cd_table->cdtab = dmam_alloc_coherent(smmu->dev, l1size, &cd_table->cdtab_dma, - GFP_KERNEL); + cd_table->cdtab = dma_alloc_coherent(smmu->dev, l1size, + &cd_table->cdtab_dma, GFP_KERNEL); if (!cd_table->cdtab) { dev_warn(smmu->dev, "failed to allocate context descriptor\n"); ret = -ENOMEM; @@ -1457,7 +1457,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) err_free_l1: if (cd_table->l1_desc) { - devm_kfree(smmu->dev, cd_table->l1_desc); + kfree(cd_table->l1_desc); cd_table->l1_desc = NULL; } return ret; @@ -1477,21 +1477,18 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) if 
(!cd_table->l1_desc[i].l2ptr) continue; - dmam_free_coherent(smmu->dev, size, - cd_table->l1_desc[i].l2ptr, - cd_table->l1_desc[i].l2ptr_dma); + dma_free_coherent(smmu->dev, size, + cd_table->l1_desc[i].l2ptr, + cd_table->l1_desc[i].l2ptr_dma); } - devm_kfree(smmu->dev, cd_table->l1_desc); - cd_table->l1_desc = NULL; + kfree(cd_table->l1_desc); l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); } else { l1size = cd_table->num_l1_ents * (CTXDESC_CD_DWORDS << 3); } - dmam_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma); - cd_table->cdtab_dma = 0; - cd_table->cdtab = NULL; + dma_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma); } /* Stream table manipulation functions */ From 6d6ad397dbcbe381c39d3d1422c19cfdb965b970 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 6 Sep 2024 12:47:53 -0300 Subject: [PATCH 248/352] iommu/arm-smmu-v3: Shrink the cdtab l1_desc array The top of the 2 level CD table is (at most) 1024 entries big, and two high order allocations are required. One of __le64 which is programmed into the HW (8k) and one of struct arm_smmu_l1_ctx_desc which holds the CPU pointer (16k). There are two copies of the l2ptr_dma, one is stored in the struct arm_smmu_l1_ctx_desc, and another is encoded in the __le64 for the HW to use. Instead of storing two copies just decode the value from the __le64. Tested-by: Nicolin Chen Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/6-v4-6416877274e1+1af-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit c0a25a96dee9c3af01fbcad227871fc0f222900b) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 42 +++++++++------------ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 - 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 4bb4a77478bed..469c68ceb136b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1216,29 +1216,17 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master, arm_smmu_cmdq_batch_submit(smmu, &cmds); } -static int arm_smmu_alloc_cd_leaf_table(struct arm_smmu_device *smmu, - struct arm_smmu_l1_ctx_desc *l1_desc) +static void arm_smmu_write_cd_l1_desc(__le64 *dst, dma_addr_t l2ptr_dma) { - size_t size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); + u64 val = (l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | CTXDESC_L1_DESC_V; - l1_desc->l2ptr = dma_alloc_coherent(smmu->dev, size, - &l1_desc->l2ptr_dma, GFP_KERNEL); - if (!l1_desc->l2ptr) { - dev_warn(smmu->dev, - "failed to allocate context descriptor table\n"); - return -ENOMEM; - } - return 0; + /* The HW has 64 bit atomicity with stores to the L2 CD table */ + WRITE_ONCE(*dst, cpu_to_le64(val)); } -static void arm_smmu_write_cd_l1_desc(__le64 *dst, - struct arm_smmu_l1_ctx_desc *l1_desc) +static dma_addr_t arm_smmu_cd_l1_get_desc(const __le64 *src) { - u64 val = (l1_desc->l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | - CTXDESC_L1_DESC_V; - - /* The HW has 64 bit atomicity with stores to the L2 CD table */ - WRITE_ONCE(*dst, cpu_to_le64(val)); + return le64_to_cpu(*src) & CTXDESC_L1_DESC_L2PTR_MASK; } struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, @@ -1280,13 +1268,18 @@ static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, l1_desc = 
&cd_table->l1_desc[idx]; if (!l1_desc->l2ptr) { - __le64 *l1ptr; - - if (arm_smmu_alloc_cd_leaf_table(smmu, l1_desc)) + dma_addr_t l2ptr_dma; + size_t size; + + size = CTXDESC_L2_ENTRIES * sizeof(struct arm_smmu_cd); + l1_desc->l2ptr = dma_alloc_coherent(smmu->dev, size, + &l2ptr_dma, + GFP_KERNEL); + if (!l1_desc->l2ptr) return NULL; - l1ptr = cd_table->cdtab + idx * CTXDESC_L1_DESC_DWORDS; - arm_smmu_write_cd_l1_desc(l1ptr, l1_desc); + arm_smmu_write_cd_l1_desc(&cd_table->cdtab[idx], + l2ptr_dma); /* An invalid L1CD can be cached */ arm_smmu_sync_cd(master, ssid, false); } @@ -1479,7 +1472,8 @@ static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) dma_free_coherent(smmu->dev, size, cd_table->l1_desc[i].l2ptr, - cd_table->l1_desc[i].l2ptr_dma); + arm_smmu_cd_l1_get_desc( + &cd_table->cdtab[i])); } kfree(cd_table->l1_desc); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index db5641de0de38..0f7cb8085553d 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -618,7 +618,6 @@ struct arm_smmu_ctx_desc { struct arm_smmu_l1_ctx_desc { struct arm_smmu_cd *l2ptr; - dma_addr_t l2ptr_dma; }; struct arm_smmu_ctx_desc_cfg { From a7bf7460374055c2c37c982362bd817d5d07bb6d Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 6 Sep 2024 12:47:54 -0300 Subject: [PATCH 249/352] iommu/arm-smmu-v3: Add types for each level of the CD table As well as indexing helpers arm_smmu_cdtab_l1/2_idx(). Remove CTXDESC_L1_DESC_DWORDS and CTXDESC_CD_DWORDS replacing them all with type specific calculations. Tested-by: Nicolin Chen Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/7-v4-6416877274e1+1af-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit 7c567eb1e1d2a835140091ff8d4b73ac5454ba7b) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 45 +++++++++++---------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 23 +++++++++-- 2 files changed, 44 insertions(+), 24 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 469c68ceb136b..25ea498863b8c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1216,17 +1216,18 @@ static void arm_smmu_sync_cd(struct arm_smmu_master *master, arm_smmu_cmdq_batch_submit(smmu, &cmds); } -static void arm_smmu_write_cd_l1_desc(__le64 *dst, dma_addr_t l2ptr_dma) +static void arm_smmu_write_cd_l1_desc(struct arm_smmu_cdtab_l1 *dst, + dma_addr_t l2ptr_dma) { u64 val = (l2ptr_dma & CTXDESC_L1_DESC_L2PTR_MASK) | CTXDESC_L1_DESC_V; /* The HW has 64 bit atomicity with stores to the L2 CD table */ - WRITE_ONCE(*dst, cpu_to_le64(val)); + WRITE_ONCE(dst->l2ptr, cpu_to_le64(val)); } -static dma_addr_t arm_smmu_cd_l1_get_desc(const __le64 *src) +static dma_addr_t arm_smmu_cd_l1_get_desc(const struct arm_smmu_cdtab_l1 *src) { - return le64_to_cpu(*src) & CTXDESC_L1_DESC_L2PTR_MASK; + return le64_to_cpu(src->l2ptr) & CTXDESC_L1_DESC_L2PTR_MASK; } struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, @@ -1239,13 +1240,12 @@ struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, return NULL; if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_LINEAR) - return (struct arm_smmu_cd *)(cd_table->cdtab + - ssid * CTXDESC_CD_DWORDS); + return &((struct arm_smmu_cd *)cd_table->cdtab)[ssid]; - l1_desc = &cd_table->l1_desc[ssid / CTXDESC_L2_ENTRIES]; + l1_desc = &cd_table->l1_desc[arm_smmu_cdtab_l1_idx(ssid)]; if (!l1_desc->l2ptr) return NULL; - return &l1_desc->l2ptr[ssid % CTXDESC_L2_ENTRIES]; + return &l1_desc->l2ptr->cds[arm_smmu_cdtab_l2_idx(ssid)]; } static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, @@ -1263,11 +1263,12 @@ static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, } if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_64K_L2) { - unsigned int idx = ssid / CTXDESC_L2_ENTRIES; + unsigned int idx = arm_smmu_cdtab_l1_idx(ssid); struct arm_smmu_l1_ctx_desc *l1_desc; l1_desc = &cd_table->l1_desc[idx]; if (!l1_desc->l2ptr) { + struct arm_smmu_cdtab_l1 *dst; dma_addr_t l2ptr_dma; size_t size; @@ -1278,8 +1279,8 @@ static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, if (!l1_desc->l2ptr) return NULL; - arm_smmu_write_cd_l1_desc(&cd_table->cdtab[idx], - l2ptr_dma); + dst = &((struct arm_smmu_cdtab_l1 *)cd_table->cdtab)[idx]; + arm_smmu_write_cd_l1_desc(dst, l2ptr_dma); /* An invalid L1CD can be cached */ arm_smmu_sync_cd(master, ssid, false); } @@ -1423,7 +1424,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) cd_table->s1fmt = STRTAB_STE_0_S1FMT_LINEAR; cd_table->num_l1_ents = max_contexts; - l1size = max_contexts * (CTXDESC_CD_DWORDS << 3); + l1size = max_contexts * sizeof(struct arm_smmu_cd); } else { cd_table->s1fmt = STRTAB_STE_0_S1FMT_64K_L2; cd_table->num_l1_ents = DIV_ROUND_UP(max_contexts, @@ -1435,7 +1436,7 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) if (!cd_table->l1_desc) return -ENOMEM; - l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); + l1size = cd_table->num_l1_ents * sizeof(struct arm_smmu_cdtab_l1); } cd_table->cdtab = dma_alloc_coherent(smmu->dev, l1size, @@ -1459,27 +1460,29 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) static 
void arm_smmu_free_cd_tables(struct arm_smmu_master *master) { int i; - size_t size, l1size; + size_t l1size; struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; if (cd_table->l1_desc) { - size = CTXDESC_L2_ENTRIES * (CTXDESC_CD_DWORDS << 3); - for (i = 0; i < cd_table->num_l1_ents; i++) { + dma_addr_t dma_handle; + if (!cd_table->l1_desc[i].l2ptr) continue; - dma_free_coherent(smmu->dev, size, + dma_handle = arm_smmu_cd_l1_get_desc(&( + (struct arm_smmu_cdtab_l1 *)cd_table->cdtab)[i]); + dma_free_coherent(smmu->dev, + sizeof(*cd_table->l1_desc[i].l2ptr), cd_table->l1_desc[i].l2ptr, - arm_smmu_cd_l1_get_desc( - &cd_table->cdtab[i])); + dma_handle); } kfree(cd_table->l1_desc); - l1size = cd_table->num_l1_ents * (CTXDESC_L1_DESC_DWORDS << 3); + l1size = cd_table->num_l1_ents * sizeof(struct arm_smmu_cdtab_l1); } else { - l1size = cd_table->num_l1_ents * (CTXDESC_CD_DWORDS << 3); + l1size = cd_table->num_l1_ents * sizeof(struct arm_smmu_cd); } dma_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 0f7cb8085553d..014d35c04c65b 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -300,7 +300,6 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) */ #define CTXDESC_L2_ENTRIES 1024 -#define CTXDESC_L1_DESC_DWORDS 1 #define CTXDESC_L1_DESC_V (1UL << 0) #define CTXDESC_L1_DESC_L2PTR_MASK GENMASK_ULL(51, 12) @@ -310,6 +309,24 @@ struct arm_smmu_cd { __le64 data[CTXDESC_CD_DWORDS]; }; +struct arm_smmu_cdtab_l2 { + struct arm_smmu_cd cds[CTXDESC_L2_ENTRIES]; +}; + +struct arm_smmu_cdtab_l1 { + __le64 l2ptr; +}; + +static inline unsigned int arm_smmu_cdtab_l1_idx(unsigned int ssid) +{ + return ssid / CTXDESC_L2_ENTRIES; +} + +static inline unsigned int arm_smmu_cdtab_l2_idx(unsigned int ssid) +{ + return ssid % CTXDESC_L2_ENTRIES; +} + #define CTXDESC_CD_0_TCR_T0SZ GENMASK_ULL(5, 0) #define CTXDESC_CD_0_TCR_TG0 GENMASK_ULL(7, 6) #define CTXDESC_CD_0_TCR_IRGN0 GENMASK_ULL(9, 8) @@ -340,7 +357,7 @@ struct arm_smmu_cd { * When the SMMU only supports linear context descriptor tables, pick a * reasonable size limit (64kB). */ -#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / (CTXDESC_CD_DWORDS << 3)) +#define CTXDESC_LINEAR_CDMAX ilog2(SZ_64K / sizeof(struct arm_smmu_cd)) /* Command queue */ #define CMDQ_ENT_SZ_SHIFT 4 @@ -617,7 +634,7 @@ struct arm_smmu_ctx_desc { }; struct arm_smmu_l1_ctx_desc { - struct arm_smmu_cd *l2ptr; + struct arm_smmu_cdtab_l2 *l2ptr; }; struct arm_smmu_ctx_desc_cfg { From dc0cdfa86799aa6ec7645dab5ecd72f7e3eb2780 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 6 Sep 2024 12:47:55 -0300 Subject: [PATCH 250/352] iommu/arm-smmu-v3: Reorganize struct arm_smmu_ctx_desc_cfg The members here are being used for both the linear and the 2 level case, with the meaning of each item slightly different in the two cases. Split it into a clean union where both cases have their own struct with their own logical names and correct types. Adjust all the users to detect linear/2lvl and use the right sub structure and types consistently. Remove CTXDESC_CD_DWORDS by changing the last places to use sizeof(struct arm_smmu_cd). 
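To make the end state concrete, here is a minimal, hypothetical user-space sketch of the union-of-layouts pattern this patch adopts — plain C stand-ins for the kernel types, not the kernel code itself (the 1024-entry leaf mirrors CTXDESC_L2_ENTRIES, and get_cd() mirrors only the shape of arm_smmu_get_cd_ptr()):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define L2_ENTRIES 1024 /* mirrors CTXDESC_L2_ENTRIES */

    struct cd { uint64_t data[8]; };     /* stand-in for struct arm_smmu_cd */
    struct cdtab_l2 { struct cd cds[L2_ENTRIES]; };
    struct cdtab_l1 { uint64_t l2ptr; }; /* the one HW descriptor word */

    struct ctx_desc_cfg {
        int fmt_linear; /* stand-in for the s1fmt == LINEAR test */
        union {         /* one allocation, two typed views */
            struct { struct cd *table; unsigned int num_ents; } linear;
            struct {
                struct cdtab_l1 *l1tab;
                struct cdtab_l2 **l2ptrs;
                unsigned int num_l1_ents;
            } l2;
        };
    };

    /* Same shape as arm_smmu_get_cd_ptr(): the format picks the union view. */
    static struct cd *get_cd(struct ctx_desc_cfg *cfg, unsigned int ssid)
    {
        if (cfg->fmt_linear)
            return &cfg->linear.table[ssid];
        struct cdtab_l2 *l2 = cfg->l2.l2ptrs[ssid / L2_ENTRIES];
        return l2 ? &l2->cds[ssid % L2_ENTRIES] : NULL;
    }

    int main(void)
    {
        struct ctx_desc_cfg cfg = { .fmt_linear = 1 };

        cfg.linear.num_ents = 64;
        cfg.linear.table = calloc(cfg.linear.num_ents, sizeof(struct cd));
        if (!cfg.linear.table)
            return 1;
        printf("cd for ssid 3 at %p\n", (void *)get_cd(&cfg, 3));
        free(cfg.linear.table);
        return 0;
    }

Because each member is now reachable only through its own sub-struct, using a linear field on a 2-level table (or vice versa) reads wrongly at the call site instead of silently aliasing the same storage.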
Tested-by: Nicolin Chen Reviewed-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Link: https://lore.kernel.org/r/8-v4-6416877274e1+1af-smmuv3_tidy_jgg@nvidia.com Signed-off-by: Will Deacon (cherry picked from commit e3b1be2e73dbe599f8b8886e120d206aa87e90f9) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 115 ++++++++++---------- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 24 ++-- 2 files changed, 72 insertions(+), 67 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 25ea498863b8c..5c999d998f7fc 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1233,19 +1233,19 @@ static dma_addr_t arm_smmu_cd_l1_get_desc(const struct arm_smmu_cdtab_l1 *src) struct arm_smmu_cd *arm_smmu_get_cd_ptr(struct arm_smmu_master *master, u32 ssid) { - struct arm_smmu_l1_ctx_desc *l1_desc; + struct arm_smmu_cdtab_l2 *l2; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - if (!cd_table->cdtab) + if (!arm_smmu_cdtab_allocated(cd_table)) return NULL; if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_LINEAR) - return &((struct arm_smmu_cd *)cd_table->cdtab)[ssid]; + return &cd_table->linear.table[ssid]; - l1_desc = &cd_table->l1_desc[arm_smmu_cdtab_l1_idx(ssid)]; - if (!l1_desc->l2ptr) + l2 = cd_table->l2.l2ptrs[arm_smmu_cdtab_l1_idx(ssid)]; + if (!l2) return NULL; - return &l1_desc->l2ptr->cds[arm_smmu_cdtab_l2_idx(ssid)]; + return &l2->cds[arm_smmu_cdtab_l2_idx(ssid)]; } static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, @@ -1257,30 +1257,25 @@ static struct arm_smmu_cd *arm_smmu_alloc_cd_ptr(struct arm_smmu_master *master, might_sleep(); iommu_group_mutex_assert(master->dev); - if (!cd_table->cdtab) { + if (!arm_smmu_cdtab_allocated(cd_table)) { if (arm_smmu_alloc_cd_tables(master)) return NULL; } if (cd_table->s1fmt == STRTAB_STE_0_S1FMT_64K_L2) { unsigned int idx = arm_smmu_cdtab_l1_idx(ssid); - struct arm_smmu_l1_ctx_desc *l1_desc; + struct arm_smmu_cdtab_l2 **l2ptr = &cd_table->l2.l2ptrs[idx]; - l1_desc = &cd_table->l1_desc[idx]; - if (!l1_desc->l2ptr) { - struct arm_smmu_cdtab_l1 *dst; + if (!*l2ptr) { dma_addr_t l2ptr_dma; - size_t size; - size = CTXDESC_L2_ENTRIES * sizeof(struct arm_smmu_cd); - l1_desc->l2ptr = dma_alloc_coherent(smmu->dev, size, - &l2ptr_dma, - GFP_KERNEL); - if (!l1_desc->l2ptr) + *l2ptr = dma_alloc_coherent(smmu->dev, sizeof(**l2ptr), + &l2ptr_dma, GFP_KERNEL); + if (!*l2ptr) return NULL; - dst = &((struct arm_smmu_cdtab_l1 *)cd_table->cdtab)[idx]; - arm_smmu_write_cd_l1_desc(dst, l2ptr_dma); + arm_smmu_write_cd_l1_desc(&cd_table->l2.l1tab[idx], + l2ptr_dma); /* An invalid L1CD can be cached */ arm_smmu_sync_cd(master, ssid, false); } @@ -1400,7 +1395,7 @@ void arm_smmu_clear_cd(struct arm_smmu_master *master, ioasid_t ssid) struct arm_smmu_cd target = {}; struct arm_smmu_cd *cdptr; - if (!master->cd_table.cdtab) + if (!arm_smmu_cdtab_allocated(&master->cd_table)) return; cdptr = arm_smmu_get_cd_ptr(master, ssid); if (WARN_ON(!cdptr)) @@ -1422,70 +1417,70 @@ static int arm_smmu_alloc_cd_tables(struct arm_smmu_master *master) if (!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB) || max_contexts <= CTXDESC_L2_ENTRIES) { cd_table->s1fmt = STRTAB_STE_0_S1FMT_LINEAR; - cd_table->num_l1_ents = max_contexts; + cd_table->linear.num_ents = max_contexts; - l1size = max_contexts * sizeof(struct arm_smmu_cd); + l1size = max_contexts * 
sizeof(struct arm_smmu_cd); + cd_table->linear.table = dma_alloc_coherent(smmu->dev, l1size, + &cd_table->cdtab_dma, + GFP_KERNEL); + if (!cd_table->linear.table) + return -ENOMEM; } else { cd_table->s1fmt = STRTAB_STE_0_S1FMT_64K_L2; - cd_table->num_l1_ents = DIV_ROUND_UP(max_contexts, - CTXDESC_L2_ENTRIES); + cd_table->l2.num_l1_ents = + DIV_ROUND_UP(max_contexts, CTXDESC_L2_ENTRIES); - cd_table->l1_desc = kcalloc(cd_table->num_l1_ents, - sizeof(*cd_table->l1_desc), - GFP_KERNEL); - if (!cd_table->l1_desc) + cd_table->l2.l2ptrs = kcalloc(cd_table->l2.num_l1_ents, + sizeof(*cd_table->l2.l2ptrs), + GFP_KERNEL); + if (!cd_table->l2.l2ptrs) return -ENOMEM; - l1size = cd_table->num_l1_ents * sizeof(struct arm_smmu_cdtab_l1); - } - - cd_table->cdtab = dma_alloc_coherent(smmu->dev, l1size, - &cd_table->cdtab_dma, GFP_KERNEL); - if (!cd_table->cdtab) { - dev_warn(smmu->dev, "failed to allocate context descriptor\n"); - ret = -ENOMEM; - goto err_free_l1; + l1size = cd_table->l2.num_l1_ents * sizeof(struct arm_smmu_cdtab_l1); + cd_table->l2.l1tab = dma_alloc_coherent(smmu->dev, l1size, + &cd_table->cdtab_dma, + GFP_KERNEL); + if (!cd_table->l2.l1tab) { + ret = -ENOMEM; + goto err_free_l2ptrs; + } } - return 0; -err_free_l1: - if (cd_table->l1_desc) { - kfree(cd_table->l1_desc); - cd_table->l1_desc = NULL; - } + return 0; +err_free_l2ptrs: + kfree(cd_table->l2.l2ptrs); + cd_table->l2.l2ptrs = NULL; return ret; } static void arm_smmu_free_cd_tables(struct arm_smmu_master *master) { int i; - size_t l1size; struct arm_smmu_device *smmu = master->smmu; struct arm_smmu_ctx_desc_cfg *cd_table = &master->cd_table; - if (cd_table->l1_desc) { - for (i = 0; i < cd_table->num_l1_ents; i++) { - dma_addr_t dma_handle; - - if (!cd_table->l1_desc[i].l2ptr) + if (cd_table->s1fmt != STRTAB_STE_0_S1FMT_LINEAR) { + for (i = 0; i < cd_table->l2.num_l1_ents; i++) { + if (!cd_table->l2.l2ptrs[i]) continue; - dma_handle = arm_smmu_cd_l1_get_desc(&( - (struct arm_smmu_cdtab_l1 *)cd_table->cdtab)[i]); dma_free_coherent(smmu->dev, - sizeof(*cd_table->l1_desc[i].l2ptr), - cd_table->l1_desc[i].l2ptr, - dma_handle); + sizeof(*cd_table->l2.l2ptrs[i]), + cd_table->l2.l2ptrs[i], + arm_smmu_cd_l1_get_desc(&cd_table->l2.l1tab[i])); } - kfree(cd_table->l1_desc); + kfree(cd_table->l2.l2ptrs); - l1size = cd_table->num_l1_ents * sizeof(struct arm_smmu_cdtab_l1); + dma_free_coherent(smmu->dev, + cd_table->l2.num_l1_ents * + sizeof(struct arm_smmu_cdtab_l1), + cd_table->l2.l1tab, cd_table->cdtab_dma); } else { - l1size = cd_table->num_l1_ents * sizeof(struct arm_smmu_cd); + dma_free_coherent(smmu->dev, + cd_table->linear.num_ents * + sizeof(struct arm_smmu_cd), + cd_table->linear.table, cd_table->cdtab_dma); } - - dma_free_coherent(smmu->dev, l1size, cd_table->cdtab, cd_table->cdtab_dma); } /* Stream table manipulation functions */ @@ -3336,7 +3331,7 @@ static void arm_smmu_release_device(struct device *dev) arm_smmu_disable_pasid(master); arm_smmu_remove_master(master); - if (master->cd_table.cdtab) + if (arm_smmu_cdtab_allocated(&master->cd_table)) arm_smmu_free_cd_tables(master); kfree(master); } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 014d35c04c65b..c07e8d6d11f45 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -633,15 +633,19 @@ struct arm_smmu_ctx_desc { u16 asid; }; -struct arm_smmu_l1_ctx_desc { - struct arm_smmu_cdtab_l2 *l2ptr; -}; - struct arm_smmu_ctx_desc_cfg { - __le64 *cdtab; + union { + struct { + struct 
arm_smmu_cd *table; + unsigned int num_ents; + } linear; + struct { + struct arm_smmu_cdtab_l1 *l1tab; + struct arm_smmu_cdtab_l2 **l2ptrs; + unsigned int num_l1_ents; + } l2; + }; dma_addr_t cdtab_dma; - struct arm_smmu_l1_ctx_desc *l1_desc; - unsigned int num_l1_ents; unsigned int used_ssids; u8 in_ste; u8 s1fmt; @@ -649,6 +653,12 @@ struct arm_smmu_ctx_desc_cfg { u8 s1cdmax; }; +static inline bool +arm_smmu_cdtab_allocated(struct arm_smmu_ctx_desc_cfg *cfg) +{ + return cfg->linear.table || cfg->l2.l1tab; +} + /* True if the cd table has SSIDS > 0 in use. */ static inline bool arm_smmu_ssids_in_use(struct arm_smmu_ctx_desc_cfg *cd_table) { From 33f0cc327c7a068c56a09e3e22b55eba9bd41be0 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Fri, 9 Aug 2024 10:27:14 -0700 Subject: [PATCH 251/352] iommu/arm-smmu: Un-demote unhandled-fault msg Previously this was dev_err_ratelimited() but it got changed to a ratelimited dev_dbg(). Change it back to dev_err(). Fixes: d525b0af0c3b ("iommu/arm-smmu: Pretty-print context fault related regs") Signed-off-by: Rob Clark Reviewed-by: Pranjal Shrivastava Link: https://lore.kernel.org/r/20240809172716.10275-1-robdclark@gmail.com Signed-off-by: Will Deacon (cherry picked from commit 98db56e4900837e4d5d3892b332dca76c8c9f68a) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu/arm-smmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 79ec911ae151f..70ec5b2b61677 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -418,7 +418,7 @@ void arm_smmu_read_context_fault_info(struct arm_smmu_device *smmu, int idx, void arm_smmu_print_context_fault_info(struct arm_smmu_device *smmu, int idx, const struct arm_smmu_context_fault_info *cfi) { - dev_dbg(smmu->dev, + dev_err(smmu->dev, "Unhandled context fault: fsr=0x%x, iova=0x%08lx, fsynr=0x%x, cbfrsynra=0x%x, cb=%d\n", cfi->fsr, cfi->iova, cfi->fsynr, cfi->cbfrsynra, idx); From 16611b9d75ad988c1c276724ee87b4e5fb7350d7 Mon Sep 17 00:00:00 2001 From: Georgi Djakov Date: Wed, 3 Jul 2024 18:07:59 -0700 Subject: [PATCH 252/352] iommu/arm-smmu-qcom: Register the TBU driver in qcom_smmu_impl_init Currently the TBU driver will only probe when CONFIG_ARM_SMMU_QCOM_DEBUG is enabled. The driver not probing would prevent the platform from reaching sync_state, and the system would remain in a sub-optimal power consumption mode while waiting for all consumer drivers to probe. To address this, let's register the TBU driver in qcom_smmu_impl_init(), so that it can probe, but still enable its functionality only when the debug option in Kconfig is enabled. Reported-by: Dmitry Baryshkov Closes: https://lore.kernel.org/r/CAA8EJppcXVu72OSo+OiYEiC1HQjP3qCwKMumOsUhcn6Czj0URg@mail.gmail.com Fixes: 414ecb030870 ("iommu/arm-smmu-qcom-debug: Add support for TBUs") Signed-off-by: Georgi Djakov Link: https://lore.kernel.org/r/20240704010759.507798-1-quic_c_gdjako@quicinc.com Signed-off-by: Will Deacon (cherry picked from commit 0b4eeee2876f2b08442eb32081451bf130e01a4c) Signed-off-by: Matthew R. 
Ochs --- .../iommu/arm/arm-smmu/arm-smmu-qcom-debug.c | 17 +------- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 39 +++++++++++++++++++ drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h | 2 + 3 files changed, 42 insertions(+), 16 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c index ef93f825f11f9..548783f3f8e89 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom-debug.c @@ -464,7 +464,7 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev) return ret; } -static int qcom_tbu_probe(struct platform_device *pdev) +int qcom_tbu_probe(struct platform_device *pdev) { struct of_phandle_args args = { .args_count = 2 }; struct device_node *np = pdev->dev.of_node; @@ -506,18 +506,3 @@ static int qcom_tbu_probe(struct platform_device *pdev) return 0; } - -static const struct of_device_id qcom_tbu_of_match[] = { - { .compatible = "qcom,sc7280-tbu" }, - { .compatible = "qcom,sdm845-tbu" }, - { } -}; - -static struct platform_driver qcom_tbu_driver = { - .driver = { - .name = "qcom_tbu", - .of_match_table = qcom_tbu_of_match, - }, - .probe = qcom_tbu_probe, -}; -builtin_platform_driver(qcom_tbu_driver); diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 971c6a2e592b9..36c6b36ad4ff7 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include "arm-smmu.h" #include "arm-smmu-qcom.h" @@ -562,10 +564,47 @@ static struct acpi_platform_list qcom_acpi_platlist[] = { }; #endif +static int qcom_smmu_tbu_probe(struct platform_device *pdev) +{ + struct device *dev = &pdev->dev; + int ret; + + if (IS_ENABLED(CONFIG_ARM_SMMU_QCOM_DEBUG)) { + ret = qcom_tbu_probe(pdev); + if (ret) + return ret; + } + + if (dev->pm_domain) { + pm_runtime_set_active(dev); + pm_runtime_enable(dev); + } + + return 0; +} + +static const struct of_device_id qcom_smmu_tbu_of_match[] = { + { .compatible = "qcom,sc7280-tbu" }, + { .compatible = "qcom,sdm845-tbu" }, + { } +}; + +static struct platform_driver qcom_smmu_tbu_driver = { + .driver = { + .name = "qcom_tbu", + .of_match_table = qcom_smmu_tbu_of_match, + }, + .probe = qcom_smmu_tbu_probe, +}; + struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu) { const struct device_node *np = smmu->dev->of_node; const struct of_device_id *match; + static u8 tbu_registered; + + if (!tbu_registered++) + platform_driver_register(&qcom_smmu_tbu_driver); #ifdef CONFIG_ACPI if (np == NULL) { diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h index 9bb3ae7d62da6..3c134d1a62773 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.h @@ -34,8 +34,10 @@ irqreturn_t qcom_smmu_context_fault(int irq, void *dev); #ifdef CONFIG_ARM_SMMU_QCOM_DEBUG void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu); +int qcom_tbu_probe(struct platform_device *pdev); #else static inline void qcom_smmu_tlb_sync_debug(struct arm_smmu_device *smmu) { } +static inline int qcom_tbu_probe(struct platform_device *pdev) { return -EINVAL; } #endif #endif /* _ARM_SMMU_QCOM_H */ From 13588f0ff82ba75a68bd7a0ff8cd46075f49b233 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 15 Aug 2024 14:25:00 +0300 Subject: [PATCH 253/352] iommu/arm-smmu-v3: Fix a NULL vs IS_ERR() check The arm_smmu_domain_alloc() function 
returns error pointers on error. It doesn't return NULL. Update the error checking to match. Fixes: 52acd7d8a413 ("iommu/arm-smmu-v3: Add support for domain_alloc_user fn") Signed-off-by: Dan Carpenter Reviewed-by: Shameer Kolothum Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/9208cd0d-8105-40df-93e9-bdcdf0d55eec@stanley.mountain Signed-off-by: Will Deacon (cherry picked from commit af048ec9c05178206e845a88bfd3cb2884a43da7) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 5c999d998f7fc..7f8bd85dc78b7 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3096,8 +3096,8 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, return ERR_PTR(-EOPNOTSUPP); smmu_domain = arm_smmu_domain_alloc(); - if (!smmu_domain) - return ERR_PTR(-ENOMEM); + if (IS_ERR(smmu_domain)) + return ERR_CAST(smmu_domain); smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; From 9c4ef5fc3d7280b7adc8dc033801bad11e4e171b Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Thu, 22 Aug 2024 11:47:09 -0300 Subject: [PATCH 254/352] iommufd/selftest: Fix buffer read overrun in the dirty test test_bit() is used to read the memory storing the bitmap, however test_bit() always uses an unsigned long 8 byte access. If the bitmap is not an aligned size of 64 bits this will now trigger a KASAN warning reading past the end of the buffer. Properly round the buffer allocation to an unsigned long size. Continue to copy_from_user() using a byte granularity. Fixes: 9560393b830b ("iommufd/selftest: Fix iommufd_test_dirty() to handle Reviewed-by: Kevin Tian Signed-off-by: Jason Gunthorpe (cherry picked from commit 79ea4a496ab5c970a3a793d863ed8893b1af107c) Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iommufd/selftest.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 04293b20e20c9..c5c14bbf723cb 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1343,7 +1343,7 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, unsigned long page_size, void __user *uptr, u32 flags) { - unsigned long bitmap_size, i, max; + unsigned long i, max; struct iommu_test_cmd *cmd = ucmd->cmd; struct iommufd_hw_pagetable *hwpt; struct mock_iommu_domain *mock; @@ -1364,15 +1364,14 @@ static int iommufd_test_dirty(struct iommufd_ucmd *ucmd, unsigned int mockpt_id, } max = length / page_size; - bitmap_size = DIV_ROUND_UP(max, BITS_PER_BYTE); - - tmp = kvzalloc(bitmap_size, GFP_KERNEL_ACCOUNT); + tmp = kvzalloc(DIV_ROUND_UP(max, BITS_PER_LONG) * sizeof(unsigned long), + GFP_KERNEL_ACCOUNT); if (!tmp) { rc = -ENOMEM; goto out_put; } - if (copy_from_user(tmp, uptr, bitmap_size)) { + if (copy_from_user(tmp, uptr, DIV_ROUND_UP(max, BITS_PER_BYTE))) { rc = -EFAULT; goto out_free; } From 1c4e3087cc6d97638d45a95877270ad51ef6ea27 Mon Sep 17 00:00:00 2001 From: Pranjal Shrivastava Date: Fri, 16 Aug 2024 10:49:06 +0000 Subject: [PATCH 255/352] iommu: Handle iommu faults for a bad iopf setup The iommu_report_device_fault function was updated to return void while assuming that drivers only need to call iommu_report_device_fault() for reporting an iopf. This implementation causes the following problems: 1. The drivers rely on the core code to call its page_response, however, when a fault is received and no fault capable domain is attached / iopf_param is NULL, the ops->page_response is NOT called, causing the device to stall in case the fault type was PAGE_REQ. 2. The arm_smmu_v3 driver relies on the returned value to log errors; returning void from iommu_report_device_fault causes these events to be missed while logging. Modify the iommu_report_device_fault function to return -EINVAL for cases where no fault capable domain is attached or iopf_param was NULL and calls back to the driver (ops->page_response) in case the fault type was IOMMU_FAULT_PAGE_REQ. The returned value can be used by the drivers to log the fault/event as needed. Reported-by: Kunkun Jiang Closes: https://lore.kernel.org/all/6147caf0-b9a0-30ca-795e-a1aa502a5c51@huawei.com/ Fixes: 3dfa64aecbaf ("iommu: Make iommu_report_device_fault() return void") Signed-off-by: Jason Gunthorpe Signed-off-by: Pranjal Shrivastava Reviewed-by: Jason Gunthorpe Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/20240816104906.1010626-1-praan@google.com Signed-off-by: Joerg Roedel (cherry picked from commit b58b133e680b20d219940e0fdb6f6132c2b60f38) Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 2 +- drivers/iommu/io-pgfault.c | 120 ++++++++++++++------ include/linux/iommu.h | 5 +- 3 files changed, 87 insertions(+), 40 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 7f8bd85dc78b7..fdb378a24dde0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1813,7 +1813,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) goto out_unlock; } - iommu_report_device_fault(master->dev, &fault_evt); + ret = iommu_report_device_fault(master->dev, &fault_evt); out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; diff --git a/drivers/iommu/io-pgfault.c b/drivers/iommu/io-pgfault.c index cd679c13752e0..4674e618797c1 100644 --- a/drivers/iommu/io-pgfault.c +++ b/drivers/iommu/io-pgfault.c @@ -115,6 +115,59 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, return group; } +static struct iommu_attach_handle *find_fault_handler(struct device *dev, + struct iopf_fault *evt) +{ + struct iommu_fault *fault = &evt->fault; + struct iommu_attach_handle *attach_handle; + + if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { + attach_handle = iommu_attach_handle_get(dev->iommu_group, + fault->prm.pasid, 0); + if (IS_ERR(attach_handle)) { + const struct iommu_ops *ops = dev_iommu_ops(dev); + + if (!ops->user_pasid_table) + return NULL; + /* + * The iommu driver for this device supports user- + * managed PASID table. Therefore page faults for + * any PASID should go through the NESTING domain + * attached to the device RID. + */ + attach_handle = iommu_attach_handle_get( + dev->iommu_group, IOMMU_NO_PASID, + IOMMU_DOMAIN_NESTED); + if (IS_ERR(attach_handle)) + return NULL; + } + } else { + attach_handle = iommu_attach_handle_get(dev->iommu_group, + IOMMU_NO_PASID, 0); + + if (IS_ERR(attach_handle)) + return NULL; + } + + if (!attach_handle->domain->iopf_handler) + return NULL; + + return attach_handle; +} + +static void iopf_error_response(struct device *dev, struct iopf_fault *evt) +{ + const struct iommu_ops *ops = dev_iommu_ops(dev); + struct iommu_fault *fault = &evt->fault; + struct iommu_page_response resp = { + .pasid = fault->prm.pasid, + .grpid = fault->prm.grpid, + .code = IOMMU_PAGE_RESP_INVALID + }; + + ops->page_response(dev, evt, &resp); +} + /** * iommu_report_device_fault() - Report fault event to device driver * @dev: the device @@ -153,23 +206,39 @@ static struct iopf_group *iopf_group_alloc(struct iommu_fault_param *iopf_param, * handling framework should guarantee that the iommu domain could only be * freed after the device has stopped generating page faults (or the iommu * hardware has been set to block the page faults) and the pending page faults - * have been flushed. + * have been flushed. In case no page fault handler is attached or no iopf params + * are setup, then the ops->page_response() is called to complete the evt. + * + * Returns 0 on success, or an error in case of a bad/failed iopf setup. 
*/ -void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { + struct iommu_attach_handle *attach_handle; struct iommu_fault *fault = &evt->fault; struct iommu_fault_param *iopf_param; struct iopf_group abort_group = {}; struct iopf_group *group; + attach_handle = find_fault_handler(dev, evt); + if (!attach_handle) + goto err_bad_iopf; + + /* + * Something has gone wrong if a fault capable domain is attached but no + * iopf_param is setup + */ iopf_param = iopf_get_dev_fault_param(dev); if (WARN_ON(!iopf_param)) - return; + goto err_bad_iopf; if (!(fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE)) { - report_partial_fault(iopf_param, fault); + int ret; + + ret = report_partial_fault(iopf_param, fault); iopf_put_dev_fault_param(iopf_param); /* A request that is not the last does not need to be ack'd */ + + return ret; } /* @@ -184,38 +253,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) if (group == &abort_group) goto err_abort; - if (fault->prm.flags & IOMMU_FAULT_PAGE_REQUEST_PASID_VALID) { - group->attach_handle = iommu_attach_handle_get(dev->iommu_group, - fault->prm.pasid, - 0); - if (IS_ERR(group->attach_handle)) { - const struct iommu_ops *ops = dev_iommu_ops(dev); - - if (!ops->user_pasid_table) - goto err_abort; - - /* - * The iommu driver for this device supports user- - * managed PASID table. Therefore page faults for - * any PASID should go through the NESTING domain - * attached to the device RID. - */ - group->attach_handle = - iommu_attach_handle_get(dev->iommu_group, - IOMMU_NO_PASID, - IOMMU_DOMAIN_NESTED); - if (IS_ERR(group->attach_handle)) - goto err_abort; - } - } else { - group->attach_handle = - iommu_attach_handle_get(dev->iommu_group, IOMMU_NO_PASID, 0); - if (IS_ERR(group->attach_handle)) - goto err_abort; - } - - if (!group->attach_handle->domain->iopf_handler) - goto err_abort; + group->attach_handle = attach_handle; /* * On success iopf_handler must call iopf_group_response() and @@ -224,7 +262,7 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) if (group->attach_handle->domain->iopf_handler(group)) goto err_abort; - return; + return 0; err_abort: dev_warn_ratelimited(dev, "iopf with pasid %d aborted\n", @@ -234,6 +272,14 @@ void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) __iopf_free_group(group); else iopf_free_group(group); + + return 0; + +err_bad_iopf: + if (fault->type == IOMMU_FAULT_PAGE_REQ) + iopf_error_response(dev, evt); + + return -EINVAL; } EXPORT_SYMBOL_GPL(iommu_report_device_fault); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index b1c94e691afd2..f981aed48c5a2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -1568,7 +1568,7 @@ struct iopf_queue *iopf_queue_alloc(const char *name); void iopf_queue_free(struct iopf_queue *queue); int iopf_queue_discard_partial(struct iopf_queue *queue); void iopf_free_group(struct iopf_group *group); -void iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); +int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt); void iopf_group_response(struct iopf_group *group, enum iommu_page_response_code status); #else @@ -1606,9 +1606,10 @@ static inline void iopf_free_group(struct iopf_group *group) { } -static inline void +static inline int iommu_report_device_fault(struct device *dev, struct iopf_fault *evt) { + return -ENODEV; } static inline void 
iopf_group_response(struct iopf_group *group, From 4dbb8026697812b73d7ba16de5188d23b182ef8e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 23 Aug 2024 22:49:44 +0000 Subject: [PATCH 256/352] cover-letter: Apply upstream patches for dependencies Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs From 8fc59dfe9443b982702be03ab73e436dacd6cec2 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 10 May 2022 13:55:24 -0300 Subject: [PATCH 257/352] vfio: Remove VFIO_TYPE1_NESTING_IOMMU This control causes the ARM SMMU drivers to choose a stage 2 implementation for the IO pagetable (vs the stage 1 usual default), however this choice has no significant visible impact to the VFIO user. Further, qemu never implemented this and no other userspace user is known. The original description in commit f5c9ecebaf2a ("vfio/iommu_type1: add new VFIO_TYPE1_NESTING_IOMMU IOMMU type") suggested this was to "provide SMMU translation services to the guest operating system" however the rest of the API to set the guest table pointer for the stage 1 and manage invalidation was never completed, or at least never upstreamed, rendering this part useless dead code. Upstream has now settled on iommufd as the uAPI for controlling nested translation. Choosing the stage 2 implementation should be done through the IOMMU_HWPT_ALLOC_NEST_PARENT flag during domain allocation. Remove VFIO_TYPE1_NESTING_IOMMU and everything under it including the enable_nesting iommu_domain_op. Just in case there is some userspace using this, continue to treat requesting it as a NOP, but do not advertise support any more. Acked-by: Alex Williamson Reviewed-by: Mostafa Saleh Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 16 ---------------- drivers/iommu/arm/arm-smmu/arm-smmu.c | 16 ---------------- drivers/iommu/iommu.c | 10 ---------- drivers/iommu/iommufd/vfio_compat.c | 7 +------ drivers/vfio/vfio_iommu_type1.c | 12 +----------- include/linux/iommu.h | 3 --- include/uapi/linux/vfio.h | 2 +- 7 files changed, 3 insertions(+), 63 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index fdb378a24dde0..d9b3ea462935c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3374,21 +3374,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_S2; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_of_xlate(struct device *dev, const struct of_phandle_args *args) { @@ -3510,7 +3495,6 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .free = arm_smmu_domain_free_paging, } }; diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c b/drivers/iommu/arm/arm-smmu/arm-smmu.c index 70ec5b2b61677..16aa2e8d463a4 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c @@ -1559,21 +1559,6 @@ static struct iommu_group *arm_smmu_device_group(struct device *dev) return group; } -static int arm_smmu_enable_nesting(struct iommu_domain *domain) -{ - struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); - int ret = 0; - - mutex_lock(&smmu_domain->init_mutex); - if (smmu_domain->smmu) - ret = -EPERM; - else - smmu_domain->stage = ARM_SMMU_DOMAIN_NESTED; - mutex_unlock(&smmu_domain->init_mutex); - - return ret; -} - static int arm_smmu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks) { @@ -1657,7 +1642,6 @@ static struct iommu_ops arm_smmu_ops = { .flush_iotlb_all = arm_smmu_flush_iotlb_all, .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, - .enable_nesting = arm_smmu_enable_nesting, .set_pgtable_quirks = arm_smmu_set_pgtable_quirks, .free = arm_smmu_domain_free, } diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index 6579bda375652..a1e1c60285071 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2726,16 +2726,6 @@ static int __init iommu_init(void) } core_initcall(iommu_init); -int iommu_enable_nesting(struct iommu_domain *domain) -{ - if (domain->type != IOMMU_DOMAIN_UNMANAGED) - return -EINVAL; - if (!domain->ops->enable_nesting) - return -EINVAL; - return domain->ops->enable_nesting(domain); -} -EXPORT_SYMBOL_GPL(iommu_enable_nesting); - int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirk) { diff --git a/drivers/iommu/iommufd/vfio_compat.c b/drivers/iommu/iommufd/vfio_compat.c index a3ad5f0b6c59d..514aacd640094 100644 --- a/drivers/iommu/iommufd/vfio_compat.c +++ b/drivers/iommu/iommufd/vfio_compat.c @@ -291,12 +291,7 @@ static int iommufd_vfio_check_extension(struct iommufd_ctx *ictx, case VFIO_DMA_CC_IOMMU: return iommufd_vfio_cc_iommu(ictx); - /* - * This is obsolete, and to be removed from VFIO. 
It was an incomplete - * idea that got merged. - * https://lore.kernel.org/kvm/0-v1-0093c9b0e345+19-vfio_no_nesting_jgg@nvidia.com/ - */ - case VFIO_TYPE1_NESTING_IOMMU: + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: return 0; /* diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 49e1c52aa5a82..8198c0a54661d 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -72,7 +72,6 @@ struct vfio_iommu { uint64_t pgsize_bitmap; uint64_t num_non_pinned_groups; bool v2; - bool nesting; bool dirty_page_tracking; struct list_head emulated_iommu_groups; }; @@ -2199,12 +2198,6 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, goto out_free_domain; } - if (iommu->nesting) { - ret = iommu_enable_nesting(domain->domain); - if (ret) - goto out_domain; - } - ret = iommu_attach_group(domain->domain, group->iommu_group); if (ret) goto out_domain; @@ -2545,9 +2538,7 @@ static void *vfio_iommu_type1_open(unsigned long arg) switch (arg) { case VFIO_TYPE1_IOMMU: break; - case VFIO_TYPE1_NESTING_IOMMU: - iommu->nesting = true; - fallthrough; + case __VFIO_RESERVED_TYPE1_NESTING_IOMMU: case VFIO_TYPE1v2_IOMMU: iommu->v2 = true; break; @@ -2642,7 +2633,6 @@ static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu, switch (arg) { case VFIO_TYPE1_IOMMU: case VFIO_TYPE1v2_IOMMU: - case VFIO_TYPE1_NESTING_IOMMU: case VFIO_UNMAP_ALL: return 1; case VFIO_UPDATE_VADDR: diff --git a/include/linux/iommu.h b/include/linux/iommu.h index f981aed48c5a2..0a9d211d3b7fc 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -636,7 +636,6 @@ struct iommu_ops { * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform * specific mechanisms. - * @enable_nesting: Enable nesting * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. */ @@ -664,7 +663,6 @@ struct iommu_domain_ops { dma_addr_t iova); bool (*enforce_cache_coherency)(struct iommu_domain *domain); - int (*enable_nesting)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, unsigned long quirks); @@ -831,7 +829,6 @@ extern void iommu_group_put(struct iommu_group *group); extern int iommu_group_id(struct iommu_group *group); extern struct iommu_domain *iommu_group_default_domain(struct iommu_group *); -int iommu_enable_nesting(struct iommu_domain *domain); int iommu_set_pgtable_quirks(struct iommu_domain *domain, unsigned long quirks); diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 2b68e6cdf1902..c8dbf8219c4fc 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -35,7 +35,7 @@ #define VFIO_EEH 5 /* Two-stage IOMMU */ -#define VFIO_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ +#define __VFIO_RESERVED_TYPE1_NESTING_IOMMU 6 /* Implies v2 */ #define VFIO_SPAPR_TCE_v2_IOMMU 7 From 3dad76efc75f274b151dde9e2a46aa2b97e32ea7 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Fri, 24 May 2024 16:29:21 -0300 Subject: [PATCH 258/352] iommu/arm-smmu-v3: Use S2FWB when available Force Write Back (FWB) changes how the S2 IOPTE's MemAttr field works. When S2FWB is supported and enabled the IOPTE will force cachable access to IOMMU_CACHE memory when nesting with a S1 and deny cachable access otherwise. 
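The encoding side of this can be pictured with a small, self-contained model — hypothetical user-space C, not the kernel code: the MemAttr nibble values are the ones the io-pgtable-arm.c hunk below adds, while the PROT_* flag values are made up for the example:

    #include <stdint.h>
    #include <stdio.h>

    /* MemAttr encodings (bits [5:2] of the S2 PTE), as added below */
    #define MEMATTR_FWB_WB (UINT64_C(0x6) << 2) /* S2FWB: force write-back */
    #define MEMATTR_OIWB   (UINT64_C(0xf) << 2) /* !FWB: WB, S1 may override */
    #define MEMATTR_NC     (UINT64_C(0x5) << 2) /* non-cachable */
    #define MEMATTR_DEV    (UINT64_C(0x1) << 2) /* Device-nGnRE */

    #define PROT_CACHE (1u << 0) /* illustrative values only */
    #define PROT_MMIO  (1u << 1)

    /* Same shape as the S2 branch of arm_lpae_prot_to_pte() after this patch */
    static uint64_t s2_memattr(unsigned int prot, int s2fwb)
    {
        if (prot & PROT_MMIO)
            return MEMATTR_DEV;
        if (prot & PROT_CACHE)
            return s2fwb ? MEMATTR_FWB_WB : MEMATTR_OIWB;
        return MEMATTR_NC;
    }

    int main(void)
    {
        printf("cached, !FWB: %#llx\n",
               (unsigned long long)s2_memattr(PROT_CACHE, 0));
        printf("cached,  FWB: %#llx\n",
               (unsigned long long)s2_memattr(PROT_CACHE, 1));
        return 0;
    }

Running it prints 0x3c for the !FWB case and 0x18 with FWB: 0x6 is not a plain attribute but a directive, pinning the combined S1/S2 result to write-back so a nested S1 MemAttr can no longer make IOMMU_CACHE mappings bypass the cache.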
When using a single stage of translation, a simple S2 domain, it doesn't change anything as it is just a different encoding for the existing mapping of the IOMMU protection flags to cachability attributes. However, when used with a nested S1, FWB has the effect of preventing the guest from choosing a MemAttr in its S1 that would cause ordinary DMA to bypass the cache. Consistent with KVM we wish to deny the guest the ability to become incoherent with cached memory the hypervisor believes is cachable so we don't have to flush it. Turn on S2FWB whenever the SMMU supports it and use it for all S2 mappings. Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 +++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 3 +++ drivers/iommu/io-pgtable-arm.c | 27 +++++++++++++++++---- include/linux/io-pgtable.h | 2 ++ 4 files changed, 38 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index d9b3ea462935c..e7f35d26235cd 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1654,6 +1654,8 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, FIELD_PREP(STRTAB_STE_1_EATS, ats_enabled ? STRTAB_STE_1_EATS_TRANS : 0)); + if (smmu->features & ARM_SMMU_FEAT_S2FWB) + target->data[1] |= cpu_to_le64(STRTAB_STE_1_S2FWB); if (smmu->features & ARM_SMMU_FEAT_ATTR_TYPES_OVR) target->data[1] |= cpu_to_le64(FIELD_PREP(STRTAB_STE_1_SHCFG, STRTAB_STE_1_SHCFG_INCOMING)); @@ -2444,6 +2446,8 @@ static int arm_smmu_domain_finalise(struct arm_smmu_domain *smmu_domain, pgtbl_cfg.oas = smmu->oas; fmt = ARM_64_LPAE_S2; finalise_stage_fn = arm_smmu_domain_finalise_s2; + if (smmu->features & ARM_SMMU_FEAT_S2FWB) + pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_ARM_S2FWB; break; default: return -EINVAL; @@ -4245,6 +4249,13 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu) /* IDR3 */ reg = readl_relaxed(smmu->base + ARM_SMMU_IDR3); + /* + * If for some reason the HW does not support DMA coherency then using + * S2FWB won't work. This will also disable nesting support.
+ */ + if (FIELD_GET(IDR3_FWB, reg) && + (smmu->features & ARM_SMMU_FEAT_COHERENCY)) + smmu->features |= ARM_SMMU_FEAT_S2FWB; if (FIELD_GET(IDR3_RIL, reg)) smmu->features |= ARM_SMMU_FEAT_RANGE_INV; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index c07e8d6d11f45..8cc46ca9abfe0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -57,6 +57,7 @@ struct arm_smmu_device; #define IDR1_SIDSIZE GENMASK(5, 0) #define ARM_SMMU_IDR3 0xc +#define IDR3_FWB (1 << 8) #define IDR3_RIL (1 << 10) #define ARM_SMMU_IDR5 0x14 @@ -262,6 +263,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_1_S1CSH GENMASK_ULL(7, 6) #define STRTAB_STE_1_S1STALLD (1UL << 27) +#define STRTAB_STE_1_S2FWB (1UL << 25) #define STRTAB_STE_1_EATS GENMASK_ULL(29, 28) #define STRTAB_STE_1_EATS_ABT 0UL @@ -725,6 +727,7 @@ struct arm_smmu_device { #define ARM_SMMU_FEAT_ATTR_TYPES_OVR (1 << 20) #define ARM_SMMU_FEAT_HA (1 << 21) #define ARM_SMMU_FEAT_HD (1 << 22) +#define ARM_SMMU_FEAT_S2FWB (1 << 23) u32 features; #define ARM_SMMU_OPT_SKIP_PREFETCH (1 << 0) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index f5d9fd1f45bf4..9b3658aae2100 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -106,6 +106,18 @@ #define ARM_LPAE_PTE_HAP_FAULT (((arm_lpae_iopte)0) << 6) #define ARM_LPAE_PTE_HAP_READ (((arm_lpae_iopte)1) << 6) #define ARM_LPAE_PTE_HAP_WRITE (((arm_lpae_iopte)2) << 6) +/* + * For !FWB these encode to: + * 1111 = Normal outer write back cachable / Inner Write Back Cachable + * Permit S1 to override + * 0101 = Normal Non-cachable / Inner Non-cachable + * 0001 = Device / Device-nGnRE + * For S2FWB these encode: + * 0110 Force Normal Write Back + * 0101 Normal* is forced Normal-NC, Device unchanged + * 0001 Force Device-nGnRE + */ +#define ARM_LPAE_PTE_MEMATTR_FWB_WB (((arm_lpae_iopte)0x6) << 2) #define ARM_LPAE_PTE_MEMATTR_OIWB (((arm_lpae_iopte)0xf) << 2) #define ARM_LPAE_PTE_MEMATTR_NC (((arm_lpae_iopte)0x5) << 2) #define ARM_LPAE_PTE_MEMATTR_DEV (((arm_lpae_iopte)0x1) << 2) @@ -458,12 +470,16 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data, */ if (data->iop.fmt == ARM_64_LPAE_S2 || data->iop.fmt == ARM_32_LPAE_S2) { - if (prot & IOMMU_MMIO) + if (prot & IOMMU_MMIO) { pte |= ARM_LPAE_PTE_MEMATTR_DEV; - else if (prot & IOMMU_CACHE) - pte |= ARM_LPAE_PTE_MEMATTR_OIWB; - else + } else if (prot & IOMMU_CACHE) { + if (data->iop.cfg.quirks & IO_PGTABLE_QUIRK_ARM_S2FWB) + pte |= ARM_LPAE_PTE_MEMATTR_FWB_WB; + else + pte |= ARM_LPAE_PTE_MEMATTR_OIWB; + } else { pte |= ARM_LPAE_PTE_MEMATTR_NC; + } } else { if (prot & IOMMU_MMIO) pte |= (ARM_LPAE_MAIR_ATTR_IDX_DEV @@ -932,7 +948,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, void *cookie) if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_ARM_TTBR1 | IO_PGTABLE_QUIRK_ARM_OUTER_WBWA | - IO_PGTABLE_QUIRK_ARM_HD)) + IO_PGTABLE_QUIRK_ARM_HD | + IO_PGTABLE_QUIRK_ARM_S2FWB)) return NULL; data = arm_lpae_alloc_pgtable(cfg); diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h index f9a81761bfced..aff9b020b6dcc 100644 --- a/include/linux/io-pgtable.h +++ b/include/linux/io-pgtable.h @@ -87,6 +87,7 @@ struct io_pgtable_cfg { * attributes set in the TCR for a non-coherent page-table walker. * * IO_PGTABLE_QUIRK_ARM_HD: Enables dirty tracking in stage 1 pagetable.
+ * IO_PGTABLE_QUIRK_ARM_S2FWB: Use the FWB format for the MemAttrs bits */ #define IO_PGTABLE_QUIRK_ARM_NS BIT(0) #define IO_PGTABLE_QUIRK_NO_PERMS BIT(1) @@ -95,6 +96,7 @@ struct io_pgtable_cfg { #define IO_PGTABLE_QUIRK_ARM_TTBR1 BIT(5) #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6) #define IO_PGTABLE_QUIRK_ARM_HD BIT(7) + #define IO_PGTABLE_QUIRK_ARM_S2FWB BIT(8) unsigned long quirks; unsigned long pgsize_bitmap; unsigned int ias; From 24069e65b7fa0a2a669d02b3298195bced7a44ae Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 9 Aug 2024 10:46:10 -0700 Subject: [PATCH 259/352] ACPICA: IORT: Update for revision E.f ACPICA commit c4f5c083d24df9ddd71d5782c0988408cf0fc1ab The IORT spec, Issue E.f (April 2024), adds a new CANWBS bit to the Memory Access Flag field in the Memory Access Properties table, mainly for a PCI Root Complex. This CANWBS defines the coherency of memory accesses that are not marked IOWB cacheable/shareable. Its value further implies the coherency impact from a pair of mismatched memory attributes (e.g. in a nested translation case): 0x0: Use of mismatched memory attributes for accesses made by this device may lead to a loss of coherency. 0x1: Coherency of accesses made by this device to locations in Conventional memory are ensured as follows, even if the memory attributes for the accesses presented by the device or provided by the SMMU are different from Inner and Outer Write-back cacheable, Shareable. Link: https://github.com/acpica/acpica/commit/c4f5c083 Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- include/acpi/actbl2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index 9775384d61c69..23e1e3e971725 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -376,7 +376,7 @@ struct acpi_table_ccel { * IORT - IO Remapping Table * * Conforms to "IO Remapping Table System Software on ARM Platforms", - * Document number: ARM DEN 0049E.e, Sep 2022 + * Document number: ARM DEN 0049E.f, Apr 2024 * ******************************************************************************/ @@ -447,6 +447,7 @@ struct acpi_iort_memory_access { #define ACPI_IORT_MF_COHERENCY (1) #define ACPI_IORT_MF_ATTRIBUTES (1<<1) +#define ACPI_IORT_MF_CANWBS (1<<2) /* * IORT node specific subtables From 4abdaa00f189b4c7ee6acb3f3624ed8ef7cca876 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 15 Jul 2024 21:32:10 +0000 Subject: [PATCH 260/352] ACPI/IORT: Support CANWBS memory access flag The IORT spec, Issue E.f (April 2024), adds a new CANWBS bit to the Memory Access Flag field in the Memory Access Properties table, mainly for a PCI Root Complex. This CANWBS defines the coherency of memory accesses that are not marked IOWB cacheable/shareable. Its value further implies the coherency impact from a pair of mismatched memory attributes (e.g. in a nested translation case): 0x0: Use of mismatched memory attributes for accesses made by this device may lead to a loss of coherency. 0x1: Coherency of accesses made by this device to locations in Conventional memory are ensured as follows, even if the memory attributes for the accesses presented by the device or provided by the SMMU are different from Inner and Outer Write-back cacheable, Shareable.
Note that on HW without CANWBS the loss of coherency typically occurs with an SMMU that doesn't implement the S2FWB feature, in which case additional cache flush operations would be required to prevent it. Add a new ACPI_IORT_MF_CANWBS flag and set IOMMU_FWSPEC_PCI_RC_CANWBS upon the presence of this new flag. CANWBS and S2FWB are similar features, in that they both guarantee the VM cannot violate coherency, however S2FWB can be bypassed by PCI No Snoop TLPs, while CANWBS cannot. Thus CANWBS meets the requirements to set IOMMU_CAP_ENFORCE_CACHE_COHERENCY. Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/acpi/arm64/iort.c | 13 +++++++++++++ include/linux/iommu.h | 2 ++ 2 files changed, 15 insertions(+) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 6496ff5a6ba20..4700092cf5129 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1218,6 +1218,17 @@ static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node) return pci_rc->ats_attribute & ACPI_IORT_ATS_SUPPORTED; } +static bool iort_pci_rc_supports_canwbs(struct acpi_iort_node *node) +{ + struct acpi_iort_memory_access *memory_access; + struct acpi_iort_root_complex *pci_rc; + + pci_rc = (struct acpi_iort_root_complex *)node->node_data; + memory_access = + (struct acpi_iort_memory_access *)&pci_rc->memory_properties; + return memory_access->memory_flags & ACPI_IORT_MF_CANWBS; +} + static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node, u32 streamid) { @@ -1344,6 +1355,8 @@ int iort_iommu_configure_id(struct device *dev, const u32 *id_in) fwspec = dev_iommu_fwspec_get(dev); if (fwspec && iort_pci_rc_supports_ats(node)) fwspec->flags |= IOMMU_FWSPEC_PCI_RC_ATS; + if (fwspec && iort_pci_rc_supports_canwbs(node)) + fwspec->flags |= IOMMU_FWSPEC_PCI_RC_CANWBS; } else { node = iort_scan_node(ACPI_IORT_NODE_NAMED_COMPONENT, iort_match_node_callback, dev); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0a9d211d3b7fc..16dcd4ec9fcce 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -980,6 +980,8 @@ struct iommu_fwspec { /* ATS is supported */ #define IOMMU_FWSPEC_PCI_RC_ATS (1 << 0) +/* CANWBS is supported */ +#define IOMMU_FWSPEC_PCI_RC_CANWBS (1 << 1) /* * An iommu attach handle represents a relationship between an iommu domain From f7bc6ed003512e7be9f0cbaa20e75292ee35fbf0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 17 Jul 2024 15:58:25 -0300 Subject: [PATCH 261/352] iommu/arm-smmu-v3: Report IOMMU_CAP_ENFORCE_CACHE_COHERENCY for CANWBS HW with CANWBS is always cache coherent and ignores PCI No Snoop requests as well. This meets the requirement for IOMMU_CAP_ENFORCE_CACHE_COHERENCY, so let's return it. Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 35 +++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 + 2 files changed, 36 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index e7f35d26235cd..f96fbcef3bbbc 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2297,6 +2297,9 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) case IOMMU_CAP_CACHE_COHERENCY: /* Assume that a coherent TCU implies coherent TBUs */ return master->smmu->features & ARM_SMMU_FEAT_COHERENCY; + case IOMMU_CAP_ENFORCE_CACHE_COHERENCY: + return dev_iommu_fwspec_get(dev)->flags & + IOMMU_FWSPEC_PCI_RC_CANWBS; case IOMMU_CAP_NOEXEC: case IOMMU_CAP_DEFERRED_FLUSH: return true; @@ -2307,6 +2310,28 @@ static bool arm_smmu_capable(struct device *dev, enum iommu_cap cap) } } +static bool arm_smmu_enforce_cache_coherency(struct iommu_domain *domain) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain); + struct arm_smmu_master_domain *master_domain; + unsigned long flags; + bool ret = false; + + spin_lock_irqsave(&smmu_domain->devices_lock, flags); + list_for_each_entry(master_domain, &smmu_domain->devices, + devices_elm) { + if (!(dev_iommu_fwspec_get(master_domain->master->dev)->flags & + IOMMU_FWSPEC_PCI_RC_CANWBS)) + goto out; + } + + smmu_domain->enforce_cache_coherency = true; + ret = true; +out: + spin_unlock_irqrestore(&smmu_domain->devices_lock, flags); + return ret; +} + struct arm_smmu_domain *arm_smmu_domain_alloc(void) { struct arm_smmu_domain *smmu_domain; @@ -2737,6 +2762,15 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * one of them. */ spin_lock_irqsave(&smmu_domain->devices_lock, flags); + if (smmu_domain->enforce_cache_coherency && + !(dev_iommu_fwspec_get(master->dev)->flags & + IOMMU_FWSPEC_PCI_RC_CANWBS)) { + kfree(master_domain); + spin_unlock_irqrestore(&smmu_domain->devices_lock, + flags); + return -EINVAL; + } + if (state->ats_enabled) atomic_inc(&smmu_domain->nr_ats_masters); list_add(&master_domain->devices_elm, &smmu_domain->devices); @@ -3493,6 +3527,7 @@ static struct iommu_ops arm_smmu_ops = { .owner = THIS_MODULE, .default_domain_ops = &(const struct iommu_domain_ops) { .attach_dev = arm_smmu_attach_dev, + .enforce_cache_coherency = arm_smmu_enforce_cache_coherency, .set_dev_pasid = arm_smmu_s1_set_dev_pasid, .map_pages = arm_smmu_map_pages, .unmap_pages = arm_smmu_unmap_pages, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 8cc46ca9abfe0..ca86aee3d5010 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -813,6 +813,7 @@ struct arm_smmu_domain { /* List of struct arm_smmu_master_domain */ struct list_head devices; spinlock_t devices_lock; + bool enforce_cache_coherency : 1; struct mmu_notifier mmu_notifier; }; From eac9c8e86e90fd94e84ebafdeef3df0a92d32fe9 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 22 Sep 2022 15:46:52 -0700 Subject: [PATCH 262/352] iommu/arm-smmu-v3: Support IOMMU_GET_HW_INFO via struct arm_smmu_hw_info For virtualization cases the IDR/IIDR/AIDR values of the actual SMMU instance need to be available to the VMM so it can construct an appropriate vSMMUv3 that reflects the correct HW capabilities. For userspace page tables these values are required to constrain the valid values within the CD table and the IOPTEs. 
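To make that concrete, VMM-side sanitization of these raw ID words might look like the following hypothetical sketch (VSMMU_IDR0_ALLOWED and the BTM bit position are illustrative assumptions, not part of this patch; struct iommu_hw_info_arm_smmuv3 is the UAPI struct added below):

	#include <linux/iommufd.h>

	#define VSMMU_IDR0_ALLOWED 0x0fffffffU	/* hypothetical VMM allow-list */

	/* Hypothetical VMM-side sanitization of the raw SMMUv3 ID registers */
	static void vsmmu_sanitize_idrs(struct iommu_hw_info_arm_smmuv3 *info)
	{
		/* Hide BTM (SMMU_IDR0 bit 5, assumed) until vBTM is supported */
		info->idr[0] &= ~(1U << 5);
		/* Forward only the bits the emulated vSMMUv3 implements */
		info->idr[0] &= VSMMU_IDR0_ALLOWED;
		/* idr[1..5], iidr and aidr need the same treatment */
	}
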
The kernel does not sanitize these values. If building a VMM then userspace is required to only forward bits into a VM that it knows it can implement. Some bits will also require a VMM to detect if appropriate kernel support is available such as for ATS and BTM. Signed-off-by: Nicolin Chen Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 24 ++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 ++ include/uapi/linux/iommufd.h | 35 +++++++++++++++++++++ 3 files changed, 61 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index f96fbcef3bbbc..02fa2d90e2a25 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2332,6 +2332,29 @@ static bool arm_smmu_enforce_cache_coherency(struct iommu_domain *domain) return ret; } +static void *arm_smmu_hw_info(struct device *dev, u32 *length, u32 *type) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct iommu_hw_info_arm_smmuv3 *info; + u32 __iomem *base_idr; + unsigned int i; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return ERR_PTR(-ENOMEM); + + base_idr = master->smmu->base + ARM_SMMU_IDR0; + for (i = 0; i <= 5; i++) + info->idr[i] = readl_relaxed(base_idr + i); + info->iidr = readl_relaxed(master->smmu->base + ARM_SMMU_IIDR); + info->aidr = readl_relaxed(master->smmu->base + ARM_SMMU_AIDR); + + *length = sizeof(*info); + *type = IOMMU_HW_INFO_TYPE_ARM_SMMUV3; + + return info; +} + struct arm_smmu_domain *arm_smmu_domain_alloc(void) { struct arm_smmu_domain *smmu_domain; @@ -3510,6 +3533,7 @@ static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, .capable = arm_smmu_capable, + .hw_info = arm_smmu_hw_info, .domain_alloc_paging = arm_smmu_domain_alloc_paging, .domain_alloc_sva = arm_smmu_sva_domain_alloc, .domain_alloc_user = arm_smmu_domain_alloc_user, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index ca86aee3d5010..88afff159e9cb 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -82,6 +82,8 @@ struct arm_smmu_device; #define IIDR_REVISION GENMASK(15, 12) #define IIDR_IMPLEMENTER GENMASK(11, 0) +#define ARM_SMMU_AIDR 0x1C + #define ARM_SMMU_CR0 0x20 #define CR0_ATSCHK (1 << 4) #define CR0_CMDQEN (1 << 3) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 72010f71c5e47..b5c94fecb94ca 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -484,15 +484,50 @@ struct iommu_hw_info_vtd { __aligned_u64 ecap_reg; }; +/** + * struct iommu_hw_info_arm_smmuv3 - ARM SMMUv3 hardware information + * (IOMMU_HW_INFO_TYPE_ARM_SMMUV3) + * + * @flags: Must be set to 0 + * @__reserved: Must be 0 + * @idr: Implemented features for ARM SMMU Non-secure programming interface + * @iidr: Information about the implementation and implementer of ARM SMMU, + * and architecture version supported + * @aidr: ARM SMMU architecture version + * + * For the details of @idr, @iidr and @aidr, please refer to the chapters + * from 6.3.1 to 6.3.6 in the SMMUv3 Spec. + * + * User space should read the underlying ARM SMMUv3 hardware information for + * the list of supported features. 
+ * + * Note that these values reflect the raw HW capability, without any insight into + * whether any required kernel driver support is present. Bits may be set indicating the + * HW has functionality that is lacking kernel software support, such as BTM. If + * a VMM is using this information to construct emulated copies of these + * registers it should only forward bits that it knows it can support. + * + * In future, presence of required kernel support will be indicated in flags. + */ +struct iommu_hw_info_arm_smmuv3 { + __u32 flags; + __u32 __reserved; + __u32 idr[6]; + __u32 iidr; + __u32 aidr; +}; + /** * enum iommu_hw_info_type - IOMMU Hardware Info Types * @IOMMU_HW_INFO_TYPE_NONE: Used by the drivers that do not report hardware * info * @IOMMU_HW_INFO_TYPE_INTEL_VTD: Intel VT-d iommu info type + * @IOMMU_HW_INFO_TYPE_ARM_SMMUV3: ARM SMMUv3 iommu info type */ enum iommu_hw_info_type { IOMMU_HW_INFO_TYPE_NONE = 0, IOMMU_HW_INFO_TYPE_INTEL_VTD = 1, + IOMMU_HW_INFO_TYPE_ARM_SMMUV3 = 2, }; /** From cf046130ddc5bdf376c9231fb9700b87da82c037 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 31 Oct 2023 16:28:13 -0300 Subject: [PATCH 263/352] iommu/arm-smmu-v3: Implement IOMMU_HWPT_ALLOC_NEST_PARENT For SMMUv3 the parent must be a S2 domain, which can be composed into an IOMMU_DOMAIN_NESTED. In future the S2 parent will also need a VMID linked to the VIOMMU and even to KVM. Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 02fa2d90e2a25..6038cf8a33022 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3147,7 +3147,8 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, const struct iommu_user_data *user_data) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); - const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING; + const u32 PAGING_FLAGS = IOMMU_HWPT_ALLOC_DIRTY_TRACKING | + IOMMU_HWPT_ALLOC_NEST_PARENT; struct arm_smmu_domain *smmu_domain; int ret; @@ -3160,6 +3161,14 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, if (IS_ERR(smmu_domain)) return ERR_CAST(smmu_domain); + if (flags & IOMMU_HWPT_ALLOC_NEST_PARENT) { + if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) { + ret = -EOPNOTSUPP; + goto err_free; + } + smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + } + smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; smmu_domain->domain.ops = arm_smmu_ops.default_domain_ops; ret = arm_smmu_domain_finalise(smmu_domain, master->smmu, flags); From 050fce2b4b8daf1b75811b3f1fa1b0ddf49af0ae Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 6 Jul 2022 23:37:53 -0700 Subject: [PATCH 264/352] iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED For SMMUv3 an IOMMU_DOMAIN_NESTED is composed of a S2 iommu_domain acting as the parent and a user-provided STE fragment that defines the CD table and related data with addresses translated by the S2 iommu_domain. The kernel only permits userspace to control certain allowed bits of the STE that are safe for user/guest control. IOTLB maintenance is a bit subtle here: the S1 implicitly includes the S2 translation, but there is no way of knowing which S1 entries refer to a range of S2.
For the IOTLB we follow ARM's guidance and issue a CMDQ_OP_TLBI_NH_ALL to flush all ASIDs from the VMID after flushing the S2 on any change to the S2. Similarly we have to flush the entire ATC if the S2 is changed. Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 217 +++++++++++++++++++- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 20 ++ include/uapi/linux/iommufd.h | 20 ++ 3 files changed, 250 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 6038cf8a33022..92b5b9b4ccf39 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -295,6 +295,7 @@ static int arm_smmu_cmdq_build_cmd(u64 *cmd, struct arm_smmu_cmdq_ent *ent) case CMDQ_OP_TLBI_NH_ASID: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_ASID, ent->tlbi.asid); fallthrough; + case CMDQ_OP_TLBI_NH_ALL: case CMDQ_OP_TLBI_S12_VMALL: cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, ent->tlbi.vmid); break; @@ -1682,6 +1683,59 @@ void arm_smmu_make_s2_domain_ste(struct arm_smmu_ste *target, } EXPORT_SYMBOL_IF_KUNIT(arm_smmu_make_s2_domain_ste); +static void arm_smmu_make_nested_cd_table_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + arm_smmu_make_s2_domain_ste(target, master, nested_domain->s2_parent, + ats_enabled); + + target->data[0] = cpu_to_le64(STRTAB_STE_0_V | + FIELD_PREP(STRTAB_STE_0_CFG, + STRTAB_STE_0_CFG_NESTED)) | + (nested_domain->ste[0] & ~STRTAB_STE_0_CFG); + target->data[1] |= nested_domain->ste[1]; +} + +/* + * Create a physical STE from the virtual STE that userspace provided when it + * created the nested domain. Using the vSTE userspace can request: + * - Non-valid STE + * - Abort STE + * - Bypass STE (install the S2, no CD table) + * - CD table STE (install the S2 and the userspace CD table) + */ +static void arm_smmu_make_nested_domain_ste( + struct arm_smmu_ste *target, struct arm_smmu_master *master, + struct arm_smmu_nested_domain *nested_domain, bool ats_enabled) +{ + /* + * Userspace can request a non-valid STE through the nesting interface. + * We relay that into an abort physical STE with the intention that + * C_BAD_STE for this SID can be generated to userspace. + */ + if (!(nested_domain->ste[0] & cpu_to_le64(STRTAB_STE_0_V))) { + arm_smmu_make_abort_ste(target); + return; + } + + switch (FIELD_GET(STRTAB_STE_0_CFG, + le64_to_cpu(nested_domain->ste[0]))) { + case STRTAB_STE_0_CFG_S1_TRANS: + arm_smmu_make_nested_cd_table_ste(target, master, nested_domain, + ats_enabled); + break; + case STRTAB_STE_0_CFG_BYPASS: + arm_smmu_make_s2_domain_ste( + target, master, nested_domain->s2_parent, ats_enabled); + break; + case STRTAB_STE_0_CFG_ABORT: + default: + arm_smmu_make_abort_ste(target); + break; + } +} + /* * This can safely directly manipulate the STE memory without a sync sequence * because the STE table has not been installed in the SMMU yet. @@ -2109,7 +2163,16 @@ int arm_smmu_atc_inv_domain(struct arm_smmu_domain *smmu_domain, if (!master->ats_enabled) continue; - arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, &cmd); + if (master_domain->nest_parent) { + /* + * If a S2 used as a nesting parent is changed we have + * no option but to completely flush the ATC. 
+ */ + arm_smmu_atc_inv_to_cmd(IOMMU_NO_PASID, 0, 0, &cmd); + } else { + arm_smmu_atc_inv_to_cmd(master_domain->ssid, iova, size, + &cmd); + } for (i = 0; i < master->num_streams; i++) { cmd.atc.sid = master->streams[i].id; @@ -2236,6 +2299,16 @@ static void arm_smmu_tlb_inv_range_domain(unsigned long iova, size_t size, } __arm_smmu_tlb_inv_range(&cmd, iova, size, granule, smmu_domain); + if (smmu_domain->stage == ARM_SMMU_DOMAIN_S2 && + smmu_domain->nest_parent) { + /* + * When the S2 domain changes all the nested S1 ASIDs have to be + * flushed too. + */ + cmd.opcode = CMDQ_OP_TLBI_NH_ALL; + arm_smmu_cmdq_issue_cmd_with_sync(smmu_domain->smmu, &cmd); + } + /* * Unfortunately, this can't be leaf-only since we may have * zapped an entire table. @@ -2648,8 +2721,8 @@ static void arm_smmu_disable_pasid(struct arm_smmu_master *master) static struct arm_smmu_master_domain * arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, - struct arm_smmu_master *master, - ioasid_t ssid) + struct arm_smmu_master *master, ioasid_t ssid, + bool nest_parent) { struct arm_smmu_master_domain *master_domain; @@ -2658,7 +2731,8 @@ arm_smmu_find_master_domain(struct arm_smmu_domain *smmu_domain, list_for_each_entry(master_domain, &smmu_domain->devices, devices_elm) { if (master_domain->master == master && - master_domain->ssid == ssid) + master_domain->ssid == ssid && + master_domain->nest_parent == nest_parent) return master_domain; } return NULL; @@ -2678,6 +2752,9 @@ to_smmu_domain_devices(struct iommu_domain *domain) if ((domain->type & __IOMMU_DOMAIN_PAGING) || domain->type == IOMMU_DOMAIN_SVA) return to_smmu_domain(domain); + if (domain->type == IOMMU_DOMAIN_NESTED) + return container_of(domain, struct arm_smmu_nested_domain, + domain)->s2_parent; return NULL; } @@ -2693,7 +2770,8 @@ static void arm_smmu_remove_master_domain(struct arm_smmu_master *master, return; spin_lock_irqsave(&smmu_domain->devices_lock, flags); - master_domain = arm_smmu_find_master_domain(smmu_domain, master, ssid); + master_domain = arm_smmu_find_master_domain( + smmu_domain, master, ssid, domain->type == IOMMU_DOMAIN_NESTED); if (master_domain) { list_del(&master_domain->devices_elm); kfree(master_domain); @@ -2708,6 +2786,7 @@ struct arm_smmu_attach_state { struct iommu_domain *old_domain; struct arm_smmu_master *master; bool cd_needs_ats; + bool disable_ats; ioasid_t ssid; /* Resulting state */ bool ats_enabled; @@ -2760,7 +2839,8 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * enabled if we have arm_smmu_domain, those always have page * tables. 
*/ - state->ats_enabled = arm_smmu_ats_supported(master); + state->ats_enabled = !state->disable_ats && + arm_smmu_ats_supported(master); } if (smmu_domain) { @@ -2769,6 +2849,8 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, return -ENOMEM; master_domain->master = master; master_domain->ssid = state->ssid; + master_domain->nest_parent = new_domain->type == + IOMMU_DOMAIN_NESTED; /* * During prepare we want the current smmu_domain and new @@ -3141,6 +3223,122 @@ static struct iommu_domain arm_smmu_blocked_domain = { .ops = &arm_smmu_blocked_ops, }; +static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, + struct device *dev) +{ + struct arm_smmu_nested_domain *nested_domain = + container_of(domain, struct arm_smmu_nested_domain, domain); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_attach_state state = { + .master = master, + .old_domain = iommu_get_domain_for_dev(dev), + .ssid = IOMMU_NO_PASID, + /* Currently invalidation of ATC is not supported */ + .disable_ats = true, + }; + struct arm_smmu_ste ste; + int ret; + + if (arm_smmu_ssids_in_use(&master->cd_table) || + nested_domain->s2_parent->smmu != master->smmu) + return -EINVAL; + + mutex_lock(&arm_smmu_asid_lock); + ret = arm_smmu_attach_prepare(&state, domain); + if (ret) { + mutex_unlock(&arm_smmu_asid_lock); + return ret; + } + + arm_smmu_make_nested_domain_ste(&ste, master, nested_domain, + state.ats_enabled); + arm_smmu_install_ste_for_dev(master, &ste); + arm_smmu_attach_commit(&state); + mutex_unlock(&arm_smmu_asid_lock); + return 0; +} + +static void arm_smmu_domain_nested_free(struct iommu_domain *domain) +{ + kfree(container_of(domain, struct arm_smmu_nested_domain, domain)); +} + +static const struct iommu_domain_ops arm_smmu_nested_ops = { + .attach_dev = arm_smmu_attach_dev_nested, + .free = arm_smmu_domain_nested_free, +}; + +static struct iommu_domain * +arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags, + struct iommu_domain *parent, + const struct iommu_user_data *user_data) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev); + struct arm_smmu_nested_domain *nested_domain; + struct arm_smmu_domain *smmu_parent; + struct iommu_hwpt_arm_smmuv3 arg; + unsigned int eats; + unsigned int cfg; + int ret; + + if (!(master->smmu->features & ARM_SMMU_FEAT_NESTING)) + return ERR_PTR(-EOPNOTSUPP); + + /* + * Must support some way to prevent the VM from bypassing the cache + * because VFIO currently does not do any cache maintenance. + */ + if (!(fwspec->flags & IOMMU_FWSPEC_PCI_RC_CANWBS) && + !(master->smmu->features & ARM_SMMU_FEAT_S2FWB)) + return ERR_PTR(-EOPNOTSUPP); + + ret = iommu_copy_struct_from_user(&arg, user_data, + IOMMU_HWPT_DATA_ARM_SMMUV3, ste); + if (ret) + return ERR_PTR(ret); + + if (flags || !(master->smmu->features & ARM_SMMU_FEAT_TRANS_S1)) + return ERR_PTR(-EOPNOTSUPP); + + if (!(parent->type & __IOMMU_DOMAIN_PAGING)) + return ERR_PTR(-EINVAL); + + smmu_parent = to_smmu_domain(parent); + if (smmu_parent->stage != ARM_SMMU_DOMAIN_S2 || + smmu_parent->smmu != master->smmu) + return ERR_PTR(-EINVAL); + + /* EIO is reserved for invalid STE data. 
*/ + if ((arg.ste[0] & ~STRTAB_STE_0_NESTING_ALLOWED) || + (arg.ste[1] & ~STRTAB_STE_1_NESTING_ALLOWED)) + return ERR_PTR(-EIO); + + cfg = FIELD_GET(STRTAB_STE_0_CFG, le64_to_cpu(arg.ste[0])); + if (cfg != STRTAB_STE_0_CFG_ABORT && cfg != STRTAB_STE_0_CFG_BYPASS && + cfg != STRTAB_STE_0_CFG_S1_TRANS) + return ERR_PTR(-EIO); + + eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg.ste[1])); + if (eats != STRTAB_STE_1_EATS_ABT) + return ERR_PTR(-EIO); + + if (cfg != STRTAB_STE_0_CFG_S1_TRANS) + eats = STRTAB_STE_1_EATS_ABT; + + nested_domain = kzalloc(sizeof(*nested_domain), GFP_KERNEL_ACCOUNT); + if (!nested_domain) + return ERR_PTR(-ENOMEM); + + nested_domain->domain.type = IOMMU_DOMAIN_NESTED; + nested_domain->domain.ops = &arm_smmu_nested_ops; + nested_domain->s2_parent = smmu_parent; + nested_domain->ste[0] = arg.ste[0]; + nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS); + + return &nested_domain->domain; +} + static struct iommu_domain * arm_smmu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, @@ -3152,9 +3350,13 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, struct arm_smmu_domain *smmu_domain; int ret; + if (parent) + return arm_smmu_domain_alloc_nesting(dev, flags, parent, + user_data); + if (flags & ~PAGING_FLAGS) return ERR_PTR(-EOPNOTSUPP); - if (parent || user_data) + if (user_data) return ERR_PTR(-EOPNOTSUPP); smmu_domain = arm_smmu_domain_alloc(); @@ -3167,6 +3369,7 @@ arm_smmu_domain_alloc_user(struct device *dev, u32 flags, goto err_free; } smmu_domain->stage = ARM_SMMU_DOMAIN_S2; + smmu_domain->nest_parent = true; } smmu_domain->domain.type = IOMMU_DOMAIN_UNMANAGED; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 88afff159e9cb..5378541fe697a 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -244,6 +244,7 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_0_CFG_BYPASS 4 #define STRTAB_STE_0_CFG_S1_TRANS 5 #define STRTAB_STE_0_CFG_S2_TRANS 6 +#define STRTAB_STE_0_CFG_NESTED 7 #define STRTAB_STE_0_S1FMT GENMASK_ULL(5, 4) #define STRTAB_STE_0_S1FMT_LINEAR 0 @@ -295,6 +296,15 @@ static inline u32 arm_smmu_strtab_l2_idx(u32 sid) #define STRTAB_STE_3_S2TTB_MASK GENMASK_ULL(51, 4) +/* These bits can be controlled by userspace for STRTAB_STE_0_CFG_NESTED */ +#define STRTAB_STE_0_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_0_V | STRTAB_STE_0_CFG | STRTAB_STE_0_S1FMT | \ + STRTAB_STE_0_S1CTXPTR_MASK | STRTAB_STE_0_S1CDMAX) +#define STRTAB_STE_1_NESTING_ALLOWED \ + cpu_to_le64(STRTAB_STE_1_S1DSS | STRTAB_STE_1_S1CIR | \ + STRTAB_STE_1_S1COR | STRTAB_STE_1_S1CSH | \ + STRTAB_STE_1_S1STALLD | STRTAB_STE_1_EATS) + /* * Context descriptors. * @@ -514,6 +524,7 @@ struct arm_smmu_cmdq_ent { }; } cfgi; + #define CMDQ_OP_TLBI_NH_ALL 0x10 #define CMDQ_OP_TLBI_NH_ASID 0x11 #define CMDQ_OP_TLBI_NH_VA 0x12 #define CMDQ_OP_TLBI_EL2_ALL 0x20 @@ -816,10 +827,18 @@ struct arm_smmu_domain { struct list_head devices; spinlock_t devices_lock; bool enforce_cache_coherency : 1; + bool nest_parent : 1; struct mmu_notifier mmu_notifier; }; +struct arm_smmu_nested_domain { + struct iommu_domain domain; + struct arm_smmu_domain *s2_parent; + + __le64 ste[2]; +}; + /* The following are exposed for testing purposes. 
*/ struct arm_smmu_entry_writer_ops; struct arm_smmu_entry_writer { @@ -856,6 +875,7 @@ struct arm_smmu_master_domain { struct list_head devices_elm; struct arm_smmu_master *master; ioasid_t ssid; + u8 nest_parent; }; static inline struct arm_smmu_domain *to_smmu_domain(struct iommu_domain *dom) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index b5c94fecb94ca..cd4920886ad05 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -394,14 +394,34 @@ struct iommu_hwpt_vtd_s1 { __u32 __reserved; }; +/** + * struct iommu_hwpt_arm_smmuv3 - ARM SMMUv3 Context Descriptor Table info + * (IOMMU_HWPT_DATA_ARM_SMMUV3) + * + * @ste: The first two double words of the user space Stream Table Entry for + * a user stage-1 Context Descriptor Table. Must be little-endian. + * Allowed fields: (Refer to "5.2 Stream Table Entry" in SMMUv3 HW Spec) + * - word-0: V, Cfg, S1Fmt, S1ContextPtr, S1CDMax + * - word-1: S1DSS, S1CIR, S1COR, S1CSH, S1STALLD + * + * -EIO will be returned if @ste is not legal or contains any non-allowed field. + * Cfg can be used to select a S1, Bypass or Abort configuration. A Bypass + * nested domain will translate the same as the nesting parent. + */ +struct iommu_hwpt_arm_smmuv3 { + __aligned_le64 ste[2]; +}; + /** * enum iommu_hwpt_data_type - IOMMU HWPT Data Type * @IOMMU_HWPT_DATA_NONE: no data * @IOMMU_HWPT_DATA_VTD_S1: Intel VT-d stage-1 page table + * @IOMMU_HWPT_DATA_ARM_SMMUV3: ARM SMMUv3 Context Descriptor Table */ enum iommu_hwpt_data_type { IOMMU_HWPT_DATA_NONE = 0, IOMMU_HWPT_DATA_VTD_S1 = 1, + IOMMU_HWPT_DATA_ARM_SMMUV3 = 2, }; /** From 7563a9e49d00be9edf4b7db7bb3ed65ec95bd3ae Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 31 Oct 2023 16:21:10 -0300 Subject: [PATCH 265/352] cover-letter: Initial support for SMMUv3 nested translation This brings support for the IOMMUFD ioctls: - IOMMU_GET_HW_INFO - IOMMU_HWPT_ALLOC_NEST_PARENT - IOMMU_DOMAIN_NESTED - ops->enforce_cache_coherency() This is quite straightforward as the nested STE can just be built in the special NESTED domain op and fed through the generic update machinery. The design allows the user-provided STE fragment to control several aspects of the translation, including putting the STE into a "virtual bypass" or an aborting state. This duplicates functionality available by other means, but it allows trivially preserving the VMID in the STE as we eventually move towards the VIOMMU owning the VMID. Nesting support requires the system to either support S2FWB or the stronger CANWBS ACPI flag. This is to ensure the VM cannot bypass the cache and view incoherent data; currently VFIO lacks any cache flushing that would make this safe. Yan has a series to add some of the needed infrastructure for VFIO cache flushing here: https://lore.kernel.org/linux-iommu/20240507061802.20184-1-yan.y.zhao@intel.com/ That may someday allow relaxing this further. Remove VFIO_TYPE1_NESTING_IOMMU since it was never used and is superseded by this. This is the first series in what will be several to complete nesting support. At least: - IOMMU_RESV_SW_MSI related fixups - VIOMMU object support to allow ATS and CD invalidations - vCMDQ hypervisor support for direct invalidation queue assignment - KVM pinned VMID using VIOMMU for vBTM - Cross instance S2 sharing - Virtual Machine Structure using VIOMMU (for vMPAM?) - Fault forwarding support through IOMMUFD's fault fd for vSVA The VIOMMU series is essential to allow the invalidations to be processed for the CD as well.
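For orientation, the flow a VMM would drive with the ioctls this series brings looks roughly like the following hypothetical sketch (the fd, ids and vSTE words are illustrative and error handling is omitted; the UAPI structures are the ones added in this series):

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/iommufd.h>

	/* Hypothetical VMM-side sketch of the nesting allocation flow */
	static uint32_t alloc_nested_hwpt(int iommufd, uint32_t dev_id,
					  uint32_t ioas_id, uint64_t vste_word0,
					  uint64_t vste_word1)
	{
		struct iommu_hwpt_arm_smmuv3 vste = {
			.ste = { vste_word0, vste_word1 },	/* little-endian vSTE */
		};
		struct iommu_hwpt_alloc cmd = {
			.size = sizeof(cmd),
			.flags = IOMMU_HWPT_ALLOC_NEST_PARENT,
			.dev_id = dev_id,
			.pt_id = ioas_id,
		};

		ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd);	/* S2 nesting parent */

		cmd.flags = 0;
		cmd.pt_id = cmd.out_hwpt_id;		/* parent for the user S1 */
		cmd.data_type = IOMMU_HWPT_DATA_ARM_SMMUV3;
		cmd.data_len = sizeof(vste);
		cmd.data_uptr = (uintptr_t)&vste;
		ioctl(iommufd, IOMMU_HWPT_ALLOC, &cmd);	/* IOMMU_DOMAIN_NESTED */

		return cmd.out_hwpt_id;
	}
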
This series is enough to allow significant amounts of qemu work to progress. This is on github: https://github.com/jgunthorpe/linux/commits/smmuv3_nesting v2: - Revise commit messages - Guard S2FWB support with ARM_SMMU_FEAT_COHERENCY, since it doesn't make sense to use S2FWB to enforce coherency on inherently non-coherent hardware. - Add missing IO_PGTABLE_QUIRK_ARM_S2FWB validation - Include formal ACPICA commit for IORT built using generate/linux/gen-patch.sh - Use FEAT_NESTING to block creating a NESTING_PARENT - Use an abort STE instead of non-valid if the user requests a non-valid vSTE - Consistently use 'nest_parent' for naming variables - Use the right domain for arm_smmu_remove_master_domain() when it removes the master - Join bitfields together - Drop arm_smmu_cache_invalidate_user patch; invalidation will exclusively go via viommu v1: https://patch.msgid.link/r/0-v1-54e734311a7f+14f72-smmuv3_nesting_jgg@nvidia.com Cc: Nicolin Chen Cc: Michael Shavit Cc: Jean-Philippe Brucker Cc: Shameerali Kolothum Thodi Cc: Eric Auger Cc: Moritz Fischer Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs From 003cf964deac9fd88ba0a215806c5f87a8673094 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 17 Jul 2024 20:02:35 +0000 Subject: [PATCH 266/352] WAR: ACPI/IORT: Set CANWBS for Grace CPU Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/acpi/arm64/iort.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 4700092cf5129..a584ff429169a 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1226,7 +1226,8 @@ static bool iort_pci_rc_supports_canwbs(struct acpi_iort_node *node) pci_rc = (struct acpi_iort_root_complex *)node->node_data; memory_access = (struct acpi_iort_memory_access *)&pci_rc->memory_properties; - return memory_access->memory_flags & ACPI_IORT_MF_CANWBS; + /* Grace supports CANWBS, return true until we set that in firmware */ + return true; } static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node, From 414775e58b84d7a8b565df3606964f5744cc9b6f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 31 May 2024 00:14:34 +0000 Subject: [PATCH 267/352] cover-letter: WAR for nesting patches Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs From 279125bd50a1349bbbd2279e4d7a075a9a0ba073 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 21 Aug 2024 19:23:51 +0000 Subject: [PATCH 268/352] iommufd: Reorder struct forward declarations Reorder struct forward declarations to alphabetic order to simplify maintenance, as upcoming patches will add more to the list. No functional change intended. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R.
Ochs --- include/linux/iommufd.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index c2f2f6b9148e2..30f832a60ccb3 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -11,12 +11,12 @@ #include struct device; -struct iommufd_device; -struct page; -struct iommufd_ctx; -struct iommufd_access; struct file; struct iommu_group; +struct iommufd_access; +struct iommufd_ctx; +struct iommufd_device; +struct page; struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); From 1c151ea03904cc556913d11c4277df921b57b8f6 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 5 Apr 2024 03:33:27 +0000 Subject: [PATCH 269/352] iommufd/viommu: Add IOMMUFD_OBJ_VIOMMU and IOMMU_VIOMMU_ALLOC ioctl Add a new IOMMUFD_OBJ_VIOMMU with an iommufd_viommu structure to represent a vIOMMU instance in user space, backed by a physical IOMMU for its HW-accelerated virtualization feature, such as nested translation support for a multi-viommu-instance VM, NVIDIA CMDQ-Virtualization extension for ARM SMMUv3, and AMD Hardware Accelerated Virtualized IOMMU (vIOMMU). Also, add a new ioctl for user space to do a viommu allocation. It must be based on a nested parent HWPT, so take its refcount. As an initial version, support a viommu of IOMMU_VIOMMU_TYPE_DEFAULT type. IOMMUFD core can use this viommu to store a virtual device ID lookup table in a following patch. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/Makefile | 3 +- drivers/iommu/iommufd/iommufd_private.h | 12 +++++ drivers/iommu/iommufd/main.c | 6 +++ drivers/iommu/iommufd/viommu.c | 72 +++++++++++++++++++++++++ include/uapi/linux/iommufd.h | 30 +++++++++++ 5 files changed, 122 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/iommufd/viommu.c diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index cf4605962bea6..df490e836b301 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -7,7 +7,8 @@ iommufd-y := \ ioas.o \ main.o \ pages.o \ - vfio_compat.o + vfio_compat.o \ + viommu.o iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 017e50574f3b2..d8946efa0fbdd 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -131,6 +131,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, IOMMUFD_OBJ_FAULT, + IOMMUFD_OBJ_VIOMMU, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif @@ -507,6 +508,17 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); } +struct iommufd_viommu { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_hwpt_paging *hwpt; + + unsigned int type; +}; + +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_viommu_destroy(struct iommufd_object *obj); + #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); void iommufd_selftest_destroy(struct iommufd_object *obj); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index b5f5d27ee9634..288ee51b6829b 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -333,6 +333,7 @@ union ucmd_buffer { struct iommu_ioas_unmap unmap; struct iommu_option
option; struct iommu_vfio_ioas vfio_ioas; + struct iommu_viommu_alloc viommu; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -384,6 +385,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { val64), IOCTL_OP(IOMMU_VFIO_IOAS, iommufd_vfio_ioas, struct iommu_vfio_ioas, __reserved), + IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, + struct iommu_viommu_alloc, out_viommu_id), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -519,6 +522,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_FAULT] = { .destroy = iommufd_fault_destroy, }, + [IOMMUFD_OBJ_VIOMMU] = { + .destroy = iommufd_viommu_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c new file mode 100644 index 0000000000000..200653a4bf579 --- /dev/null +++ b/drivers/iommu/iommufd/viommu.c @@ -0,0 +1,72 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ + +#include "iommufd_private.h" + +void iommufd_viommu_destroy(struct iommufd_object *obj) +{ + struct iommufd_viommu *viommu = + container_of(obj, struct iommufd_viommu, obj); + + refcount_dec(&viommu->hwpt->common.obj.users); +} + +int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_viommu_alloc *cmd = ucmd->cmd; + struct iommufd_hwpt_paging *hwpt_paging; + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + + hwpt_paging = iommufd_get_hwpt_paging(ucmd, cmd->hwpt_id); + if (IS_ERR(hwpt_paging)) { + rc = PTR_ERR(hwpt_paging); + goto out_put_idev; + } + + if (!hwpt_paging->nest_parent) { + rc = -EINVAL; + goto out_put_hwpt; + } + + if (cmd->type != IOMMU_VIOMMU_TYPE_DEFAULT) { + rc = -EOPNOTSUPP; + goto out_put_hwpt; + } + + viommu = iommufd_object_alloc(ucmd->ictx, viommu, IOMMUFD_OBJ_VIOMMU); + if (IS_ERR(viommu)) { + rc = PTR_ERR(viommu); + goto out_put_hwpt; + } + + viommu->type = cmd->type; + viommu->ictx = ucmd->ictx; + viommu->hwpt = hwpt_paging; + + refcount_inc(&viommu->hwpt->common.obj.users); + + cmd->out_viommu_id = viommu->obj.id; + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_abort; + iommufd_object_finalize(ucmd->ictx, &viommu->obj); + goto out_put_hwpt; + +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &viommu->obj); +out_put_hwpt: + iommufd_put_object(ucmd->ictx, &hwpt_paging->common.obj); +out_put_idev: + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index cd4920886ad05..ac77903b5cc48 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -51,6 +51,7 @@ enum { IOMMUFD_CMD_HWPT_GET_DIRTY_BITMAP = 0x8c, IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, + IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f, }; /** @@ -852,4 +853,33 @@ struct iommu_fault_alloc { __u32 out_fault_fd; }; #define IOMMU_FAULT_QUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_FAULT_QUEUE_ALLOC) + +/** + * enum iommu_viommu_type - Virtual IOMMU Type + * @IOMMU_VIOMMU_TYPE_DEFAULT: Core-managed VIOMMU type + */ +enum iommu_viommu_type { + IOMMU_VIOMMU_TYPE_DEFAULT = 0, +}; + +/** + * struct iommu_viommu_alloc - ioctl(IOMMU_VIOMMU_ALLOC) + * @size: sizeof(struct 
iommu_viommu_alloc) + * @flags: Must be 0 + * @type: Type of the virtual IOMMU. Must be defined in enum iommu_viommu_type + * @dev_id: The device to allocate this virtual IOMMU for + * @hwpt_id: ID of a nesting parent HWPT to associate to + * @out_viommu_id: Output virtual IOMMU ID for the allocated object + * + * Allocate a virtual IOMMU object that holds a (shared) nesting parent HWPT + */ +struct iommu_viommu_alloc { + __u32 size; + __u32 flags; + __u32 type; + __u32 dev_id; + __u32 hwpt_id; + __u32 out_viommu_id; +}; +#define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) #endif From 3b0bafadd7b4da2bc72e5ad1bdf7af3c569324a5 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 21 May 2024 01:16:11 +0000 Subject: [PATCH 270/352] iommu: Pass in a viommu pointer to domain_alloc_user op With a viommu object wrapping a potentially shareable S2 domain, a nested domain should be allocated by associating to a viommu instead. The driver can store this viommu pointer somewhere, so that it can later call viommu helpers for virtual device ID lookup and viommu invalidation. For drivers without viommu support, keep the parent domain input, which should be just viommu->hwpt->common.domain otherwise. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/amd/iommu.c | 1 + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 1 + drivers/iommu/intel/iommu.c | 1 + drivers/iommu/iommufd/hw_pagetable.c | 5 +++-- drivers/iommu/iommufd/selftest.c | 1 + include/linux/iommu.h | 2 ++ 6 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index f945bf3253ce0..bf3bca31ffd5e 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -2244,6 +2244,7 @@ static struct iommu_domain *amd_iommu_domain_alloc(unsigned int type) static struct iommu_domain * amd_iommu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, + struct iommufd_viommu *viommu, const struct iommu_user_data *user_data) { diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 92b5b9b4ccf39..b7eaf9df1f87c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3342,6 +3342,7 @@ arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags, static struct iommu_domain * arm_smmu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, + struct iommufd_viommu *viommu, const struct iommu_user_data *user_data) { struct arm_smmu_master *master = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 6155877984c46..9a7c4e9c0aea3 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3980,6 +3980,7 @@ static struct iommu_domain *intel_iommu_domain_alloc(unsigned type) static struct iommu_domain * intel_iommu_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, + struct iommufd_viommu *viommu, const struct iommu_user_data *user_data) { struct device_domain_info *info = dev_iommu_priv_get(dev); diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index aefde4443671e..c21bb59c4022d 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -137,7 +137,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, if (ops->domain_alloc_user) {
hwpt->domain = ops->domain_alloc_user(idev->dev, flags, NULL, - user_data); + NULL, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; @@ -239,7 +239,8 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, hwpt->domain = ops->domain_alloc_user(idev->dev, flags & ~IOMMU_HWPT_FAULT_ID_VALID, - parent->common.domain, user_data); + parent->common.domain, + NULL, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index c5c14bbf723cb..d35fa95cb430c 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -319,6 +319,7 @@ __mock_domain_alloc_nested(struct mock_iommu_domain *mock_parent, static struct iommu_domain * mock_domain_alloc_user(struct device *dev, u32 flags, struct iommu_domain *parent, + struct iommufd_viommu *viommu, const struct iommu_user_data *user_data) { struct mock_iommu_domain *mock_parent; diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 16dcd4ec9fcce..0543e328a16b2 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -42,6 +42,7 @@ struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; struct iommu_fault_param; +struct iommufd_viommu; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */ @@ -565,6 +566,7 @@ struct iommu_ops { struct iommu_domain *(*domain_alloc)(unsigned iommu_domain_type); struct iommu_domain *(*domain_alloc_user)( struct device *dev, u32 flags, struct iommu_domain *parent, + struct iommufd_viommu *viommu, const struct iommu_user_data *user_data); struct iommu_domain *(*domain_alloc_paging)(struct device *dev); struct iommu_domain *(*domain_alloc_sva)(struct device *dev, From 874cef50bc40b547b2f72063dc22a0543799e62d Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 21 May 2024 01:17:30 +0000 Subject: [PATCH 271/352] iommufd: Allow pt_id to carry viommu_id for IOMMU_HWPT_ALLOC Now a VIOMMU can wrap a shareable nested parent HWPT. So, it can act like a nested parent HWPT to allocate a nested HWPT. Support that in the IOMMU_HWPT_ALLOC ioctl handler, and update its kdoc. Also, associate a viommu with the nested HWPT being allocated. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/hw_pagetable.c | 24 ++++++++++++++++++++++-- drivers/iommu/iommufd/iommufd_private.h | 1 + include/uapi/linux/iommufd.h | 12 ++++++------ 3 files changed, 29 insertions(+), 8 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index c21bb59c4022d..06adbcc304bcd 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -57,6 +57,9 @@ void iommufd_hwpt_nested_destroy(struct iommufd_object *obj) container_of(obj, struct iommufd_hwpt_nested, common.obj); __iommufd_hwpt_destroy(&hwpt_nested->common); + + if (hwpt_nested->viommu) + refcount_dec(&hwpt_nested->viommu->obj.users); refcount_dec(&hwpt_nested->parent->common.obj.users); } @@ -213,6 +216,7 @@ iommufd_hwpt_paging_alloc(struct iommufd_ctx *ictx, struct iommufd_ioas *ioas, */ static struct iommufd_hwpt_nested * iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, + struct iommufd_viommu *viommu, struct iommufd_hwpt_paging *parent, struct iommufd_device *idev, u32 flags, const struct iommu_user_data *user_data) @@ -234,13 +238,16 @@ iommufd_hwpt_nested_alloc(struct iommufd_ctx *ictx, return ERR_CAST(hwpt_nested); hwpt = &hwpt_nested->common; + if (viommu) + refcount_inc(&viommu->obj.users); + hwpt_nested->viommu = viommu; refcount_inc(&parent->common.obj.users); hwpt_nested->parent = parent; hwpt->domain = ops->domain_alloc_user(idev->dev, flags & ~IOMMU_HWPT_FAULT_ID_VALID, parent->common.domain, - NULL, user_data); + viommu, user_data); if (IS_ERR(hwpt->domain)) { rc = PTR_ERR(hwpt->domain); hwpt->domain = NULL; @@ -307,7 +314,7 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) struct iommufd_hwpt_nested *hwpt_nested; hwpt_nested = iommufd_hwpt_nested_alloc( - ucmd->ictx, + ucmd->ictx, NULL, container_of(pt_obj, struct iommufd_hwpt_paging, common.obj), idev, cmd->flags, &user_data); @@ -316,6 +323,19 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) goto out_unlock; } hwpt = &hwpt_nested->common; + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_hwpt_nested *hwpt_nested; + struct iommufd_viommu *viommu; + + viommu = container_of(pt_obj, struct iommufd_viommu, obj); + hwpt_nested = iommufd_hwpt_nested_alloc( + ucmd->ictx, viommu, viommu->hwpt, idev, + cmd->flags, &user_data); + if (IS_ERR(hwpt_nested)) { + rc = PTR_ERR(hwpt_nested); + goto out_unlock; + } + hwpt = &hwpt_nested->common; } else { rc = -EINVAL; goto out_put_pt; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index d8946efa0fbdd..470a32afcc3e6 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -313,6 +313,7 @@ struct iommufd_hwpt_paging { struct iommufd_hwpt_nested { struct iommufd_hw_pagetable common; struct iommufd_hwpt_paging *parent; + struct iommufd_viommu *viommu; }; static inline bool hwpt_is_paging(struct iommufd_hw_pagetable *hwpt) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index ac77903b5cc48..51ce6a019c34b 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -430,7 +430,7 @@ enum iommu_hwpt_data_type { * @size: sizeof(struct iommu_hwpt_alloc) * @flags: Combination of enum iommufd_hwpt_alloc_flags * @dev_id: The device to allocate this HWPT for - * @pt_id: The IOAS or HWPT to connect this HWPT to + * @pt_id: The IOAS or HWPT or VIOMMU to connect this HWPT to * @out_hwpt_id: The ID of the new HWPT * @__reserved: Must be 0 * @data_type: One of enum iommu_hwpt_data_type @@ 
-449,11 +449,11 @@ enum iommu_hwpt_data_type { * IOMMU_HWPT_DATA_NONE. The HWPT can be allocated as a parent HWPT for a * nesting configuration by passing IOMMU_HWPT_ALLOC_NEST_PARENT via @flags. * - * A user-managed nested HWPT will be created from a given parent HWPT via - * @pt_id, in which the parent HWPT must be allocated previously via the - * same ioctl from a given IOAS (@pt_id). In this case, the @data_type - * must be set to a pre-defined type corresponding to an I/O page table - * type supported by the underlying IOMMU hardware. + * A user-managed nested HWPT will be created from a given VIOMMU (wrapping a + * parent HWPT) or a parent HWPT via @pt_id, in which the parent HWPT must be + * allocated previously via the same ioctl from a given IOAS (@pt_id). In this + * case, the @data_type must be set to a pre-defined type corresponding to an + * I/O page table type supported by the underlying IOMMU hardware. * * If the @data_type is set to IOMMU_HWPT_DATA_NONE, @data_len and * @data_uptr should be zero. Otherwise, both @data_len and @data_uptr From fd6bea3685cc32e43f31525f3ad05795b9571546 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jan 2024 09:50:06 -0800 Subject: [PATCH 272/352] iommufd/selftest: Add IOMMU_VIOMMU_ALLOC test coverage Use IOMMU_VIOMMU_TYPE_DEFAULT to cover the new IOMMU_VIOMMU_ALLOC ioctl. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- tools/testing/selftests/iommu/iommufd.c | 35 +++++++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 28 +++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6343f4053bd46..5c770e94f299d 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -554,6 +554,41 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) } } +TEST_F(iommufd_ioas, viommu_default) +{ + uint32_t dev_id = self->device_id; + uint32_t viommu_id = 0; + uint32_t hwpt_id = 0; + + if (dev_id) { + /* Negative test -- invalid hwpt */ + test_err_viommu_alloc(ENOENT, dev_id, hwpt_id, + IOMMU_VIOMMU_TYPE_DEFAULT, &viommu_id); + + /* Negative test -- not a nested parent hwpt */ + test_cmd_hwpt_alloc(dev_id, self->ioas_id, 0, &hwpt_id); + test_err_viommu_alloc(EINVAL, dev_id, hwpt_id, + IOMMU_VIOMMU_TYPE_DEFAULT, &viommu_id); + test_ioctl_destroy(hwpt_id); + + /* Allocate a nested parent HWPT */ + test_cmd_hwpt_alloc(dev_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, + &hwpt_id); + /* Negative test -- unsupported viommu type */ + test_err_viommu_alloc(EOPNOTSUPP, dev_id, hwpt_id, + 0xdead, &viommu_id); + /* Allocate a default type of viommu */ + test_cmd_viommu_alloc(dev_id, hwpt_id, + IOMMU_VIOMMU_TYPE_DEFAULT, &viommu_id); + test_ioctl_destroy(viommu_id); + test_ioctl_destroy(hwpt_id); + } else { + test_err_viommu_alloc(ENOENT, dev_id, hwpt_id, + IOMMU_VIOMMU_TYPE_DEFAULT, &viommu_id); + } +} + TEST_F(iommufd_ioas, hwpt_attach) { /* Create a device attached directly to a hwpt */ diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 40f6f14ce136f..307d097db9dd5 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -762,3 +762,31 @@ static int _test_cmd_trigger_iopf(int fd, __u32 device_id, __u32 fault_fd) #define test_cmd_trigger_iopf(device_id, fault_fd) \ ASSERT_EQ(0, _test_cmd_trigger_iopf(self->fd, device_id, 
fault_fd)) + +static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id, + __u32 type, __u32 flags, __u32 *viommu_id) +{ + struct iommu_viommu_alloc cmd = { + .size = sizeof(cmd), + .flags = flags, + .type = type, + .dev_id = device_id, + .hwpt_id = hwpt_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_VIOMMU_ALLOC, &cmd); + if (ret) + return ret; + if (viommu_id) + *viommu_id = cmd.out_viommu_id; + return 0; +} + +#define test_cmd_viommu_alloc(device_id, hwpt_id, type, viommu_id) \ + ASSERT_EQ(0, _test_cmd_viommu_alloc(self->fd, device_id, hwpt_id, \ + type, 0, viommu_id)) #define test_err_viommu_alloc(_errno, device_id, hwpt_id, type, viommu_id) \ + EXPECT_ERRNO(_errno, _test_cmd_viommu_alloc(self->fd, device_id, \ + hwpt_id, type, 0, \ + viommu_id)) From c0915702a5dc21dfdd8f8a4254b4ee4176ab0af2 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 18 Dec 2023 16:01:03 -0800 Subject: [PATCH 273/352] iommufd/viommu: Add IOMMU_VIOMMU_SET/UNSET_VDEV_ID ioctl Introduce a pair of new ioctls to set/unset a per-viommu virtual device id that should be linked to a physical device id via an idev pointer. Continue supporting IOMMU_VIOMMU_TYPE_DEFAULT for a core-managed viommu. Provide a lookup function for drivers to find a device pointer by a virtual device id. Add a rw_semaphore protection around the vdev_id list. Any future ioctl handlers that potentially access the list must grab the lock too. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/device.c | 12 +++ drivers/iommu/iommufd/iommufd_private.h | 21 ++++ drivers/iommu/iommufd/main.c | 6 ++ drivers/iommu/iommufd/viommu.c | 121 ++++++++++++++++++++++ include/uapi/linux/iommufd.h | 40 ++++++++ 5 files changed, 200 insertions(+) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 895f2a59fde13..670a9c1ab8f75 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -136,6 +136,18 @@ void iommufd_device_destroy(struct iommufd_object *obj) struct iommufd_device *idev = container_of(obj, struct iommufd_device, obj); + /* Unlocked since there should be no race in a destroy() */ + if (idev->vdev_id) { + struct iommufd_vdev_id *vdev_id = idev->vdev_id; + struct iommufd_viommu *viommu = vdev_id->viommu; + struct iommufd_vdev_id *old; + + old = xa_cmpxchg(&viommu->vdev_ids, vdev_id->id, vdev_id, NULL, + GFP_KERNEL); + WARN_ON(old != vdev_id); + kfree(vdev_id); + idev->vdev_id = NULL; + } iommu_device_release_dma_owner(idev->dev); iommufd_put_group(idev->igroup); if (!iommufd_selftest_is_mock_dev(idev->dev)) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 470a32afcc3e6..22ac1ff250f24 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -397,6 +397,7 @@ struct iommufd_device { struct iommufd_object obj; struct iommufd_ctx *ictx; struct iommufd_group *igroup; + struct iommufd_vdev_id *vdev_id; struct list_head group_item; /* always the physical device */ struct device *dev; @@ -514,11 +515,31 @@ struct iommufd_viommu { struct iommufd_ctx *ictx; struct iommufd_hwpt_paging *hwpt; + /* The locking order is vdev_ids_rwsem -> igroup::lock */ + struct rw_semaphore vdev_ids_rwsem; + struct xarray vdev_ids; + unsigned int type; }; +struct iommufd_vdev_id { + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + u64 id; +}; + +static inline struct iommufd_viommu * 
+iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_VIOMMU), + struct iommufd_viommu, obj); +} + int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); +int iommufd_viommu_set_vdev_id(struct iommufd_ucmd *ucmd); +int iommufd_viommu_unset_vdev_id(struct iommufd_ucmd *ucmd); #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 288ee51b6829b..199ad90fa36b1 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -334,6 +334,8 @@ union ucmd_buffer { struct iommu_option option; struct iommu_vfio_ioas vfio_ioas; struct iommu_viommu_alloc viommu; + struct iommu_viommu_set_vdev_id set_vdev_id; + struct iommu_viommu_unset_vdev_id unset_vdev_id; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -387,6 +389,10 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { __reserved), IOCTL_OP(IOMMU_VIOMMU_ALLOC, iommufd_viommu_alloc_ioctl, struct iommu_viommu_alloc, out_viommu_id), + IOCTL_OP(IOMMU_VIOMMU_SET_VDEV_ID, iommufd_viommu_set_vdev_id, + struct iommu_viommu_set_vdev_id, vdev_id), + IOCTL_OP(IOMMU_VIOMMU_UNSET_VDEV_ID, iommufd_viommu_unset_vdev_id, + struct iommu_viommu_unset_vdev_id, vdev_id), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 200653a4bf579..8ffcd72b16b8c 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -8,6 +8,15 @@ void iommufd_viommu_destroy(struct iommufd_object *obj) { struct iommufd_viommu *viommu = container_of(obj, struct iommufd_viommu, obj); + struct iommufd_vdev_id *vdev_id; + unsigned long index; + + xa_for_each(&viommu->vdev_ids, index, vdev_id) { + /* Unlocked since there should be no race in a destroy() */ + vdev_id->idev->vdev_id = NULL; + kfree(vdev_id); + } + xa_destroy(&viommu->vdev_ids); refcount_dec(&viommu->hwpt->common.obj.users); } @@ -53,6 +62,9 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) viommu->ictx = ucmd->ictx; viommu->hwpt = hwpt_paging; + xa_init(&viommu->vdev_ids); + init_rwsem(&viommu->vdev_ids_rwsem); + refcount_inc(&viommu->hwpt->common.obj.users); cmd->out_viommu_id = viommu->obj.id; @@ -70,3 +82,112 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) iommufd_put_object(ucmd->ictx, &idev->obj); return rc; } + +int iommufd_viommu_set_vdev_id(struct iommufd_ucmd *ucmd) +{ + struct iommu_viommu_set_vdev_id *cmd = ucmd->cmd; + struct iommufd_vdev_id *vdev_id, *curr; + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + int rc = 0; + + if (cmd->vdev_id > ULONG_MAX) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_put_viommu; + } + + down_write(&viommu->vdev_ids_rwsem); + mutex_lock(&idev->igroup->lock); + if (idev->vdev_id) { + rc = -EEXIST; + goto out_unlock_igroup; + } + + vdev_id = kzalloc(sizeof(*vdev_id), GFP_KERNEL); + if (!vdev_id) { + rc = -ENOMEM; + goto out_unlock_igroup; + } + + vdev_id->idev = idev; + vdev_id->viommu = viommu; + vdev_id->id = cmd->vdev_id; + + curr = xa_cmpxchg(&viommu->vdev_ids, cmd->vdev_id, NULL, vdev_id, + GFP_KERNEL); + if (curr) { + rc = xa_err(curr) ? 
: -EBUSY; + goto out_free; + } + + idev->vdev_id = vdev_id; + goto out_unlock_igroup; + +out_free: + kfree(vdev_id); +out_unlock_igroup: + mutex_unlock(&idev->igroup->lock); + up_write(&viommu->vdev_ids_rwsem); + iommufd_put_object(ucmd->ictx, &idev->obj); +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} + +int iommufd_viommu_unset_vdev_id(struct iommufd_ucmd *ucmd) +{ + struct iommu_viommu_unset_vdev_id *cmd = ucmd->cmd; + struct iommufd_viommu *viommu; + struct iommufd_vdev_id *old; + struct iommufd_device *idev; + int rc = 0; + + if (cmd->vdev_id > ULONG_MAX) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + idev = iommufd_get_device(ucmd, cmd->dev_id); + if (IS_ERR(idev)) { + rc = PTR_ERR(idev); + goto out_put_viommu; + } + + down_write(&viommu->vdev_ids_rwsem); + mutex_lock(&idev->igroup->lock); + if (!idev->vdev_id) { + rc = -ENOENT; + goto out_unlock_igroup; + } + if (idev->vdev_id->id != cmd->vdev_id) { + rc = -EINVAL; + goto out_unlock_igroup; + } + + old = xa_cmpxchg(&viommu->vdev_ids, idev->vdev_id->id, + idev->vdev_id, NULL, GFP_KERNEL); + if (xa_is_err(old)) { + rc = xa_err(old); + goto out_unlock_igroup; + } + kfree(old); + idev->vdev_id = NULL; + +out_unlock_igroup: + mutex_unlock(&idev->igroup->lock); + up_write(&viommu->vdev_ids_rwsem); + iommufd_put_object(ucmd->ictx, &idev->obj); +out_put_viommu: + iommufd_put_object(ucmd->ictx, &viommu->obj); + return rc; +} diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 51ce6a019c34b..1816e89c922da 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -52,6 +52,8 @@ enum { IOMMUFD_CMD_HWPT_INVALIDATE = 0x8d, IOMMUFD_CMD_FAULT_QUEUE_ALLOC = 0x8e, IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f, + IOMMUFD_CMD_VIOMMU_SET_VDEV_ID = 0x90, + IOMMUFD_CMD_VIOMMU_UNSET_VDEV_ID = 0x91, }; /** @@ -882,4 +884,42 @@ struct iommu_viommu_alloc { __u32 out_viommu_id; }; #define IOMMU_VIOMMU_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_ALLOC) + +/** + * struct iommu_viommu_set_vdev_id - ioctl(IOMMU_VIOMMU_SET_VDEV_ID) + * @size: sizeof(struct iommu_viommu_set_vdev_id) + * @viommu_id: viommu ID to associate with the device to store its virtual ID + * @dev_id: device ID to set its virtual ID + * @__reserved: Must be 0 + * @vdev_id: Virtual device ID + * + * Set a viommu-specific virtual ID of a device + */ +struct iommu_viommu_set_vdev_id { + __u32 size; + __u32 viommu_id; + __u32 dev_id; + __u32 __reserved; + __aligned_u64 vdev_id; +}; +#define IOMMU_VIOMMU_SET_VDEV_ID _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_SET_VDEV_ID) + +/** + * struct iommu_viommu_unset_vdev_id - ioctl(IOMMU_VIOMMU_UNSET_VDEV_ID) + * @size: sizeof(struct iommu_viommu_unset_vdev_id) + * @viommu_id: viommu ID associated with the device to delete its virtual ID + * @dev_id: device ID to unset its virtual ID + * @__reserved: Must be 0 + * @vdev_id: Virtual device ID (for verification) + * + * Unset a viommu-specific virtual ID of a device + */ +struct iommu_viommu_unset_vdev_id { + __u32 size; + __u32 viommu_id; + __u32 dev_id; + __u32 __reserved; + __aligned_u64 vdev_id; +}; +#define IOMMU_VIOMMU_UNSET_VDEV_ID _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_UNSET_VDEV_ID) #endif From 8d658eae67affc6566956feb9689e76a0d9936b8 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 4 Apr 2024 03:01:31 +0000 Subject: [PATCH 274/352] iommufd/selftest: Add IOMMU_VIOMMU_SET/UNSET_VDEV_ID test coverage A core-managed VIOMMU maintains an xarray to store a list of 
virtual ids to mock_devs. Add test cases to cover the new IOMMU_VIOMMU_SET/UNSET_VDEV_ID ioctls. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- tools/testing/selftests/iommu/iommufd.c | 27 +++++++++++- tools/testing/selftests/iommu/iommufd_utils.h | 42 +++++++++++++++++++ 2 files changed, 67 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 5c770e94f299d..f383f3bc7c8b3 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -556,9 +556,12 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) TEST_F(iommufd_ioas, viommu_default) { + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; + uint32_t nested_hwpt_id = 0, hwpt_id = 0; uint32_t dev_id = self->device_id; uint32_t viommu_id = 0; - uint32_t hwpt_id = 0; if (dev_id) { /* Negative test -- invalid hwpt */ @@ -575,17 +578,37 @@ TEST_F(iommufd_ioas, viommu_default) test_cmd_hwpt_alloc(dev_id, self->ioas_id, IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id); + test_cmd_mock_domain_replace(self->stdev_id, hwpt_id); + /* Negative test -- unsupported viommu type */ test_err_viommu_alloc(EOPNOTSUPP, dev_id, hwpt_id, 0xdead, &viommu_id); - /* Allocate a default type of viommu */ + + /* Allocate a default type of viommu and a nested hwpt on top */ test_cmd_viommu_alloc(dev_id, hwpt_id, IOMMU_VIOMMU_TYPE_DEFAULT, &viommu_id); + test_cmd_hwpt_alloc_nested(self->device_id, viommu_id, 0, + &nested_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id); + + /* Set vdev_id to 0x99, unset it, and set to 0x88 */ + test_cmd_viommu_set_vdev_id(viommu_id, dev_id, 0x99); + test_err_viommu_set_vdev_id(EEXIST, viommu_id, dev_id, 0x99); + test_err_viommu_unset_vdev_id(EINVAL, viommu_id, dev_id, 0x88); + test_cmd_viommu_unset_vdev_id(viommu_id, dev_id, 0x99); + test_cmd_viommu_set_vdev_id(viommu_id, dev_id, 0x88); + + test_cmd_mock_domain_replace(self->stdev_id, hwpt_id); + test_ioctl_destroy(nested_hwpt_id); + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); test_ioctl_destroy(viommu_id); test_ioctl_destroy(hwpt_id); } else { test_err_viommu_alloc(ENOENT, dev_id, hwpt_id, IOMMU_VIOMMU_TYPE_DEFAULT, &viommu_id); + test_err_viommu_set_vdev_id(ENOENT, viommu_id, dev_id, 0x99); } } diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 307d097db9dd5..be722ea883581 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -790,3 +790,45 @@ static int _test_cmd_viommu_alloc(int fd, __u32 device_id, __u32 hwpt_id, EXPECT_ERRNO(_errno, _test_cmd_viommu_alloc(self->fd, device_id, \ hwpt_id, type, 0, \ viommu_id)) + +static int _test_cmd_viommu_set_vdev_id(int fd, __u32 viommu_id, + __u32 idev_id, __u64 vdev_id) +{ + struct iommu_viommu_set_vdev_id cmd = { + .size = sizeof(cmd), + .dev_id = idev_id, + .viommu_id = viommu_id, + .vdev_id = vdev_id, + }; + + return ioctl(fd, IOMMU_VIOMMU_SET_VDEV_ID, &cmd); +} + +#define test_cmd_viommu_set_vdev_id(viommu_id, idev_id, vdev_id) \ + ASSERT_EQ(0, _test_cmd_viommu_set_vdev_id(self->fd, viommu_id, \ + idev_id, vdev_id)) +#define test_err_viommu_set_vdev_id(_errno, viommu_id, idev_id, vdev_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_viommu_set_vdev_id(self->fd, viommu_id, \ + idev_id, vdev_id)) + +static int 
_test_cmd_viommu_unset_vdev_id(int fd, __u32 viommu_id, + __u32 idev_id, __u64 vdev_id) +{ + struct iommu_viommu_unset_vdev_id cmd = { + .size = sizeof(cmd), + .dev_id = idev_id, + .viommu_id = viommu_id, + .vdev_id = vdev_id, + }; + + return ioctl(fd, IOMMU_VIOMMU_UNSET_VDEV_ID, &cmd); +} + +#define test_cmd_viommu_unset_vdev_id(viommu_id, idev_id, vdev_id) \ + ASSERT_EQ(0, _test_cmd_viommu_unset_vdev_id(self->fd, viommu_id, \ + idev_id, vdev_id)) +#define test_err_viommu_unset_vdev_id(_errno, viommu_id, idev_id, vdev_id) \ + EXPECT_ERRNO(_errno, \ + _test_cmd_viommu_unset_vdev_id(self->fd, viommu_id, \ + idev_id, vdev_id)) From ca87efeecb1e1ca47ba70792b4b5c78310704b4f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 31 May 2024 23:13:14 +0000 Subject: [PATCH 275/352] iommufd/viommu: Add cache_invalidate for IOMMU_VIOMMU_TYPE_DEFAULT Add a default_viommu_ops with a new op for cache invalidation, similar to the cache_invalidate_user op in structure iommu_domain_ops, but wider. An IOMMU driver that allocated a nested domain with a core-managed viommu is able to use the same viommu pointer for this cache invalidation API. ARM SMMUv3 for example supports IOTLB and ATC device cache invalidations. The IOTLB invalidation is per-VMID, held currently by a parent S2 domain. The ATC invalidation is per device (Stream ID) that should be translated by a virtual device ID lookup table. Either case fits the viommu context. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/iommufd_private.h | 3 +++ drivers/iommu/iommufd/viommu.c | 3 +++ include/linux/iommu.h | 5 +++++ include/linux/iommufd.h | 19 +++++++++++++++++++ 4 files changed, 30 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 22ac1ff250f24..eb580360a2f43 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -5,6 +5,7 @@ #define __IOMMUFD_PRIVATE_H #include +#include #include #include #include @@ -519,6 +520,8 @@ struct iommufd_viommu { struct rw_semaphore vdev_ids_rwsem; struct xarray vdev_ids; + const struct iommufd_viommu_ops *ops; + unsigned int type; }; diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 8ffcd72b16b8c..a4ba8bff4a267 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -27,6 +27,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) struct iommufd_hwpt_paging *hwpt_paging; struct iommufd_viommu *viommu; struct iommufd_device *idev; + struct iommu_domain *domain; int rc; if (cmd->flags) @@ -46,6 +47,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) rc = -EINVAL; goto out_put_hwpt; } + domain = hwpt_paging->common.domain; if (cmd->type != IOMMU_VIOMMU_TYPE_DEFAULT) { rc = -EOPNOTSUPP; @@ -61,6 +63,7 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) viommu->type = cmd->type; viommu->ictx = ucmd->ictx; viommu->hwpt = hwpt_paging; + viommu->ops = domain->ops->default_viommu_ops; xa_init(&viommu->vdev_ids); init_rwsem(&viommu->vdev_ids_rwsem); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0543e328a16b2..0b6170f0beea4 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -43,6 +43,7 @@ struct iommu_sva; struct iommu_dma_cookie; struct iommu_fault_param; struct iommufd_viommu; +struct iommufd_viommu_ops; #define IOMMU_FAULT_PERM_READ (1 << 0) /* read */ #define IOMMU_FAULT_PERM_WRITE (1 
<< 1) /* write */ @@ -634,6 +635,8 @@ struct iommu_ops { * array->entry_num to report the number of handled * invalidation requests. The driver data structure * must be defined in include/uapi/linux/iommufd.h + * @default_viommu_ops: Driver can choose to use a default core-allocated core- + * managed viommu object by providing a default viommu ops. * @iova_to_phys: translate iova to physical address * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform @@ -664,6 +667,8 @@ struct iommu_domain_ops { phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova); + const struct iommufd_viommu_ops *default_viommu_ops; + bool (*enforce_cache_coherency)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, unsigned long quirks); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 30f832a60ccb3..85291b3463485 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -13,9 +13,11 @@ struct device; struct file; struct iommu_group; +struct iommu_user_data_array; struct iommufd_access; struct iommufd_ctx; struct iommufd_device; +struct iommufd_viommu; struct page; struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, @@ -54,6 +56,23 @@ void iommufd_access_detach(struct iommufd_access *access); void iommufd_ctx_get(struct iommufd_ctx *ictx); +/** + * struct iommufd_viommu_ops - viommu specific operations + * @cache_invalidate: Flush hardware cache used by a viommu. It can be used for + * any IOMMU hardware specific cache as long as a viommu has + * enough information to identify it: for example, a VMID or + * a vdev_id lookup table. + * The @array passes in the cache invalidation requests, in + * form of a driver data structure. A driver must update the + * array->entry_num to report the number of handled requests. + * The data structure of the array entry must be defined in + * include/uapi/linux/iommufd.h + */ +struct iommufd_viommu_ops { + int (*cache_invalidate)(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array); +}; + #if IS_ENABLED(CONFIG_IOMMUFD) struct iommufd_ctx *iommufd_ctx_from_file(struct file *file); struct iommufd_ctx *iommufd_ctx_from_fd(int fd); From f9e4f65b88d3274e261b04dc195e9594be43ad96 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 12 Dec 2023 21:08:03 -0800 Subject: [PATCH 276/352] iommufd: Allow hwpt_id to carry viommu_id for IOMMU_HWPT_INVALIDATE With a VIOMMU object, user space can flush any IOMMU related cache that can be directed using the viommu. It is similar to IOMMU_HWPT_INVALIDATE uAPI, but can cover a wider range than IOTLB, such as device cache or descriptor cache. Allow hwpt_id of the iommu_hwpt_invalidate structure to carry a viommu_id, and reuse the IOMMU_HWPT_INVALIDATE uAPI for VIOMMU invalidations. Driver can define a different structure for VIOMMU invalidations vs. HWPT ones. Update the uAPI, kdoc, and selftest case accordingly. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iommufd/hw_pagetable.c | 32 +++++++++++++++++++------ include/uapi/linux/iommufd.h | 9 ++++--- tools/testing/selftests/iommu/iommufd.c | 4 ++-- 3 files changed, 33 insertions(+), 12 deletions(-) diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 06adbcc304bcd..6aaec1b32abc3 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -432,7 +432,7 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) .entry_len = cmd->entry_len, .entry_num = cmd->entry_num, }; - struct iommufd_hw_pagetable *hwpt; + struct iommufd_object *pt_obj; u32 done_num = 0; int rc; @@ -446,17 +446,35 @@ int iommufd_hwpt_invalidate(struct iommufd_ucmd *ucmd) goto out; } - hwpt = iommufd_get_hwpt_nested(ucmd, cmd->hwpt_id); - if (IS_ERR(hwpt)) { - rc = PTR_ERR(hwpt); + pt_obj = iommufd_get_object(ucmd->ictx, cmd->hwpt_id, IOMMUFD_OBJ_ANY); + if (IS_ERR(pt_obj)) { + rc = PTR_ERR(pt_obj); goto out; } + if (pt_obj->type == IOMMUFD_OBJ_HWPT_NESTED) { + struct iommufd_hw_pagetable *hwpt = + container_of(pt_obj, struct iommufd_hw_pagetable, obj); + + rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, + &data_array); + } else if (pt_obj->type == IOMMUFD_OBJ_VIOMMU) { + struct iommufd_viommu *viommu = + container_of(pt_obj, struct iommufd_viommu, obj); + + if (!viommu->ops || !viommu->ops->cache_invalidate) { + rc = -EOPNOTSUPP; + goto out_put_pt; + } + rc = viommu->ops->cache_invalidate(viommu, &data_array); + } else { + rc = -EINVAL; + goto out_put_pt; + } - rc = hwpt->domain->ops->cache_invalidate_user(hwpt->domain, - &data_array); done_num = data_array.entry_num; - iommufd_put_object(ucmd->ictx, &hwpt->obj); +out_put_pt: + iommufd_put_object(ucmd->ictx, pt_obj); out: cmd->entry_num = done_num; if (iommufd_ucmd_respond(ucmd, sizeof(*cmd))) diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 1816e89c922da..fd7d16fd441d1 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -729,7 +729,7 @@ struct iommu_hwpt_vtd_s1_invalidate { /** * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE) * @size: sizeof(struct iommu_hwpt_invalidate) - * @hwpt_id: ID of a nested HWPT for cache invalidation + * @hwpt_id: ID of a nested HWPT or VIOMMU, for cache invalidation * @data_uptr: User pointer to an array of driver-specific cache invalidation * data. * @data_type: One of enum iommu_hwpt_invalidate_data_type, defining the data @@ -740,8 +740,11 @@ struct iommu_hwpt_vtd_s1_invalidate { * Output the number of requests successfully handled by kernel. * @__reserved: Must be 0. * - * Invalidate the iommu cache for user-managed page table. Modifications on a - * user-managed page table should be followed by this operation to sync cache. + * Invalidate iommu cache for user-managed page table or vIOMMU. Modifications + * on a user-managed page table should be followed by this operation, if a HWPT + * is passed in via @hwpt_id. Other caches, such as device cache or descriptor + * cache, can be flushed if a VIOMMU is passed in via the @hwpt_id field. + * * Each ioctl can support one or more cache invalidation requests in the array * that has a total size of @entry_len * @entry_num. 
* diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index f383f3bc7c8b3..12b5a8f78d4bd 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -360,9 +360,9 @@ TEST_F(iommufd_ioas, alloc_hwpt_nested) EXPECT_ERRNO(EBUSY, _test_ioctl_destroy(self->fd, parent_hwpt_id)); - /* hwpt_invalidate only supports a user-managed hwpt (nested) */ + /* hwpt_invalidate does not support a parent hwpt */ num_inv = 1; - test_err_hwpt_invalidate(ENOENT, parent_hwpt_id, inv_reqs, + test_err_hwpt_invalidate(EINVAL, parent_hwpt_id, inv_reqs, IOMMU_HWPT_INVALIDATE_DATA_SELFTEST, sizeof(*inv_reqs), &num_inv); assert(!num_inv); From 73461320dc1a6acd3bad1056032a5ab6e82cb94c Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 21 Aug 2024 08:38:30 +0000 Subject: [PATCH 277/352] iommufd/viommu: Add vdev_id helpers for IOMMU drivers A driver can call iommufd_viommu_find_device() to find a device pointer using its per-viommu virtual ID. The returned device must be protected by the pair of iommufd_viommu_lock/unlock_vdev_id() functions. Put these three functions into a new viommu_api file, to build it with the IOMMUFD_DRIVER config. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/Makefile | 2 +- drivers/iommu/iommufd/viommu_api.c | 39 ++++++++++++++++++++++++++++++ include/linux/iommufd.h | 16 ++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 drivers/iommu/iommufd/viommu_api.c diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index df490e836b301..288ef3e895e39 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -13,4 +13,4 @@ iommufd-y := \ iommufd-$(CONFIG_IOMMUFD_TEST) += selftest.o obj-$(CONFIG_IOMMUFD) += iommufd.o -obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o +obj-$(CONFIG_IOMMUFD_DRIVER) += iova_bitmap.o viommu_api.o diff --git a/drivers/iommu/iommufd/viommu_api.c b/drivers/iommu/iommufd/viommu_api.c new file mode 100644 index 0000000000000..e0ee592ce8345 --- /dev/null +++ b/drivers/iommu/iommufd/viommu_api.c @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES + */ + +#include "iommufd_private.h" + +void iommufd_viommu_lock_vdev_id(struct iommufd_viommu *viommu) +{ + down_read(&viommu->vdev_ids_rwsem); +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_lock_vdev_id, IOMMUFD); + +void iommufd_viommu_unlock_vdev_id(struct iommufd_viommu *viommu) +{ + up_read(&viommu->vdev_ids_rwsem); +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_unlock_vdev_id, IOMMUFD); + +/* + * Find a device attached to a VIOMMU object using a virtual device ID that was + * set via an IOMMUFD_CMD_VIOMMU_SET_VDEV_ID. Callers of this function must call + * iommufd_viommu_lock_vdev_id() prior and iommufd_viommu_unlock_vdev_id() after. + * + * Return device or NULL. 
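+ * (The returned pointer is only valid while the vdev_id lock is held.)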
+ */ +struct device *iommufd_viommu_find_device(struct iommufd_viommu *viommu, u64 id) +{ + struct iommufd_vdev_id *vdev_id; + + lockdep_assert_held(&viommu->vdev_ids_rwsem); + + xa_lock(&viommu->vdev_ids); + vdev_id = xa_load(&viommu->vdev_ids, (unsigned long)id); + xa_unlock(&viommu->vdev_ids); + if (!vdev_id || vdev_id->id != id) + return NULL; + return vdev_id->idev->dev; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_device, IOMMUFD); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 85291b3463485..364f151d281dc 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -89,6 +89,9 @@ int iommufd_access_rw(struct iommufd_access *access, unsigned long iova, int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id); int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx); int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx); +void iommufd_viommu_lock_vdev_id(struct iommufd_viommu *viommu); +void iommufd_viommu_unlock_vdev_id(struct iommufd_viommu *viommu); +struct device *iommufd_viommu_find_device(struct iommufd_viommu *viommu, u64 id); #else /* !CONFIG_IOMMUFD */ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { @@ -129,5 +132,18 @@ static inline int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx) { return -EOPNOTSUPP; } + +static inline void iommufd_viommu_lock_vdev_id(struct iommufd_viommu *viommu) +{ +} + +static inline void iommufd_viommu_unlock_vdev_id(struct iommufd_viommu *viommu) +{ +} + +static inline struct device *iommufd_viommu_find_device(struct iommufd_viommu *viommu, u64 id) +{ + return NULL; +} #endif /* CONFIG_IOMMUFD */ #endif From 9ded30c2ea86c560c9f72bc342f519252a314de4 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 21 Aug 2024 17:40:28 +0000 Subject: [PATCH 278/352] iommu: Add iommu_copy_struct_from_full_user_array helper The iommu_copy_struct_from_user_array helper can be used to copy a single entry from a user array, which might not be efficient if the array is big. Add a new iommu_copy_struct_from_full_user_array to copy the entire user array at once. Update the existing iommu_copy_struct_from_user_array kdoc accordingly. Signed-off-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- include/linux/iommu.h | 49 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 48 insertions(+), 1 deletion(-) diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 0b6170f0beea4..eabab0830be1c 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -494,7 +494,9 @@ static inline int __iommu_copy_struct_from_user_array( * @index: Index to the location in the array to copy user data from * @min_last: The last member of the data structure @kdst points in the * initial version. - * Return 0 for success, otherwise -error. 
*/ #define iommu_copy_struct_from_user_array(kdst, user_array, data_type, index, \ min_last) \ __iommu_copy_struct_from_user_array( \ kdst, user_array, data_type, index, sizeof(*(kdst)), \ offsetofend(typeof(*(kdst)), min_last)) + +/** + * iommu_copy_struct_from_full_user_array - Copy iommu driver specific user + * space data from an iommu_user_data_array + * @kdst: Pointer to an iommu driver specific user data that is defined in + * include/uapi/linux/iommufd.h + * @kdst_entry_size: sizeof(*kdst) + * @user_array: Pointer to a struct iommu_user_data_array for a user space + * array + * @data_type: The data type of the @kdst. Must match with @user_array->type + * + * Copy the entire user array. kdst must have room for kdst_entry_size * + * user_array->entry_num bytes. Return 0 for success, otherwise -error. + */ +static inline int +iommu_copy_struct_from_full_user_array(void *kdst, size_t kdst_entry_size, + struct iommu_user_data_array *user_array, + unsigned int data_type) +{ + unsigned int i; + int ret; + + if (user_array->type != data_type) + return -EINVAL; + if (!user_array->entry_num) + return -EINVAL; + if (likely(user_array->entry_len == kdst_entry_size)) { + if (copy_from_user(kdst, user_array->uptr, + user_array->entry_num * + user_array->entry_len)) + return -EFAULT; + return 0; + } + + /* Copy item by item */ + for (i = 0; i != user_array->entry_num; i++) { + ret = copy_struct_from_user( + kdst + kdst_entry_size * i, kdst_entry_size, + user_array->uptr + user_array->entry_len * i, + user_array->entry_len); + if (ret) + return ret; + } + return 0; +} + /** * struct iommu_ops - iommu ops and capabilities * @capable: check capability From b2b813cf6862895d78d82a42aaca667960af9e70 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 18 Jan 2024 14:38:38 -0800 Subject: [PATCH 279/352] iommufd/selftest: Add mock_viommu_invalidate_user op Similar to the coverage of cache_invalidate_user for iotlb invalidation, add a device cache and an invalidation op to test the IOMMU_VIOMMU_INVALIDATE ioctl. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iommufd/iommufd_test.h | 25 +++++++++ drivers/iommu/iommufd/selftest.c | 76 +++++++++++++++++++++++++++- 2 files changed, 100 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index f4bc23a92f9a2..368076da10ca2 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -54,6 +54,11 @@ enum { MOCK_NESTED_DOMAIN_IOTLB_NUM = 4, }; +enum { + MOCK_DEV_CACHE_ID_MAX = 3, + MOCK_DEV_CACHE_NUM = 4, +}; + struct iommu_test_cmd { __u32 size; __u32 op; @@ -152,6 +157,7 @@ struct iommu_test_hw_info { /* Should not be equal to any defined value in enum iommu_hwpt_data_type */ #define IOMMU_HWPT_DATA_SELFTEST 0xdead #define IOMMU_TEST_IOTLB_DEFAULT 0xbadbeef +#define IOMMU_TEST_DEV_CACHE_DEFAULT 0xbaddad /** * struct iommu_hwpt_selftest @@ -180,4 +186,23 @@ struct iommu_hwpt_invalidate_selftest { __u32 iotlb_id; }; +/* Should not be equal to any defined value in enum iommu_viommu_invalidate_data_type */ +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST 0xdeadbeef +#define IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID 0xdadbeef + +/** + * struct iommu_viommu_invalidate_selftest - Invalidation data for Mock VIOMMU + * (IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) + * @flags: Invalidate flags + * @cache_id: Invalidate cache entry index + * + * If IOMMU_TEST_INVALIDATE_ALL is set in @flags, @cache_id will be ignored + */ +struct iommu_viommu_invalidate_selftest { +#define IOMMU_TEST_INVALIDATE_FLAG_ALL (1 << 0) + __u32 flags; + __u32 vdev_id; + __u32 cache_id; +}; + #endif diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index d35fa95cb430c..9b8fd138a1995 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -140,6 +140,7 @@ struct mock_dev { struct device dev; unsigned long flags; int id; + u32 cache[MOCK_DEV_CACHE_NUM]; }; struct selftest_obj { @@ -541,6 +542,74 @@ static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features fea return 0; } +static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) +{ + struct iommu_viommu_invalidate_selftest *cmds; + struct iommu_viommu_invalidate_selftest *cur; + struct iommu_viommu_invalidate_selftest *end; + int rc; + + /* A zero-length array is allowed to validate the array type */ + if (array->entry_num == 0 && + array->type == IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST) { + array->entry_num = 0; + return 0; + } + + cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + cur = cmds; + end = cmds + array->entry_num; + + static_assert(sizeof(*cmds) == 3 * sizeof(u32)); + /* Take the lock before the copy: every "goto out" below unlocks it */ + iommufd_viommu_lock_vdev_id(viommu); + rc = iommu_copy_struct_from_full_user_array( + cmds, sizeof(*cmds), array, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST); + if (rc) + goto out; + + while (cur != end) { + struct mock_dev *mdev; + struct device *dev; + int i; + + if (cur->flags & ~IOMMU_TEST_INVALIDATE_FLAG_ALL) { + rc = -EOPNOTSUPP; + goto out; + } + + if (cur->cache_id > MOCK_DEV_CACHE_ID_MAX) { + rc = -EINVAL; + goto out; + } + + dev = iommufd_viommu_find_device(viommu, cur->vdev_id); + if (!dev) { + rc = -EINVAL; + goto out; + } + mdev = container_of(dev, struct mock_dev, dev); + + if (cur->flags & IOMMU_TEST_INVALIDATE_FLAG_ALL) { + /* Invalidate all cache entries and ignore cache_id */ + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = 0; + } else { + mdev->cache[cur->cache_id] = 0; + } + + cur++; + } 
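+	/* rc may be set here; cur - cmds is the number of fully handled requests */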
+out: + iommufd_viommu_unlock_vdev_id(viommu); + array->entry_num = cur - cmds; + kfree(cmds); + return rc; +} + static const struct iommu_ops mock_ops = { /* * IOMMU_DOMAIN_BLOCKED cannot be returned from def_domain_type() @@ -567,6 +636,9 @@ static const struct iommu_ops mock_ops = { .map_pages = mock_domain_map_pages, .unmap_pages = mock_domain_unmap_pages, .iova_to_phys = mock_domain_iova_to_phys, + .default_viommu_ops = &(struct iommufd_viommu_ops){ + .cache_invalidate = mock_viommu_cache_invalidate, + }, }, }; @@ -692,7 +764,7 @@ static void mock_dev_release(struct device *dev) static struct mock_dev *mock_dev_create(unsigned long dev_flags) { struct mock_dev *mdev; - int rc; + int rc, i; if (dev_flags & ~(MOCK_FLAGS_DEVICE_NO_DIRTY | MOCK_FLAGS_DEVICE_HUGE_IOVA)) @@ -706,6 +778,8 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; mdev->dev.bus = &iommufd_mock_bus_type.bus; + for (i = 0; i < MOCK_DEV_CACHE_NUM; i++) + mdev->cache[i] = IOMMU_TEST_DEV_CACHE_DEFAULT; rc = ida_alloc(&mock_dev_ida, GFP_KERNEL); if (rc < 0) From 6ed2ba0296eae52d72c5e5e592be471215b43989 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 18 Jan 2024 14:49:59 -0800 Subject: [PATCH 280/352] iommufd/selftest: Add IOMMU_TEST_OP_DEV_CHECK_CACHE test command Similar to IOMMU_TEST_OP_MD_CHECK_IOTLB verifying a mock_domain's iotlb, IOMMU_TEST_OP_DEV_CHECK_CACHE will be used to verify a mock_dev's cache. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/iommufd_test.h | 5 ++++ drivers/iommu/iommufd/selftest.c | 24 +++++++++++++++++++ tools/testing/selftests/iommu/iommufd.c | 7 +++++- tools/testing/selftests/iommu/iommufd_utils.h | 24 +++++++++++++++++++ 4 files changed, 59 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h index 368076da10ca2..56bade6146ff9 100644 --- a/drivers/iommu/iommufd/iommufd_test.h +++ b/drivers/iommu/iommufd/iommufd_test.h @@ -23,6 +23,7 @@ enum { IOMMU_TEST_OP_DIRTY, IOMMU_TEST_OP_MD_CHECK_IOTLB, IOMMU_TEST_OP_TRIGGER_IOPF, + IOMMU_TEST_OP_DEV_CHECK_CACHE, }; enum { @@ -140,6 +141,10 @@ struct iommu_test_cmd { __u32 perm; __u64 addr; } trigger_iopf; + struct { + __u32 id; + __u32 cache; + } check_dev_cache; }; __u32 last; }; diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 9b8fd138a1995..5298d9c11d3af 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -1036,6 +1036,26 @@ static int iommufd_test_md_check_iotlb(struct iommufd_ucmd *ucmd, return rc; } +static int iommufd_test_dev_check_cache(struct iommufd_ucmd *ucmd, + u32 idev_id, unsigned int cache_id, + u32 cache) +{ + struct iommufd_device *idev; + struct mock_dev *mdev; + int rc = 0; + + idev = iommufd_get_device(ucmd, idev_id); + if (IS_ERR(idev)) + return PTR_ERR(idev); + mdev = container_of(idev->dev, struct mock_dev, dev); + + if (cache_id > MOCK_DEV_CACHE_ID_MAX || + mdev->cache[cache_id] != cache) + rc = -EINVAL; + iommufd_put_object(ucmd->ictx, &idev->obj); + return rc; +} + struct selftest_access { struct iommufd_access *access; struct file *file; @@ -1545,6 +1565,10 @@ int iommufd_test(struct iommufd_ucmd *ucmd) return iommufd_test_md_check_iotlb(ucmd, cmd->id, cmd->check_iotlb.id, cmd->check_iotlb.iotlb); + case IOMMU_TEST_OP_DEV_CHECK_CACHE: + return iommufd_test_dev_check_cache(ucmd, cmd->id, + 
cmd->check_dev_cache.id, + cmd->check_dev_cache.cache); case IOMMU_TEST_OP_CREATE_ACCESS: return iommufd_test_create_access(ucmd, cmd->id, cmd->create_access.flags); diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 12b5a8f78d4bd..1b45445dbd539 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -220,6 +220,8 @@ FIXTURE_SETUP(iommufd_ioas) for (i = 0; i != variant->mock_domains; i++) { test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, &self->device_id); + test_cmd_dev_check_cache_all(self->device_id, + IOMMU_TEST_DEV_CACHE_DEFAULT); self->base_iova = MOCK_APERTURE_START; } } @@ -1442,9 +1444,12 @@ FIXTURE_SETUP(iommufd_mock_domain) ASSERT_GE(ARRAY_SIZE(self->hwpt_ids), variant->mock_domains); - for (i = 0; i != variant->mock_domains; i++) + for (i = 0; i != variant->mock_domains; i++) { test_cmd_mock_domain(self->ioas_id, &self->stdev_ids[i], &self->hwpt_ids[i], &self->idev_ids[i]); + test_cmd_dev_check_cache_all(self->idev_ids[0], + IOMMU_TEST_DEV_CACHE_DEFAULT); + } self->hwpt_id = self->hwpt_ids[0]; self->mmap_flags = MAP_SHARED | MAP_ANONYMOUS; diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index be722ea883581..d697a7aa55c91 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -234,6 +234,30 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i test_cmd_hwpt_check_iotlb(hwpt_id, i, expected); \ }) +#define test_cmd_dev_check_cache(device_id, cache_id, expected) \ + ({ \ + struct iommu_test_cmd test_cmd = { \ + .size = sizeof(test_cmd), \ + .op = IOMMU_TEST_OP_DEV_CHECK_CACHE, \ + .id = device_id, \ + .check_dev_cache = { \ + .id = cache_id, \ + .cache = expected, \ + }, \ + }; \ + ASSERT_EQ(0, \ + ioctl(self->fd, \ + _IOMMU_TEST_CMD(IOMMU_TEST_OP_DEV_CHECK_CACHE),\ + &test_cmd)); \ + }) + +#define test_cmd_dev_check_cache_all(device_id, expected) \ + ({ \ + int c; \ + for (c = 0; c < MOCK_DEV_CACHE_NUM; c++) \ + test_cmd_dev_check_cache(device_id, c, expected); \ + }) + static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs, uint32_t data_type, uint32_t lreq, uint32_t *nreqs) From 75a41c31f725bd08d6df27cd8d82803c994ab43f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 18 Jan 2024 16:37:45 -0800 Subject: [PATCH 281/352] iommufd/selftest: Add VIOMMU coverage for IOMMU_HWPT_INVALIDATE ioctl Add a viommu_cache test function to cover VIOMMU invalidations using the updated IOMMU_VIOMMU_INVALIDATE ioctl, with similar positive and negative cases to the existing iotlb ones. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- tools/testing/selftests/iommu/iommufd.c | 190 ++++++++++++++++++ tools/testing/selftests/iommu/iommufd_utils.h | 32 +++ 2 files changed, 222 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 1b45445dbd539..6f1014cc208b4 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -614,6 +614,196 @@ TEST_F(iommufd_ioas, viommu_default) } } +TEST_F(iommufd_ioas, viommu_dev_cache) +{ + struct iommu_viommu_invalidate_selftest inv_reqs[2] = {}; + struct iommu_hwpt_selftest data = { + .iotlb = IOMMU_TEST_IOTLB_DEFAULT, + }; + uint32_t nested_hwpt_id = 0, hwpt_id = 0; + uint32_t dev_id = self->device_id; + uint32_t viommu_id = 0; + uint32_t num_inv; + + if (dev_id) { + test_cmd_hwpt_alloc(dev_id, self->ioas_id, + IOMMU_HWPT_ALLOC_NEST_PARENT, &hwpt_id); + test_cmd_viommu_alloc(dev_id, hwpt_id, + IOMMU_VIOMMU_TYPE_DEFAULT, &viommu_id); + test_cmd_hwpt_alloc_nested(self->device_id, viommu_id, 0, + &nested_hwpt_id, + IOMMU_HWPT_DATA_SELFTEST, &data, + sizeof(data)); + test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id); + test_cmd_viommu_set_vdev_id(viommu_id, dev_id, 0x99); + + test_cmd_dev_check_cache_all(dev_id, + IOMMU_TEST_DEV_CACHE_DEFAULT); + + /* Check data_type by passing zero-length array */ + num_inv = 0; + test_cmd_viommu_invalidate(viommu_id, inv_reqs, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: Invalid data_type */ + num_inv = 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST_INVALID, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: structure size sanity */ + num_inv = 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs) + 1, &num_inv); + assert(!num_inv); + + num_inv = 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + 1, &num_inv); + assert(!num_inv); + + /* Negative test: invalid flag is passed */ + num_inv = 1; + inv_reqs[0].flags = 0xffffffff; + inv_reqs[0].vdev_id = 0x99; + test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid data_uptr when array is not empty */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + test_err_viommu_invalidate(EINVAL, viommu_id, NULL, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid entry_len when array is not empty */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + 0, &num_inv); + assert(!num_inv); + + /* Negative test: invalid cache_id */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = MOCK_DEV_CACHE_ID_MAX + 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* Negative test: invalid vdev_id */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x9; + inv_reqs[0].cache_id = 0; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(!num_inv); + + /* + * Invalidate the 1st cache entry but fail the 2nd request + * due 
to invalid flags configuration in the 2nd request. + */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = 0; + inv_reqs[1].flags = 0xffffffff; + inv_reqs[1].vdev_id = 0x99; + inv_reqs[1].cache_id = 1; + test_err_viommu_invalidate(EOPNOTSUPP, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_dev_check_cache(dev_id, 0, 0); + test_cmd_dev_check_cache(dev_id, 1, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 2, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 3, + IOMMU_TEST_DEV_CACHE_DEFAULT); + + /* + * Invalidate the 1st cache entry but fail the 2nd request + * due to invalid cache_id configuration in the 2nd request. + */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = 0; + inv_reqs[1].flags = 0; + inv_reqs[1].vdev_id = 0x99; + inv_reqs[1].cache_id = MOCK_DEV_CACHE_ID_MAX + 1; + test_err_viommu_invalidate(EINVAL, viommu_id, inv_reqs, + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_dev_check_cache(dev_id, 0, 0); + test_cmd_dev_check_cache(dev_id, 1, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 2, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 3, + IOMMU_TEST_DEV_CACHE_DEFAULT); + + /* Invalidate the 2nd cache entry and verify */ + num_inv = 1; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = 1; + test_cmd_viommu_invalidate(viommu_id, inv_reqs, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_dev_check_cache(dev_id, 0, 0); + test_cmd_dev_check_cache(dev_id, 1, 0); + test_cmd_dev_check_cache(dev_id, 2, + IOMMU_TEST_DEV_CACHE_DEFAULT); + test_cmd_dev_check_cache(dev_id, 3, + IOMMU_TEST_DEV_CACHE_DEFAULT); + + /* Invalidate the 3rd and 4th cache entries and verify */ + num_inv = 2; + inv_reqs[0].flags = 0; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].cache_id = 2; + inv_reqs[1].flags = 0; + inv_reqs[1].vdev_id = 0x99; + inv_reqs[1].cache_id = 3; + test_cmd_viommu_invalidate(viommu_id, inv_reqs, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 2); + test_cmd_dev_check_cache_all(dev_id, 0); + + /* Invalidate all cache entries for dev_id and verify */ + num_inv = 1; + inv_reqs[0].vdev_id = 0x99; + inv_reqs[0].flags = IOMMU_TEST_INVALIDATE_FLAG_ALL; + test_cmd_viommu_invalidate(viommu_id, inv_reqs, + sizeof(*inv_reqs), &num_inv); + assert(num_inv == 1); + test_cmd_dev_check_cache_all(dev_id, 0); + + test_cmd_mock_domain_replace(self->stdev_id, hwpt_id); + test_ioctl_destroy(nested_hwpt_id); + test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); + test_ioctl_destroy(viommu_id); + test_ioctl_destroy(hwpt_id); + } +} + TEST_F(iommufd_ioas, hwpt_attach) { /* Create a device attached directly to a hwpt */ diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index d697a7aa55c91..0a81827b903f7 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -289,6 +289,38 @@ static int _test_cmd_hwpt_invalidate(int fd, __u32 hwpt_id, void *reqs, data_type, lreq, nreqs)); \ }) +static int _test_cmd_viommu_invalidate(int fd, __u32 viommu_id, void *reqs, + uint32_t data_type, uint32_t lreq, + uint32_t *nreqs) +{ + struct iommu_hwpt_invalidate cmd = { + .size = sizeof(cmd), + .hwpt_id = viommu_id, + .data_type = data_type, 
.data_uptr = (uint64_t)reqs, + .entry_len = lreq, + .entry_num = *nreqs, + }; + int rc = ioctl(fd, IOMMU_HWPT_INVALIDATE, &cmd); + *nreqs = cmd.entry_num; + return rc; +} + +#define test_cmd_viommu_invalidate(viommu, reqs, lreq, nreqs) \ + ({ \ + ASSERT_EQ(0, \ + _test_cmd_viommu_invalidate(self->fd, viommu, reqs, \ + IOMMU_VIOMMU_INVALIDATE_DATA_SELFTEST, \ + lreq, nreqs)); \ + }) +#define test_err_viommu_invalidate(_errno, viommu_id, reqs, data_type, lreq, \ + nreqs) \ + ({ \ + EXPECT_ERRNO(_errno, _test_cmd_viommu_invalidate( \ + self->fd, viommu_id, reqs, \ + data_type, lreq, nreqs)); \ + }) + static int _test_cmd_access_replace_ioas(int fd, __u32 access_id, unsigned int ioas_id) { From 946c836aa50c833eda0fffbff7f37a1fd3078e4e Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 23 May 2024 06:51:47 +0000 Subject: [PATCH 282/352] iommufd/viommu: Add iommufd_viommu_to_parent_domain helper Allow an IOMMU driver to convert a core-managed viommu to a nested parent domain for the info that the domain holds. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/viommu_api.c | 14 ++++++++++++++ include/linux/iommufd.h | 8 ++++++++ 2 files changed, 22 insertions(+) diff --git a/drivers/iommu/iommufd/viommu_api.c b/drivers/iommu/iommufd/viommu_api.c index e0ee592ce8345..3772a5892a6c8 100644 --- a/drivers/iommu/iommufd/viommu_api.c +++ b/drivers/iommu/iommufd/viommu_api.c @@ -37,3 +37,17 @@ struct device *iommufd_viommu_find_device(struct iommufd_viommu *viommu, u64 id) return vdev_id->idev->dev; } EXPORT_SYMBOL_NS_GPL(iommufd_viommu_find_device, IOMMUFD); + +/* + * Convert a viommu to its encapsulated nested parent domain. Caller must be aware + * of the lifecycle of the viommu pointer. Only call this function in a callback + * function where viommu is passed in by the iommu/iommufd core. + */ +struct iommu_domain * +iommufd_viommu_to_parent_domain(struct iommufd_viommu *viommu) +{ + if (!viommu || !viommu->hwpt) + return NULL; + return viommu->hwpt->common.domain; +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_to_parent_domain, IOMMUFD); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 364f151d281dc..f7c265c6de7c1 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -92,6 +92,8 @@ int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx); void iommufd_viommu_lock_vdev_id(struct iommufd_viommu *viommu); void iommufd_viommu_unlock_vdev_id(struct iommufd_viommu *viommu); struct device *iommufd_viommu_find_device(struct iommufd_viommu *viommu, u64 id); +struct iommu_domain * +iommufd_viommu_to_parent_domain(struct iommufd_viommu *viommu); #else /* !CONFIG_IOMMUFD */ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { @@ -145,5 +147,11 @@ static inline struct device *iommufd_viommu_find_device(struct iommufd_viommu *viommu, u64 id) { return NULL; } + +static inline struct iommu_domain * +iommufd_viommu_to_parent_domain(struct iommufd_viommu *viommu) +{ + return NULL; +} #endif /* CONFIG_IOMMUFD */ #endif From f0fda87c7a287c2a3d72c8c5a4779fa93f9d64a6 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 10 Feb 2023 19:37:59 -0800 Subject: [PATCH 283/352] iommu/arm-smmu-v3: Add arm_smmu_cache_invalidate_user Add an arm_smmu_cache_invalidate_user() function for user space to invalidate IOTLB entries that are still cached by the hardware. 
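For background, the expected user space flow looks roughly like this (an illustrative sketch only, not part of this patch; it assumes an open iommufd and a nested HWPT allocated via IOMMU_HWPT_ALLOC, with field names taken from the iommu_hwpt_invalidate uAPI and the SMMUv3 entry format defined below):

    /* one native 128-bit TLBI command, little-endian, e.g. TLBI_NH_ASID */
    struct iommu_hwpt_arm_smmuv3_invalidate inv = {};
    struct iommu_hwpt_invalidate cmd = {
        .size = sizeof(cmd),
        .hwpt_id = nested_hwpt_id,
        .data_type = IOMMU_HWPT_INVALIDATE_DATA_ARM_SMMUV3,
        .data_uptr = (uintptr_t)&inv,
        .entry_len = sizeof(inv),
        .entry_num = 1,
    };
    /* on return, cmd.entry_num reports how many entries were handled */
    ioctl(iommufd, IOMMU_HWPT_INVALIDATE, &cmd);
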
Add struct iommu_hwpt_arm_smmuv3_invalidate defining an invalidation entry that is simply the native format of a 128-bit TLBI command. Scan commands against the permitted command list and fix their VMID fields. Co-developed-by: Eric Auger Signed-off-by: Eric Auger Co-developed-by: Jason Gunthorpe Signed-off-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 115 ++++++++++++++++++++ drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 1 + include/uapi/linux/iommufd.h | 21 ++++ 3 files changed, 137 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index b7eaf9df1f87c..0d0ffd89da5b0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3263,9 +3263,116 @@ static void arm_smmu_domain_nested_free(struct iommu_domain *domain) kfree(container_of(domain, struct arm_smmu_nested_domain, domain)); } +/* + * Convert, in place, the raw invalidation command into an internal format that + * can be passed to arm_smmu_cmdq_issue_cmdlist(). Internally commands are + * stored in CPU endian. + * + * Enforce the VMID on the command. + */ +static int +arm_smmu_convert_user_cmd(struct arm_smmu_domain *s2_parent, + struct iommu_hwpt_arm_smmuv3_invalidate *cmd) +{ + u16 vmid = s2_parent->s2_cfg.vmid; + + cmd->cmd[0] = le64_to_cpu(cmd->cmd[0]); + cmd->cmd[1] = le64_to_cpu(cmd->cmd[1]); + + switch (cmd->cmd[0] & CMDQ_0_OP) { + case CMDQ_OP_TLBI_NSNH_ALL: + /* Convert to NH_ALL */ + cmd->cmd[0] = CMDQ_OP_TLBI_NH_ALL | + FIELD_PREP(CMDQ_TLBI_0_VMID, vmid); + cmd->cmd[1] = 0; + break; + case CMDQ_OP_TLBI_NH_VA: + case CMDQ_OP_TLBI_NH_VAA: + case CMDQ_OP_TLBI_NH_ALL: + case CMDQ_OP_TLBI_NH_ASID: + cmd->cmd[0] &= ~CMDQ_TLBI_0_VMID; + cmd->cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, vmid); + break; + default: + return -EIO; + } + return 0; +} + +static int __arm_smmu_cache_invalidate_user(struct arm_smmu_domain *s2_parent, + struct iommu_user_data_array *array) +{ + struct arm_smmu_device *smmu = s2_parent->smmu; + struct iommu_hwpt_arm_smmuv3_invalidate *last_batch; + struct iommu_hwpt_arm_smmuv3_invalidate *cmds; + struct iommu_hwpt_arm_smmuv3_invalidate *cur; + struct iommu_hwpt_arm_smmuv3_invalidate *end; + struct arm_smmu_cmdq_ent ent; + struct arm_smmu_cmdq *cmdq; + int ret; + + /* A zero-length array is allowed to validate the array type */ + if (array->entry_num == 0 && + array->type == IOMMU_HWPT_INVALIDATE_DATA_ARM_SMMUV3) { + array->entry_num = 0; + return 0; + } + + cmds = kcalloc(array->entry_num, sizeof(*cmds), GFP_KERNEL); + if (!cmds) + return -ENOMEM; + cur = cmds; + end = cmds + array->entry_num; + + static_assert(sizeof(*cmds) == 2 * sizeof(u64)); + ret = iommu_copy_struct_from_full_user_array( + cmds, sizeof(*cmds), array, + IOMMU_HWPT_INVALIDATE_DATA_ARM_SMMUV3); + if (ret) + goto out; + + ent.opcode = cmds->cmd[0] & CMDQ_0_OP; + cmdq = arm_smmu_get_cmdq(smmu, &ent); + + last_batch = cmds; + while (cur != end) { + ret = arm_smmu_convert_user_cmd(s2_parent, cur); + if (ret) + goto out; + + /* FIXME work in blocks of CMDQ_BATCH_ENTRIES and copy each block? 
*/
+		cur++;
+		if (cur != end && (cur - last_batch) != CMDQ_BATCH_ENTRIES - 1)
+			continue;
+
+		ret = arm_smmu_cmdq_issue_cmdlist(smmu, cmdq, last_batch->cmd,
+						  cur - last_batch, true);
+		if (ret) {
+			cur--;
+			goto out;
+		}
+		last_batch = cur;
+	}
+out:
+	array->entry_num = cur - cmds;
+	kfree(cmds);
+	return ret;
+}
+
+static int arm_smmu_cache_invalidate_user(struct iommu_domain *domain,
+					  struct iommu_user_data_array *array)
+{
+	struct arm_smmu_nested_domain *nested_domain =
+		container_of(domain, struct arm_smmu_nested_domain, domain);
+
+	return __arm_smmu_cache_invalidate_user(
+			nested_domain->s2_parent, array);
+}
+
 static const struct iommu_domain_ops arm_smmu_nested_ops = {
 	.attach_dev = arm_smmu_attach_dev_nested,
 	.free = arm_smmu_domain_nested_free,
+	.cache_invalidate_user = arm_smmu_cache_invalidate_user,
 };
 
 static struct iommu_domain *
@@ -3293,6 +3400,14 @@ arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags,
 	    !(master->smmu->features & ARM_SMMU_FEAT_S2FWB))
 		return ERR_PTR(-EOPNOTSUPP);
 
+	/*
+	 * FORCE_SYNC is not set with FEAT_NESTING. Some study of the exact HW
+	 * defect is needed to determine if arm_smmu_cache_invalidate_user()
+	 * needs any change to remove this.
+	 */
+	if (WARN_ON(master->smmu->options & ARM_SMMU_OPT_CMDQ_FORCE_SYNC))
+		return ERR_PTR(-EOPNOTSUPP);
+
 	ret = iommu_copy_struct_from_user(&arg, user_data,
 					  IOMMU_HWPT_DATA_ARM_SMMUV3, ste);
 	if (ret)
diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 5378541fe697a..1996e374c688d 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -527,6 +527,7 @@ struct arm_smmu_cmdq_ent {
 	#define CMDQ_OP_TLBI_NH_ALL	0x10
 	#define CMDQ_OP_TLBI_NH_ASID	0x11
 	#define CMDQ_OP_TLBI_NH_VA	0x12
+	#define CMDQ_OP_TLBI_NH_VAA	0x13
 	#define CMDQ_OP_TLBI_EL2_ALL	0x20
 	#define CMDQ_OP_TLBI_EL2_ASID	0x21
 	#define CMDQ_OP_TLBI_EL2_VA	0x22
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index fd7d16fd441d1..f3aefb11f681e 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -685,9 +685,11 @@ struct iommu_hwpt_get_dirty_bitmap {
  * enum iommu_hwpt_invalidate_data_type - IOMMU HWPT Cache Invalidation
  *                                        Data Type
  * @IOMMU_HWPT_INVALIDATE_DATA_VTD_S1: Invalidation data for VTD_S1
+ * @IOMMU_HWPT_INVALIDATE_DATA_ARM_SMMUV3: Invalidation data for ARM SMMUv3
  */
 enum iommu_hwpt_invalidate_data_type {
 	IOMMU_HWPT_INVALIDATE_DATA_VTD_S1 = 0,
+	IOMMU_HWPT_INVALIDATE_DATA_ARM_SMMUV3 = 1,
 };
 
 /**
@@ -726,6 +728,25 @@ struct iommu_hwpt_vtd_s1_invalidate {
 	__u32 __reserved;
 };
 
+/**
+ * struct iommu_hwpt_arm_smmuv3_invalidate - ARM SMMUv3 cache invalidation
+ *                                           (IOMMU_HWPT_INVALIDATE_DATA_ARM_SMMUV3)
+ * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ.
+ *       Must be little-endian.
+ *
+ * Supported command list:
+ *     CMDQ_OP_TLBI_NSNH_ALL
+ *     CMDQ_OP_TLBI_NH_VA
+ *     CMDQ_OP_TLBI_NH_VAA
+ *     CMDQ_OP_TLBI_NH_ALL
+ *     CMDQ_OP_TLBI_NH_ASID
+ *
+ * -EIO will be returned if the command is not supported.
+ */
+struct iommu_hwpt_arm_smmuv3_invalidate {
+	__aligned_u64 cmd[2];
+};
+
 /**
  * struct iommu_hwpt_invalidate - ioctl(IOMMU_HWPT_INVALIDATE)
  * @size: sizeof(struct iommu_hwpt_invalidate)

From e2375ebed322815cfd5b1ee808361574bf0edd04 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Sat, 18 May 2024 03:55:23 +0000
Subject: [PATCH 284/352] iommu/arm-smmu-v3: Add arm_smmu_viommu_cache_invalidate

Add an arm_smmu_viommu_cache_invalidate() function for user space to issue
cache invalidation commands via viommu.

The viommu invalidation takes the same native 128-bit command format as
the hwpt invalidation. Thus, reuse the same driver data structure, but
make it wider to accept CMDQ_OP_ATC_INV and CMDQ_OP_CFGI_CD{_ALL}.

Scan the commands against the supported list and fix the VMIDs and SIDs.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 76 ++++++++++++++++++++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  1 +
 include/uapi/linux/iommufd.h                |  7 +-
 3 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 0d0ffd89da5b0..434a3cf1ce223 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3263,15 +3263,32 @@ static void arm_smmu_domain_nested_free(struct iommu_domain *domain)
 	kfree(container_of(domain, struct arm_smmu_nested_domain, domain));
 }
 
+static int arm_smmu_convert_viommu_vdev_id(struct iommufd_viommu *viommu,
+					   u32 vdev_id, u32 *sid)
+{
+	struct arm_smmu_master *master;
+	struct device *dev;
+
+	dev = iommufd_viommu_find_device(viommu, vdev_id);
+	if (!dev)
+		return -EIO;
+	master = dev_iommu_priv_get(dev);
+
+	if (sid)
+		*sid = master->streams[0].id;
+	return 0;
+}
+
 /*
  * Convert, in place, the raw invalidation command into an internal format that
  * can be passed to arm_smmu_cmdq_issue_cmdlist(). Internally commands are
  * stored in CPU endian.
  *
- * Enforce the VMID on the command.
+ * Enforce the VMID or the SID on the command.
*/ static int arm_smmu_convert_user_cmd(struct arm_smmu_domain *s2_parent, + struct iommufd_viommu *viommu, struct iommu_hwpt_arm_smmuv3_invalidate *cmd) { u16 vmid = s2_parent->s2_cfg.vmid; @@ -3293,13 +3310,46 @@ arm_smmu_convert_user_cmd(struct arm_smmu_domain *s2_parent, cmd->cmd[0] &= ~CMDQ_TLBI_0_VMID; cmd->cmd[0] |= FIELD_PREP(CMDQ_TLBI_0_VMID, vmid); break; + case CMDQ_OP_ATC_INV: + case CMDQ_OP_CFGI_CD: + case CMDQ_OP_CFGI_CD_ALL: + if (viommu) { + u32 sid, vsid = FIELD_GET(CMDQ_CFGI_0_SID, cmd->cmd[0]); + + if (arm_smmu_convert_viommu_vdev_id(viommu, vsid, &sid)) + return -EIO; + cmd->cmd[0] &= ~CMDQ_CFGI_0_SID; + cmd->cmd[0] |= FIELD_PREP(CMDQ_CFGI_0_SID, sid); + break; + } + fallthrough; default: return -EIO; } return 0; } +static inline bool +arm_smmu_must_lock_vdev_id(struct iommu_hwpt_arm_smmuv3_invalidate *cmds, + unsigned int num_cmds) +{ + int i; + + for (i = 0; i < num_cmds; i++) { + switch (cmds[i].cmd[0] & CMDQ_0_OP) { + case CMDQ_OP_ATC_INV: + case CMDQ_OP_CFGI_CD: + case CMDQ_OP_CFGI_CD_ALL: + return true; + default: + continue; + } + } + return false; +} + static int __arm_smmu_cache_invalidate_user(struct arm_smmu_domain *s2_parent, + struct iommufd_viommu *viommu, struct iommu_user_data_array *array) { struct arm_smmu_device *smmu = s2_parent->smmu; @@ -3309,6 +3359,7 @@ static int __arm_smmu_cache_invalidate_user(struct arm_smmu_domain *s2_parent, struct iommu_hwpt_arm_smmuv3_invalidate *end; struct arm_smmu_cmdq_ent ent; struct arm_smmu_cmdq *cmdq; + bool must_lock = false; int ret; /* A zero-length array is allowed to validate the array type */ @@ -3331,12 +3382,17 @@ static int __arm_smmu_cache_invalidate_user(struct arm_smmu_domain *s2_parent, if (ret) goto out; + if (viommu) + must_lock = arm_smmu_must_lock_vdev_id(cmds, array->entry_num); + if (must_lock) + iommufd_viommu_lock_vdev_id(viommu); + ent.opcode = cmds->cmd[0] & CMDQ_0_OP; cmdq = arm_smmu_get_cmdq(smmu, &ent); last_batch = cmds; while (cur != end) { - ret = arm_smmu_convert_user_cmd(s2_parent, cur); + ret = arm_smmu_convert_user_cmd(s2_parent, viommu, cur); if (ret) goto out; @@ -3354,6 +3410,8 @@ static int __arm_smmu_cache_invalidate_user(struct arm_smmu_domain *s2_parent, last_batch = cur; } out: + if (must_lock) + iommufd_viommu_unlock_vdev_id(viommu); array->entry_num = cur - cmds; kfree(cmds); return ret; @@ -3366,7 +3424,7 @@ static int arm_smmu_cache_invalidate_user(struct iommu_domain *domain, container_of(domain, struct arm_smmu_nested_domain, domain); return __arm_smmu_cache_invalidate_user( - nested_domain->s2_parent, array); + nested_domain->s2_parent, NULL, array); } static const struct iommu_domain_ops arm_smmu_nested_ops = { @@ -3857,6 +3915,15 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } +static int arm_smmu_viommu_cache_invalidate(struct iommufd_viommu *viommu, + struct iommu_user_data_array *array) +{ + struct iommu_domain *domain = iommufd_viommu_to_parent_domain(viommu); + + return __arm_smmu_cache_invalidate_user( + to_smmu_domain(domain), viommu, array); +} + static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, @@ -3887,6 +3954,9 @@ static struct iommu_ops arm_smmu_ops = { .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, .free = arm_smmu_domain_free_paging, + .default_viommu_ops = &(const struct iommufd_viommu_ops) { + .cache_invalidate = arm_smmu_viommu_cache_invalidate, + } } }; diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h 
b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
index 1996e374c688d..0e160adef5ea6 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h
@@ -10,6 +10,7 @@
 #include <linux/bitfield.h>
 #include <linux/iommu.h>
+#include <linux/iommufd.h>
 #include <linux/kernel.h>
 #include <linux/mmzone.h>
 #include <linux/sizes.h>
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index f3aefb11f681e..0d973486b604c 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -734,13 +734,18 @@ struct iommu_hwpt_vtd_s1_invalidate {
  * @cmd: 128-bit cache invalidation command that runs in SMMU CMDQ.
  *       Must be little-endian.
  *
- * Supported command list:
+ * Supported command list when passing in a HWPT via @hwpt_id:
  *     CMDQ_OP_TLBI_NSNH_ALL
  *     CMDQ_OP_TLBI_NH_VA
  *     CMDQ_OP_TLBI_NH_VAA
  *     CMDQ_OP_TLBI_NH_ALL
  *     CMDQ_OP_TLBI_NH_ASID
  *
+ * In addition to the list above, when passing in a VIOMMU via @hwpt_id:
+ *     CMDQ_OP_ATC_INV
+ *     CMDQ_OP_CFGI_CD
+ *     CMDQ_OP_CFGI_CD_ALL
+ *
  * -EIO will be returned if the command is not supported.
  */
 struct iommu_hwpt_arm_smmuv3_invalidate {

From d94e8d849e2de7dcd779ffc71748e6b74a19c5dc Mon Sep 17 00:00:00 2001
From: Jason Gunthorpe
Date: Fri, 12 Jul 2024 14:40:55 -0300
Subject: [PATCH 285/352] iommu/arm-smmu-v3: Allow ATS for IOMMU_DOMAIN_NESTED

Now that ATC invalidation can be done with the VIOMMU invalidation op, a
guest-owned IOMMU_DOMAIN_NESTED can do ATS too. Allow it to pass in the
EATS field via the vSTE words.

Signed-off-by: Jason Gunthorpe
Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 15 ++++++++++++---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h |  1 +
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 434a3cf1ce223..5d6e94a0ea845 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -3233,8 +3233,6 @@ static int arm_smmu_attach_dev_nested(struct iommu_domain *domain,
 		.master = master,
 		.old_domain = iommu_get_domain_for_dev(dev),
 		.ssid = IOMMU_NO_PASID,
-		/* Currently invalidation of ATC is not supported */
-		.disable_ats = true,
 	};
 	struct arm_smmu_ste ste;
 	int ret;
@@ -3244,6 +3242,15 @@
 		return -EINVAL;
 
 	mutex_lock(&arm_smmu_asid_lock);
+	/*
+	 * The VM has to control the actual ATS state at the PCI device because
+	 * we forward the invalidations directly from the VM. If the VM doesn't
+	 * think ATS is on it will not generate ATC flushes and the ATC will
+	 * become incoherent. Since we can't access the actual virtual PCI ATS
+	 * config bit here, base this off the EATS value in the STE. If the EATS
+	 * is set then the VM must generate ATC flushes.
+ */ + state.disable_ats = !nested_domain->enable_ats; ret = arm_smmu_attach_prepare(&state, domain); if (ret) { mutex_unlock(&arm_smmu_asid_lock); @@ -3492,8 +3499,9 @@ arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags, cfg != STRTAB_STE_0_CFG_S1_TRANS) return ERR_PTR(-EIO); + /* Only Full ATS or ATS UR is supported */ eats = FIELD_GET(STRTAB_STE_1_EATS, le64_to_cpu(arg.ste[1])); - if (eats != STRTAB_STE_1_EATS_ABT) + if (eats != STRTAB_STE_1_EATS_ABT && eats != STRTAB_STE_1_EATS_TRANS) return ERR_PTR(-EIO); if (cfg != STRTAB_STE_0_CFG_S1_TRANS) @@ -3506,6 +3514,7 @@ arm_smmu_domain_alloc_nesting(struct device *dev, u32 flags, nested_domain->domain.type = IOMMU_DOMAIN_NESTED; nested_domain->domain.ops = &arm_smmu_nested_ops; nested_domain->s2_parent = smmu_parent; + nested_domain->enable_ats = eats == STRTAB_STE_1_EATS_TRANS; nested_domain->ste[0] = arg.ste[0]; nested_domain->ste[1] = arg.ste[1] & ~cpu_to_le64(STRTAB_STE_1_EATS); diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 0e160adef5ea6..6b0c068184ba0 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -837,6 +837,7 @@ struct arm_smmu_domain { struct arm_smmu_nested_domain { struct iommu_domain domain; struct arm_smmu_domain *s2_parent; + u8 enable_ats : 1; __le64 ste[2]; }; From bb68aaf314af30290fdef6bd4a63c68024589984 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 19 Aug 2024 14:22:05 -0300 Subject: [PATCH 286/352] iommu/arm-smmu-v3: Update comments about ATS and bypass The SMMUv3 spec has a note that BYPASS and ATS don't work together under the STE EATS field definition. However there is another section "13.6.4 Full ATS skipping stage 1" that explains under certain conditions BYPASS and ATS do work together if the STE is using S1DSS to select BYPASS and the CD table has the possibility for a substream. When these comments were written the understanding was that all forms of BYPASS just didn't work and this was to be a future problem to solve. It turns out that ATS and IDENTITY will always work just fine: - If STE.Config = BYPASS then the PCI ATS is disabled - If a PASID domain is attached then S1DSS = BYPASS and ATS will be enabled. This meets the requirements of 13.6.4 to automatically generate 1:1 ATS replies on the RID. Update the comments to reflect this. Signed-off-by: Jason Gunthorpe Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 5d6e94a0ea845..a5e44c274351f 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -2835,9 +2835,14 @@ static int arm_smmu_attach_prepare(struct arm_smmu_attach_state *state, * Translation Requests and Translated transactions are denied * as though ATS is disabled for the stream (STE.EATS == 0b00), * causing F_BAD_ATS_TREQ and F_TRANSL_FORBIDDEN events - * (IHI0070Ea 5.2 Stream Table Entry). Thus ATS can only be - * enabled if we have arm_smmu_domain, those always have page - * tables. + * (IHI0070Ea 5.2 Stream Table Entry). + * + * However, if we have installed a CD table and are using S1DSS + * then ATS will work in S1DSS bypass. See "13.6.4 Full ATS + * skipping stage 1". 
+	 *
+	 * Disable ATS if we are going to create a normal 0b100 bypass
+	 * STE.
 	 */
 	state->ats_enabled = !state->disable_ats &&
 			     arm_smmu_ats_supported(master);
@@ -3162,8 +3167,10 @@ static void arm_smmu_attach_dev_ste(struct iommu_domain *domain,
 	if (arm_smmu_ssids_in_use(&master->cd_table)) {
 		/*
 		 * If a CD table has to be present then we need to run with ATS
-		 * on even though the RID will fail ATS queries with UR. This is
-		 * because we have no idea what the PASID's need.
+		 * on because we have to assume a PASID is using ATS. For
+		 * IDENTITY this will setup things so that S1DSS=bypass which
+		 * follows the explanation in "13.6.4 Full ATS skipping stage 1"
+		 * and allows for ATS on the RID to work.
 		 */
 		state.cd_needs_ats = true;
 		arm_smmu_attach_prepare(&state, domain);

From 9c051b3e636029e0d4197ab8015e82d9082f55c8 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 12 Dec 2023 21:28:35 -0800
Subject: [PATCH 287/352] cover-letter: iommufd: Add VIOMMU infrastructure
 (Part-1)

This series introduces a new VIOMMU infrastructure and related ioctls.

IOMMUFD has been using the HWPT infrastructure for all cases, including
nested IO page table support. Yet there are limitations for an HWPT-based
structure to support some advanced HW-accelerated features, such as CMDQV
on NVIDIA Grace and HW-accelerated vIOMMU on AMD. Even in a multi-IOMMU
environment, it is not straightforward for nested HWPTs to share the same
parent HWPT (stage-2 IO pagetable) with the HWPT infrastructure alone.

The new VIOMMU object is an additional layer, between the nested HWPT and
its parent HWPT, to give both the IOMMUFD core and an IOMMU driver an
additional structure to support HW-accelerated features:

                     ----------------------------
 ----------------   |         | paging_hwpt0 |
 | hwpt_nested0 |--->| viommu0 ------------------
 ----------------   |         | HW-accel feats |
                     ----------------------------

On a multi-IOMMU system, the VIOMMU object can be instanced to the number
of vIOMMUs in a guest VM, while holding the same parent HWPT to share the
stage-2 IO pagetable. Each VIOMMU then only needs to allocate its own VMID
to attach the shared stage-2 IO pagetable to the physical IOMMU:

                     ----------------------------
 ----------------   |         | paging_hwpt0 |
 | hwpt_nested0 |--->| viommu0 ------------------
 ----------------   |         | VMID0 |
                     ----------------------------
                     ----------------------------
 ----------------   |         | paging_hwpt0 |
 | hwpt_nested1 |--->| viommu1 ------------------
 ----------------   |         | VMID1 |
                     ----------------------------

As an initial part-1, add ioctls to support a VIOMMU-based invalidation:
    IOMMUFD_CMD_VIOMMU_ALLOC to allocate a VIOMMU object
    IOMMUFD_CMD_VIOMMU_SET/UNSET_VDEV_ID to set/clear device's virtual ID
    (Reuse IOMMUFD_CMD_HWPT_INVALIDATE for a VIOMMU object to flush cache
     by a given driver data)

Worth noting that the VDEV_ID is for a per-VIOMMU device list for drivers
to look up the device's physical instance from its virtual ID in a VM. It
is essential for a VIOMMU-based invalidation where the request contains a
device's virtual ID for its device cache flush, e.g. ATC invalidation.

As for the implementation of the series, add an IOMMU_VIOMMU_TYPE_DEFAULT
type for a core-allocated, core-managed VIOMMU object, allowing drivers to
simply hook a default set of viommu ops for viommu-based invalidation
alone. And provide some viommu helpers to drivers for VDEV_ID translation
and parent domain lookup.

Add VIOMMU invalidation support to the ARM SMMUv3 driver for a real world
use case.
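For a rough idea of the resulting user-space flow (a hypothetical and
abbreviated sketch, not part of the series: IOMMU_VIOMMU_ALLOC,
IOMMU_VIOMMU_SET_VDEV_ID and IOMMU_VIOMMU_TYPE_DEFAULT follow the uAPI
names above, while the exact struct layouts, the out_viommu_id field, and
the dev_id/s2_hwpt_id/guest_sid variables are assumptions here):

	/* Allocate a viommu on top of the shared stage-2 parent HWPT */
	struct iommu_viommu_alloc valloc = {
		.size = sizeof(valloc),
		.type = IOMMU_VIOMMU_TYPE_DEFAULT,
		.dev_id = dev_id,	/* iommufd ID of the physical device */
		.hwpt_id = s2_hwpt_id,	/* shared stage-2 parent HWPT */
	};
	ioctl(iommufd, IOMMU_VIOMMU_ALLOC, &valloc);

	/* Link the guest's virtual device ID to the physical device */
	struct iommu_viommu_set_vdev_id set = {
		.size = sizeof(set),
		.viommu_id = valloc.out_viommu_id,
		.dev_id = dev_id,
		.vdev_id = guest_sid,	/* e.g. the guest's Stream ID */
	};
	ioctl(iommufd, IOMMU_VIOMMU_SET_VDEV_ID, &set);

	/*
	 * Guest cache-flush commands that carry guest_sid (e.g. an ATC
	 * invalidation) can now be forwarded with IOMMU_HWPT_INVALIDATE
	 * against the viommu ID instead of a HWPT ID.
	 */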
This adds support for arm-smmu-v3's CMDQ_OP_ATC_INV and CMDQ_OP_CFGI_CD/ALL
commands, supplementing HWPT-based invalidations.

In the future, a driver will also be able to choose a driver-managed type
to hold its own structure, by adding a new type to enum iommu_viommu_type.
More VIOMMU-based structures and ioctls will be introduced in part-2/3 to
support a driver-managed VIOMMU, e.g. a VQUEUE object for a HW-accelerated
queue and a VIRQ (or VEVENT) object for IRQ injections.

Although we repurposed the VIOMMU object from an earlier RFC discussion,
for reference:
https://lore.kernel.org/all/cover.1712978212.git.nicolinc@nvidia.com/

This series is on Github:
https://github.com/nicolinc/iommufd/commits/iommufd_viommu_p1-v2
Pairing QEMU branch for testing:
https://github.com/nicolinc/qemu/commits/wip/for_iommufd_viommu_p1-v2

Changelog
v2
 * Limited vdev_id to one per idev
 * Added a rw_sem to protect the vdev_id list
 * Reworked driver-level APIs with proper locking
 * Added a new viommu_api file for the IOMMUFD_DRIVER config
 * Dropped the useless iommu_dev pointer from the viommu structure
 * Added missing index numbers to new types in the uAPI header
 * Dropped the IOMMU_VIOMMU_INVALIDATE uAPI; instead, reuse the HWPT one
 * Reworked mock_viommu_cache_invalidate() using the new iommu helper
 * Reordered details of set/unset_vdev_id handlers for proper locking
v1
 https://lore.kernel.org/all/cover.1723061377.git.nicolinc@nvidia.com/

Thanks!
Nicolin

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs

From 453a83234db08d0169fc51f9eb0543a9595e9cf6 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Wed, 10 Jul 2024 20:28:33 +0000
Subject: [PATCH 288/352] iommufd: Rename IOMMUFD_OBJ_FAULT to IOMMUFD_OBJ_EVENT_IOPF

A fault object was designed exclusively for hwpt's IO page faults, but the
implementation of the object could actually be used for other purposes too,
such as hardware IRQs and events.

Define a common event structure. Embed it into another iommufd_event_iopf,
similar to hwpt_paging holding a common hwpt. Roll out a minimal level of
renamings and abstractions. Add common event ops to prepare for
IOMMUFD_OBJ_EVENT_VIRQ. Also move event handlers to the header, which will
be called by a viommu_api module for IOMMUFD_DRIVER.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/fault.c | 270 +++++++++++++----------- drivers/iommu/iommufd/hw_pagetable.c | 12 +- drivers/iommu/iommufd/iommufd_private.h | 87 +++++--- drivers/iommu/iommufd/main.c | 8 +- 4 files changed, 224 insertions(+), 153 deletions(-) diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/fault.c index df03411c87289..8fea142e1ac20 100644 --- a/drivers/iommu/iommufd/fault.c +++ b/drivers/iommu/iommufd/fault.c @@ -16,7 +16,9 @@ #include "../iommu-priv.h" #include "iommufd_private.h" -static int iommufd_fault_iopf_enable(struct iommufd_device *idev) +/* IOMMUFD_OBJ_EVENT_IOPF Functions */ + +static int iommufd_event_iopf_enable(struct iommufd_device *idev) { struct device *dev = idev->dev; int ret; @@ -45,7 +47,7 @@ static int iommufd_fault_iopf_enable(struct iommufd_device *idev) return ret; } -static void iommufd_fault_iopf_disable(struct iommufd_device *idev) +static void iommufd_event_iopf_disable(struct iommufd_device *idev) { mutex_lock(&idev->iopf_lock); if (!WARN_ON(idev->iopf_enabled == 0)) { @@ -55,8 +57,8 @@ static void iommufd_fault_iopf_disable(struct iommufd_device *idev) mutex_unlock(&idev->iopf_lock); } -static int __fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) +static int __event_iopf_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) { struct iommufd_attach_handle *handle; int ret; @@ -74,37 +76,37 @@ static int __fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, return ret; } -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) +int iommufd_event_iopf_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) { int ret; if (!hwpt->fault) return -EINVAL; - ret = iommufd_fault_iopf_enable(idev); + ret = iommufd_event_iopf_enable(idev); if (ret) return ret; - ret = __fault_domain_attach_dev(hwpt, idev); + ret = __event_iopf_domain_attach_dev(hwpt, idev); if (ret) - iommufd_fault_iopf_disable(idev); + iommufd_event_iopf_disable(idev); return ret; } -static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, - struct iommufd_attach_handle *handle) +static void iommufd_event_iopf_auto_response(struct iommufd_hw_pagetable *hwpt, + struct iommufd_attach_handle *handle) { - struct iommufd_fault *fault = hwpt->fault; + struct iommufd_event_iopf *fault = hwpt->fault; struct iopf_group *group, *next; unsigned long index; if (!fault) return; - mutex_lock(&fault->mutex); - list_for_each_entry_safe(group, next, &fault->deliver, node) { + mutex_lock(&fault->common.mutex); + list_for_each_entry_safe(group, next, &fault->common.deliver, node) { if (group->attach_handle != &handle->handle) continue; list_del(&group->node); @@ -119,7 +121,7 @@ static void iommufd_auto_response_faults(struct iommufd_hw_pagetable *hwpt, iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); iopf_free_group(group); } - mutex_unlock(&fault->mutex); + mutex_unlock(&fault->common.mutex); } static struct iommufd_attach_handle * @@ -134,21 +136,21 @@ iommufd_device_get_attach_handle(struct iommufd_device *idev) return to_iommufd_handle(handle); } -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev) +void iommufd_event_iopf_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev) { struct iommufd_attach_handle *handle; handle = iommufd_device_get_attach_handle(idev); iommu_detach_group_handle(hwpt->domain, idev->igroup->group); - 
iommufd_auto_response_faults(hwpt, handle); - iommufd_fault_iopf_disable(idev); + iommufd_event_iopf_auto_response(hwpt, handle); + iommufd_event_iopf_disable(idev); kfree(handle); } -static int __fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) +static int __event_iopf_domain_replace_dev(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) { struct iommufd_attach_handle *handle, *curr = NULL; int ret; @@ -171,43 +173,44 @@ static int __fault_domain_replace_dev(struct iommufd_device *idev, } if (!ret && curr) { - iommufd_auto_response_faults(old, curr); + iommufd_event_iopf_auto_response(old, curr); kfree(curr); } return ret; } -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old) +int iommufd_event_iopf_domain_replace_dev(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old) { bool iopf_off = !hwpt->fault && old->fault; bool iopf_on = hwpt->fault && !old->fault; int ret; if (iopf_on) { - ret = iommufd_fault_iopf_enable(idev); + ret = iommufd_event_iopf_enable(idev); if (ret) return ret; } - ret = __fault_domain_replace_dev(idev, hwpt, old); + ret = __event_iopf_domain_replace_dev(idev, hwpt, old); if (ret) { if (iopf_on) - iommufd_fault_iopf_disable(idev); + iommufd_event_iopf_disable(idev); return ret; } if (iopf_off) - iommufd_fault_iopf_disable(idev); + iommufd_event_iopf_disable(idev); return 0; } -void iommufd_fault_destroy(struct iommufd_object *obj) +void iommufd_event_iopf_destroy(struct iommufd_object *obj) { - struct iommufd_fault *fault = container_of(obj, struct iommufd_fault, obj); + struct iommufd_event *event = + container_of(obj, struct iommufd_event, obj); struct iopf_group *group, *next; /* @@ -216,17 +219,17 @@ void iommufd_fault_destroy(struct iommufd_object *obj) * accessing this pointer. Therefore, acquiring the mutex here * is unnecessary. 
*/ - list_for_each_entry_safe(group, next, &fault->deliver, node) { + list_for_each_entry_safe(group, next, &event->deliver, node) { list_del(&group->node); iopf_group_response(group, IOMMU_PAGE_RESP_INVALID); iopf_free_group(group); } } -static void iommufd_compose_fault_message(struct iommu_fault *fault, - struct iommu_hwpt_pgfault *hwpt_fault, - struct iommufd_device *idev, - u32 cookie) +static void iommufd_compose_iopf_message(struct iommu_fault *fault, + struct iommu_hwpt_pgfault *hwpt_fault, + struct iommufd_device *idev, + u32 cookie) { hwpt_fault->flags = fault->prm.flags; hwpt_fault->dev_id = idev->obj.id; @@ -238,11 +241,12 @@ static void iommufd_compose_fault_message(struct iommu_fault *fault, hwpt_fault->cookie = cookie; } -static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t iommufd_event_iopf_fops_read(struct iommufd_event *event, + char __user *buf, size_t count, + loff_t *ppos) { + struct iommufd_event_iopf *fault = to_event_iopf(event); size_t fault_size = sizeof(struct iommu_hwpt_pgfault); - struct iommufd_fault *fault = filep->private_data; struct iommu_hwpt_pgfault data; struct iommufd_device *idev; struct iopf_group *group; @@ -253,9 +257,9 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, if (*ppos || count % fault_size) return -ESPIPE; - mutex_lock(&fault->mutex); - while (!list_empty(&fault->deliver) && count > done) { - group = list_first_entry(&fault->deliver, + mutex_lock(&event->mutex); + while (!list_empty(&event->deliver) && count > done) { + group = list_first_entry(&event->deliver, struct iopf_group, node); if (group->fault_count * fault_size > count - done) @@ -268,9 +272,8 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, idev = to_iommufd_handle(group->attach_handle)->idev; list_for_each_entry(iopf, &group->faults, list) { - iommufd_compose_fault_message(&iopf->fault, - &data, idev, - group->cookie); + iommufd_compose_iopf_message(&iopf->fault, &data, + idev, group->cookie); if (copy_to_user(buf + done, &data, fault_size)) { xa_erase(&fault->response, group->cookie); rc = -EFAULT; @@ -281,16 +284,17 @@ static ssize_t iommufd_fault_fops_read(struct file *filep, char __user *buf, list_del(&group->node); } - mutex_unlock(&fault->mutex); + mutex_unlock(&event->mutex); return done == 0 ? rc : done; } -static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *buf, - size_t count, loff_t *ppos) +static ssize_t iommufd_event_iopf_fops_write(struct iommufd_event *event, + const char __user *buf, + size_t count, loff_t *ppos) { size_t response_size = sizeof(struct iommu_hwpt_page_response); - struct iommufd_fault *fault = filep->private_data; + struct iommufd_event_iopf *fault = to_event_iopf(event); struct iommu_hwpt_page_response response; struct iopf_group *group; size_t done = 0; @@ -299,7 +303,7 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b if (*ppos || count % response_size) return -ESPIPE; - mutex_lock(&fault->mutex); + mutex_lock(&event->mutex); while (count > done) { rc = copy_from_user(&response, buf + done, response_size); if (rc) @@ -325,119 +329,149 @@ static ssize_t iommufd_fault_fops_write(struct file *filep, const char __user *b iopf_free_group(group); done += response_size; } - mutex_unlock(&fault->mutex); + mutex_unlock(&event->mutex); return done == 0 ? 
rc : done; } -static __poll_t iommufd_fault_fops_poll(struct file *filep, +static const struct iommufd_event_ops iommufd_event_iopf_ops = { + .read = &iommufd_event_iopf_fops_read, + .write = &iommufd_event_iopf_fops_write, +}; + +/* Common Event Functions */ + +static ssize_t iommufd_event_fops_read(struct file *filep, char __user *buf, + size_t count, loff_t *ppos) +{ + struct iommufd_event *event = filep->private_data; + + if (!event->ops || !event->ops->read) + return -EOPNOTSUPP; + return event->ops->read(event, buf, count, ppos); +} + +static ssize_t iommufd_event_fops_write(struct file *filep, + const char __user *buf, + size_t count, loff_t *ppos) +{ + struct iommufd_event *event = filep->private_data; + + if (!event->ops || !event->ops->write) + return -EOPNOTSUPP; + return event->ops->write(event, buf, count, ppos); +} + +static __poll_t iommufd_event_fops_poll(struct file *filep, struct poll_table_struct *wait) { - struct iommufd_fault *fault = filep->private_data; + struct iommufd_event *event = filep->private_data; __poll_t pollflags = EPOLLOUT; - poll_wait(filep, &fault->wait_queue, wait); - mutex_lock(&fault->mutex); - if (!list_empty(&fault->deliver)) + poll_wait(filep, &event->wait_queue, wait); + mutex_lock(&event->mutex); + if (!list_empty(&event->deliver)) pollflags |= EPOLLIN | EPOLLRDNORM; - mutex_unlock(&fault->mutex); + mutex_unlock(&event->mutex); return pollflags; } -static int iommufd_fault_fops_release(struct inode *inode, struct file *filep) +static void iommufd_event_deinit(struct iommufd_event *event) { - struct iommufd_fault *fault = filep->private_data; + refcount_dec(&event->obj.users); + iommufd_ctx_put(event->ictx); + mutex_destroy(&event->mutex); +} - refcount_dec(&fault->obj.users); - iommufd_ctx_put(fault->ictx); +static int iommufd_event_fops_release(struct inode *inode, struct file *filep) +{ + iommufd_event_deinit((struct iommufd_event *)filep->private_data); return 0; } -static const struct file_operations iommufd_fault_fops = { +static const struct file_operations iommufd_event_fops = { .owner = THIS_MODULE, .open = nonseekable_open, - .read = iommufd_fault_fops_read, - .write = iommufd_fault_fops_write, - .poll = iommufd_fault_fops_poll, - .release = iommufd_fault_fops_release, + .read = iommufd_event_fops_read, + .write = iommufd_event_fops_write, + .poll = iommufd_event_fops_poll, + .release = iommufd_event_fops_release, .llseek = no_llseek, }; -int iommufd_fault_alloc(struct iommufd_ucmd *ucmd) +static int iommufd_event_init(struct iommufd_event *event, char *name, + struct iommufd_ctx *ictx, int *out_fdno, + const struct iommufd_event_ops *ops) { - struct iommu_fault_alloc *cmd = ucmd->cmd; - struct iommufd_fault *fault; struct file *filep; int fdno; + + event->ops = ops; + event->ictx = ictx; + INIT_LIST_HEAD(&event->deliver); + mutex_init(&event->mutex); + init_waitqueue_head(&event->wait_queue); + + filep = anon_inode_getfile(name, &iommufd_event_fops, + event, O_RDWR); + if (IS_ERR(filep)) + return PTR_ERR(filep); + + refcount_inc(&event->obj.users); + iommufd_ctx_get(event->ictx); + event->filep = filep; + + fdno = get_unused_fd_flags(O_CLOEXEC); + if (fdno < 0) { + fput(filep); + iommufd_event_deinit(event); + return fdno; + } + if (out_fdno) + *out_fdno = fdno; + return 0; +} + +int iommufd_event_iopf_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_fault_alloc *cmd = ucmd->cmd; + struct iommufd_event_iopf *event_iopf; + int fdno; int rc; if (cmd->flags) return -EOPNOTSUPP; - fault = iommufd_object_alloc(ucmd->ictx, fault, 
IOMMUFD_OBJ_FAULT); - if (IS_ERR(fault)) - return PTR_ERR(fault); + event_iopf = __iommufd_object_alloc(ucmd->ictx, event_iopf, + IOMMUFD_OBJ_EVENT_IOPF, common.obj); + if (IS_ERR(event_iopf)) + return PTR_ERR(event_iopf); - fault->ictx = ucmd->ictx; - INIT_LIST_HEAD(&fault->deliver); - xa_init_flags(&fault->response, XA_FLAGS_ALLOC1); - mutex_init(&fault->mutex); - init_waitqueue_head(&fault->wait_queue); + xa_init_flags(&event_iopf->response, XA_FLAGS_ALLOC1); - filep = anon_inode_getfile("[iommufd-pgfault]", &iommufd_fault_fops, - fault, O_RDWR); - if (IS_ERR(filep)) { - rc = PTR_ERR(filep); + rc = iommufd_event_init(&event_iopf->common, "[iommufd-pgfault]", + ucmd->ictx, &fdno, &iommufd_event_iopf_ops); + if (rc) goto out_abort; - } - - refcount_inc(&fault->obj.users); - iommufd_ctx_get(fault->ictx); - fault->filep = filep; - fdno = get_unused_fd_flags(O_CLOEXEC); - if (fdno < 0) { - rc = fdno; - goto out_fput; - } - - cmd->out_fault_id = fault->obj.id; + cmd->out_fault_id = event_iopf->common.obj.id; cmd->out_fault_fd = fdno; rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); if (rc) goto out_put_fdno; - iommufd_object_finalize(ucmd->ictx, &fault->obj); + iommufd_object_finalize(ucmd->ictx, &event_iopf->common.obj); - fd_install(fdno, fault->filep); + fd_install(fdno, event_iopf->common.filep); return 0; out_put_fdno: put_unused_fd(fdno); -out_fput: - fput(filep); - refcount_dec(&fault->obj.users); - iommufd_ctx_put(fault->ictx); + fput(event_iopf->common.filep); + iommufd_event_deinit(&event_iopf->common); out_abort: - iommufd_object_abort_and_destroy(ucmd->ictx, &fault->obj); + iommufd_object_abort_and_destroy(ucmd->ictx, &event_iopf->common.obj); return rc; } - -int iommufd_fault_iopf_handler(struct iopf_group *group) -{ - struct iommufd_hw_pagetable *hwpt; - struct iommufd_fault *fault; - - hwpt = group->attach_handle->domain->fault_data; - fault = hwpt->fault; - - mutex_lock(&fault->mutex); - list_add_tail(&group->node, &fault->deliver); - mutex_unlock(&fault->mutex); - - wake_up_interruptible(&fault->wait_queue); - - return 0; -} diff --git a/drivers/iommu/iommufd/hw_pagetable.c b/drivers/iommu/iommufd/hw_pagetable.c index 6aaec1b32abc3..ca5c003a02da3 100644 --- a/drivers/iommu/iommufd/hw_pagetable.c +++ b/drivers/iommu/iommufd/hw_pagetable.c @@ -14,7 +14,7 @@ static void __iommufd_hwpt_destroy(struct iommufd_hw_pagetable *hwpt) iommu_domain_free(hwpt->domain); if (hwpt->fault) - refcount_dec(&hwpt->fault->obj.users); + refcount_dec(&hwpt->fault->common.obj.users); } void iommufd_hwpt_paging_destroy(struct iommufd_object *obj) @@ -342,18 +342,18 @@ int iommufd_hwpt_alloc(struct iommufd_ucmd *ucmd) } if (cmd->flags & IOMMU_HWPT_FAULT_ID_VALID) { - struct iommufd_fault *fault; + struct iommufd_event_iopf *fault; - fault = iommufd_get_fault(ucmd, cmd->fault_id); + fault = iommufd_get_event_iopf(ucmd, cmd->fault_id); if (IS_ERR(fault)) { rc = PTR_ERR(fault); goto out_hwpt; } hwpt->fault = fault; - hwpt->domain->iopf_handler = iommufd_fault_iopf_handler; + hwpt->domain->iopf_handler = iommufd_event_iopf_handler; hwpt->domain->fault_data = hwpt; - refcount_inc(&fault->obj.users); - iommufd_put_object(ucmd->ictx, &fault->obj); + refcount_inc(&fault->common.obj.users); + iommufd_put_object(ucmd->ictx, &fault->common.obj); } cmd->out_hwpt_id = hwpt->obj.id; diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index eb580360a2f43..2d9f257f3ed5f 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -19,6 
+19,7 @@ struct iommu_domain; struct iommu_group; struct iommu_option; struct iommufd_device; +struct iommufd_event; struct iommufd_ctx { struct file *file; @@ -131,7 +132,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_HWPT_NESTED, IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, - IOMMUFD_OBJ_FAULT, + IOMMUFD_OBJ_EVENT_IOPF, IOMMUFD_OBJ_VIOMMU, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, @@ -297,7 +298,7 @@ int iommufd_check_iova_range(struct io_pagetable *iopt, struct iommufd_hw_pagetable { struct iommufd_object obj; struct iommu_domain *domain; - struct iommufd_fault *fault; + struct iommufd_event_iopf *fault; }; struct iommufd_hwpt_paging { @@ -437,24 +438,42 @@ void iopt_remove_access(struct io_pagetable *iopt, u32 iopt_access_list_id); void iommufd_access_destroy_object(struct iommufd_object *obj); +struct iommufd_event_ops { + ssize_t (*read)(struct iommufd_event *event, char __user *buf, + size_t count, loff_t *ppos); + ssize_t (*write)(struct iommufd_event *event, const char __user *buf, + size_t count, loff_t *ppos); +}; + /* - * An iommufd_fault object represents an interface to deliver I/O page faults - * to the user space. These objects are created/destroyed by the user space and - * associated with hardware page table objects during page-table allocation. + * An iommufd_event object represents an interface to deliver IOMMU events + * to the user space. These objects are created/destroyed by the user space. */ -struct iommufd_fault { +struct iommufd_event { struct iommufd_object obj; struct iommufd_ctx *ictx; struct file *filep; - /* The lists of outstanding faults protected by below mutex. */ + const struct iommufd_event_ops *ops; + + /* The lists of outstanding events protected by below mutex. */ struct mutex mutex; struct list_head deliver; - struct xarray response; struct wait_queue_head wait_queue; }; +static inline int iommufd_event_notify(struct iommufd_event *event, + struct list_head *node) +{ + mutex_lock(&event->mutex); + list_add_tail(node, &event->deliver); + mutex_unlock(&event->mutex); + + wake_up_interruptible(&event->wait_queue); + return 0; +} + struct iommufd_attach_handle { struct iommu_attach_handle handle; struct iommufd_device *idev; @@ -463,31 +482,49 @@ struct iommufd_attach_handle { /* Convert an iommu attach handle to iommufd handle. 
*/ #define to_iommufd_handle(hdl) container_of(hdl, struct iommufd_attach_handle, handle) -static inline struct iommufd_fault * -iommufd_get_fault(struct iommufd_ucmd *ucmd, u32 id) +struct iommufd_event_iopf { + struct iommufd_event common; + struct xarray response; +}; + +static inline struct iommufd_event_iopf * +to_event_iopf(struct iommufd_event *event) +{ + return container_of(event, struct iommufd_event_iopf, common); +} + +static inline struct iommufd_event_iopf * +iommufd_get_event_iopf(struct iommufd_ucmd *ucmd, u32 id) { return container_of(iommufd_get_object(ucmd->ictx, id, - IOMMUFD_OBJ_FAULT), - struct iommufd_fault, obj); + IOMMUFD_OBJ_EVENT_IOPF), + struct iommufd_event_iopf, common.obj); } -int iommufd_fault_alloc(struct iommufd_ucmd *ucmd); -void iommufd_fault_destroy(struct iommufd_object *obj); -int iommufd_fault_iopf_handler(struct iopf_group *group); +int iommufd_event_iopf_alloc(struct iommufd_ucmd *ucmd); +void iommufd_event_iopf_destroy(struct iommufd_object *obj); + +static inline int iommufd_event_iopf_handler(struct iopf_group *group) +{ + struct iommufd_hw_pagetable *hwpt = + group->attach_handle->domain->fault_data; + + return iommufd_event_notify(&hwpt->fault->common, &group->node); +} -int iommufd_fault_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -void iommufd_fault_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, - struct iommufd_device *idev); -int iommufd_fault_domain_replace_dev(struct iommufd_device *idev, - struct iommufd_hw_pagetable *hwpt, - struct iommufd_hw_pagetable *old); +int iommufd_event_iopf_domain_attach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); +void iommufd_event_iopf_domain_detach_dev(struct iommufd_hw_pagetable *hwpt, + struct iommufd_device *idev); +int iommufd_event_iopf_domain_replace_dev(struct iommufd_device *idev, + struct iommufd_hw_pagetable *hwpt, + struct iommufd_hw_pagetable *old); static inline int iommufd_hwpt_attach_device(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { if (hwpt->fault) - return iommufd_fault_domain_attach_dev(hwpt, idev); + return iommufd_event_iopf_domain_attach_dev(hwpt, idev); return iommu_attach_group(hwpt->domain, idev->igroup->group); } @@ -496,7 +533,7 @@ static inline void iommufd_hwpt_detach_device(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { if (hwpt->fault) - iommufd_fault_domain_detach_dev(hwpt, idev); + iommufd_event_iopf_domain_detach_dev(hwpt, idev); iommu_detach_group(hwpt->domain, idev->igroup->group); } @@ -506,7 +543,7 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, struct iommufd_hw_pagetable *old) { if (old->fault || hwpt->fault) - return iommufd_fault_domain_replace_dev(idev, hwpt, old); + return iommufd_event_iopf_domain_replace_dev(idev, hwpt, old); return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); } diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 199ad90fa36b1..015f492afab1c 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -359,8 +359,8 @@ struct iommufd_ioctl_op { } static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), - IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_fault_alloc, struct iommu_fault_alloc, - out_fault_fd), + IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_event_iopf_alloc, + struct iommu_fault_alloc, out_fault_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct 
iommu_hw_info, __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, @@ -525,8 +525,8 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { .destroy = iommufd_hwpt_nested_destroy, .abort = iommufd_hwpt_nested_abort, }, - [IOMMUFD_OBJ_FAULT] = { - .destroy = iommufd_fault_destroy, + [IOMMUFD_OBJ_EVENT_IOPF] = { + .destroy = iommufd_event_iopf_destroy, }, [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, From f03c0d4d144f0bdd46a95bfa5e7368a59ed842d1 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 10 Jul 2024 21:27:14 +0000 Subject: [PATCH 289/352] iommufd: Rename fault.c to event.c Rename the file, aligning with the new event object. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/Makefile | 2 +- drivers/iommu/iommufd/{fault.c => event.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename drivers/iommu/iommufd/{fault.c => event.c} (100%) diff --git a/drivers/iommu/iommufd/Makefile b/drivers/iommu/iommufd/Makefile index 288ef3e895e39..baabb714b56cb 100644 --- a/drivers/iommu/iommufd/Makefile +++ b/drivers/iommu/iommufd/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only iommufd-y := \ device.o \ - fault.o \ + event.o \ hw_pagetable.o \ io_pagetable.o \ ioas.o \ diff --git a/drivers/iommu/iommufd/fault.c b/drivers/iommu/iommufd/event.c similarity index 100% rename from drivers/iommu/iommufd/fault.c rename to drivers/iommu/iommufd/event.c From d4bb4ccc34384f031532a89434b930afae186cd7 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 8 Aug 2024 01:53:35 +0000 Subject: [PATCH 290/352] iommufd: Add IOMMUFD_OBJ_EVENT_VIRQ and IOMMUFD_CMD_VIRQ_ALLOC Allow a VIOMMU object to allocate VIRQ events. Each VIOMMU is allowed to have multiple VIRQ events but they must not have a duplicated type. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/event.c | 136 ++++++++++++++++++++++++ drivers/iommu/iommufd/iommufd_private.h | 54 ++++++++++ drivers/iommu/iommufd/main.c | 5 + drivers/iommu/iommufd/viommu.c | 2 + include/uapi/linux/iommufd.h | 32 ++++++ 5 files changed, 229 insertions(+) diff --git a/drivers/iommu/iommufd/event.c b/drivers/iommu/iommufd/event.c index 8fea142e1ac20..f10827ce9cbde 100644 --- a/drivers/iommu/iommufd/event.c +++ b/drivers/iommu/iommufd/event.c @@ -339,6 +339,67 @@ static const struct iommufd_event_ops iommufd_event_iopf_ops = { .write = &iommufd_event_iopf_fops_write, }; +/* IOMMUFD_OBJ_EVENT_VIRQ Functions */ + +void iommufd_event_virq_destroy(struct iommufd_object *obj) +{ + struct iommufd_event *event = + container_of(obj, struct iommufd_event, obj); + struct iommufd_event_virq *event_virq = to_event_virq(event); + struct iommufd_viommu_irq *virq, *next; + + /* + * The iommufd object's reference count is zero at this point. + * We can be confident that no other threads are currently + * accessing this pointer. Therefore, acquiring the mutex here + * is unnecessary. 
+ */ + list_for_each_entry_safe(virq, next, &event->deliver, node) { + list_del(&virq->node); + kfree(virq); + } + destroy_workqueue(event_virq->irq_wq); + list_del(&event_virq->node); + refcount_dec(&event_virq->viommu->obj.users); +} + +static ssize_t +iommufd_event_virq_fops_read(struct iommufd_event *event, + char __user *buf, size_t count, loff_t *ppos) +{ + size_t done = 0; + int rc = 0; + + if (*ppos) + return -ESPIPE; + + mutex_lock(&event->mutex); + while (!list_empty(&event->deliver) && count > done) { + struct iommufd_viommu_irq *virq = + list_first_entry(&event->deliver, + struct iommufd_viommu_irq, node); + void *virq_data = (void *)virq + sizeof(*virq); + + if (virq->irq_len > count - done) + break; + + if (copy_to_user(buf + done, virq_data, virq->irq_len)) { + rc = -EFAULT; + break; + } + done += virq->irq_len; + list_del(&virq->node); + kfree(virq); + } + mutex_unlock(&event->mutex); + + return done == 0 ? rc : done; +} + +static const struct iommufd_event_ops iommufd_event_virq_ops = { + .read = &iommufd_event_virq_fops_read, +}; + /* Common Event Functions */ static ssize_t iommufd_event_fops_read(struct file *filep, char __user *buf, @@ -475,3 +536,78 @@ int iommufd_event_iopf_alloc(struct iommufd_ucmd *ucmd) return rc; } + +int iommufd_event_virq_alloc(struct iommufd_ucmd *ucmd) +{ + struct iommu_virq_alloc *cmd = ucmd->cmd; + struct iommufd_event_virq *event_virq; + struct workqueue_struct *irq_wq; + struct iommufd_viommu *viommu; + int fdno; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + if (cmd->type == IOMMU_VIRQ_TYPE_NONE) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + down_write(&viommu->virqs_rwsem); + + if (iommufd_viommu_find_event_virq(viommu, cmd->type)) { + rc = -EEXIST; + goto out_unlock_virqs; + } + + event_virq = __iommufd_object_alloc(ucmd->ictx, event_virq, + IOMMUFD_OBJ_EVENT_VIRQ, common.obj); + if (IS_ERR(event_virq)) { + rc = PTR_ERR(event_virq); + goto out_unlock_virqs; + } + + irq_wq = alloc_workqueue("viommu_irq/%d", WQ_UNBOUND, 0, + event_virq->common.obj.id); + if (!irq_wq) { + rc = -ENOMEM; + goto out_abort; + } + + rc = iommufd_event_init(&event_virq->common, "[iommufd-viommu-irq]", + ucmd->ictx, &fdno, &iommufd_event_virq_ops); + if (rc) + goto out_irq_wq; + + event_virq->irq_wq = irq_wq; + event_virq->viommu = viommu; + event_virq->type = cmd->type; + cmd->out_virq_id = event_virq->common.obj.id; + cmd->out_virq_fd = fdno; + + rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd)); + if (rc) + goto out_put_fdno; + iommufd_object_finalize(ucmd->ictx, &event_virq->common.obj); + + fd_install(fdno, event_virq->common.filep); + + list_add_tail(&event_virq->node, &viommu->virqs); + refcount_inc(&viommu->obj.users); + + goto out_unlock_virqs; +out_put_fdno: + put_unused_fd(fdno); + fput(event_virq->common.filep); + iommufd_event_deinit(&event_virq->common); +out_irq_wq: + destroy_workqueue(irq_wq); +out_abort: + iommufd_object_abort_and_destroy(ucmd->ictx, &event_virq->common.obj); +out_unlock_virqs: + up_write(&viommu->virqs_rwsem); + iommufd_put_object(ucmd->ictx, &viommu->obj); + + return rc; +} diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 2d9f257f3ed5f..9791cf4a7ff00 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -133,6 +133,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_IOAS, IOMMUFD_OBJ_ACCESS, IOMMUFD_OBJ_EVENT_IOPF, + IOMMUFD_OBJ_EVENT_VIRQ, 
IOMMUFD_OBJ_VIOMMU, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, @@ -548,6 +549,43 @@ static inline int iommufd_hwpt_replace_device(struct iommufd_device *idev, return iommu_group_replace_domain(idev->igroup->group, hwpt->domain); } +struct iommufd_event_virq { + struct iommufd_event common; + struct iommufd_viommu *viommu; + struct workqueue_struct *irq_wq; + struct list_head node; + + unsigned int type; +}; + +static inline struct iommufd_event_virq * +to_event_virq(struct iommufd_event *event) +{ + return container_of(event, struct iommufd_event_virq, common); +} + +static inline struct iommufd_event_virq * +iommufd_get_event_virq(struct iommufd_ucmd *ucmd, u32 id) +{ + return container_of(iommufd_get_object(ucmd->ictx, id, + IOMMUFD_OBJ_EVENT_VIRQ), + struct iommufd_event_virq, common.obj); +} + +int iommufd_event_virq_alloc(struct iommufd_ucmd *ucmd); +void iommufd_event_virq_destroy(struct iommufd_object *obj); + +struct iommufd_viommu_irq { + struct iommufd_event_virq *event_virq; + struct list_head node; + ssize_t irq_len; +}; + +static inline int iommufd_event_virq_handler(struct iommufd_viommu_irq *virq) +{ + return iommufd_event_notify(&virq->event_virq->common, &virq->node); +} + struct iommufd_viommu { struct iommufd_object obj; struct iommufd_ctx *ictx; @@ -556,6 +594,8 @@ struct iommufd_viommu { /* The locking order is vdev_ids_rwsem -> igroup::lock */ struct rw_semaphore vdev_ids_rwsem; struct xarray vdev_ids; + struct rw_semaphore virqs_rwsem; + struct list_head virqs; const struct iommufd_viommu_ops *ops; @@ -576,6 +616,20 @@ iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) struct iommufd_viommu, obj); } +static inline struct iommufd_event_virq * +iommufd_viommu_find_event_virq(struct iommufd_viommu *viommu, u32 type) +{ + struct iommufd_event_virq *event_virq, *next; + + lockdep_assert_held(&viommu->virqs_rwsem); + + list_for_each_entry_safe(event_virq, next, &viommu->virqs, node) { + if (event_virq->type == type) + return event_virq; + } + return NULL; +} + int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_viommu_set_vdev_id(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 015f492afab1c..22381ba031b5e 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -361,6 +361,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { IOCTL_OP(IOMMU_DESTROY, iommufd_destroy, struct iommu_destroy, id), IOCTL_OP(IOMMU_FAULT_QUEUE_ALLOC, iommufd_event_iopf_alloc, struct iommu_fault_alloc, out_fault_fd), + IOCTL_OP(IOMMU_VIRQ_ALLOC, iommufd_event_virq_alloc, + struct iommu_virq_alloc, out_virq_fd), IOCTL_OP(IOMMU_GET_HW_INFO, iommufd_get_hw_info, struct iommu_hw_info, __reserved), IOCTL_OP(IOMMU_HWPT_ALLOC, iommufd_hwpt_alloc, struct iommu_hwpt_alloc, @@ -528,6 +530,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_EVENT_IOPF] = { .destroy = iommufd_event_iopf_destroy, }, + [IOMMUFD_OBJ_EVENT_VIRQ] = { + .destroy = iommufd_event_virq_destroy, + }, [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, }, diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index a4ba8bff4a267..9adc9c62ada9e 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -67,6 +67,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) xa_init(&viommu->vdev_ids); init_rwsem(&viommu->vdev_ids_rwsem); + INIT_LIST_HEAD(&viommu->virqs); + 
init_rwsem(&viommu->virqs_rwsem);
 
 	refcount_inc(&viommu->hwpt->common.obj.users);
 
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 0d973486b604c..f9ec07efed8d6 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -54,6 +54,7 @@ enum {
 	IOMMUFD_CMD_VIOMMU_ALLOC = 0x8f,
 	IOMMUFD_CMD_VIOMMU_SET_VDEV_ID = 0x90,
 	IOMMUFD_CMD_VIOMMU_UNSET_VDEV_ID = 0x91,
+	IOMMUFD_CMD_VIRQ_ALLOC = 0x92,
 };
 
 /**
@@ -951,4 +952,35 @@ struct iommu_viommu_unset_vdev_id {
 	__aligned_u64 vdev_id;
 };
 #define IOMMU_VIOMMU_UNSET_VDEV_ID _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIOMMU_UNSET_VDEV_ID)
+
+/**
+ * enum iommu_virq_type - Virtual IRQ Type
+ * @IOMMU_VIRQ_TYPE_NONE: INVALID type
+ */
+enum iommu_virq_type {
+	IOMMU_VIRQ_TYPE_NONE = 0,
+};
+
+/**
+ * struct iommu_virq_alloc - ioctl(IOMMU_VIRQ_ALLOC)
+ * @size: sizeof(struct iommu_virq_alloc)
+ * @flags: Must be 0
+ * @viommu_id: viommu ID to associate the virtual IRQ with
+ * @type: Type of the virtual IRQ. Must be defined in enum iommu_virq_type
+ * @out_virq_id: The ID of the new VIRQ
+ * @out_virq_fd: The fd of the new VIRQ
+ *
+ * Explicitly allocate a virtual IRQ handler for a VIOMMU. A VIOMMU can have
+ * multiple FDs for different @type, but is confined to have only one FD per
+ * @type.
+ */
+struct iommu_virq_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 viommu_id;
+	__u32 type;
+	__u32 out_virq_id;
+	__u32 out_virq_fd;
+};
+#define IOMMU_VIRQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIRQ_ALLOC)
 #endif

From d44fbf5d5e90742c5f17eaf6b38c37c3fa3a4c15 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Sat, 1 Jun 2024 01:56:09 +0000
Subject: [PATCH 291/352] iommufd/viommu: Allow drivers to control vdev_id
 lifecycle

The iommufd core provides a lookup helper for an IOMMU driver to find a
device pointer by device's per-viommu virtual ID. Yet a driver may need an
inverted lookup to find a device's per-viommu virtual ID by a device
pointer, e.g. when reporting virtual IRQs/events back to the user space.
In this case, it'd be unsafe for the iommufd core to do an inverted
lookup, as the driver can't track the lifecycle of a viommu object or a
vdev_id object.

Meanwhile, some HW can even support virtual device ID lookup by its HW-
accelerated virtualization feature. E.g. Tegra241 CMDQV HW supports
executing vanilla guest-issued SMMU commands containing a virtual Stream
ID, but requires software to configure a link between the virtual Stream
ID and the physical Stream ID via HW registers. So not only does the
iommufd core need a vdev_id lookup table, drivers will want one too.

Given the two justifications above, it's best practice to provide a pair
of set_vdev_id/unset_vdev_id ops in the viommu ops, so a driver can
implement them to control a vdev_id's lifecycle, and configure the HW
properly if required.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/device.c | 2 ++ drivers/iommu/iommufd/iommufd_private.h | 6 ------ drivers/iommu/iommufd/viommu.c | 23 +++++++++++++++++++---- include/linux/iommufd.h | 13 +++++++++++++ 4 files changed, 34 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index 670a9c1ab8f75..dfe7a37053013 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -145,6 +145,8 @@ void iommufd_device_destroy(struct iommufd_object *obj) old = xa_cmpxchg(&viommu->vdev_ids, vdev_id->id, vdev_id, NULL, GFP_KERNEL); WARN_ON(old != vdev_id); + if (vdev_id->viommu->ops && vdev_id->viommu->ops->unset_vdev_id) + vdev_id->viommu->ops->unset_vdev_id(vdev_id); kfree(vdev_id); idev->vdev_id = NULL; } diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 9791cf4a7ff00..299ee05054f0c 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -602,12 +602,6 @@ struct iommufd_viommu { unsigned int type; }; -struct iommufd_vdev_id { - struct iommufd_viommu *viommu; - struct iommufd_device *idev; - u64 id; -}; - static inline struct iommufd_viommu * iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) { diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 9adc9c62ada9e..b1eb900b7fbf5 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -13,6 +13,8 @@ void iommufd_viommu_destroy(struct iommufd_object *obj) xa_for_each(&viommu->vdev_ids, index, vdev_id) { /* Unlocked since there should be no race in a destroy() */ + if (viommu->ops && viommu->ops->unset_vdev_id) + viommu->ops->unset_vdev_id(vdev_id); vdev_id->idev->vdev_id = NULL; kfree(vdev_id); } @@ -116,10 +118,18 @@ int iommufd_viommu_set_vdev_id(struct iommufd_ucmd *ucmd) goto out_unlock_igroup; } - vdev_id = kzalloc(sizeof(*vdev_id), GFP_KERNEL); - if (!vdev_id) { - rc = -ENOMEM; - goto out_unlock_igroup; + if (viommu->ops && viommu->ops->set_vdev_id) { + vdev_id = viommu->ops->set_vdev_id(viommu, idev->dev, cmd->vdev_id); + if (IS_ERR(vdev_id)) { + rc = PTR_ERR(vdev_id); + goto out_unlock_igroup; + } + } else { + vdev_id = kzalloc(sizeof(*vdev_id), GFP_KERNEL); + if (!vdev_id) { + rc = -ENOMEM; + goto out_unlock_igroup; + } } vdev_id->idev = idev; @@ -137,6 +147,8 @@ int iommufd_viommu_set_vdev_id(struct iommufd_ucmd *ucmd) goto out_unlock_igroup; out_free: + if (viommu->ops && viommu->ops->unset_vdev_id) + viommu->ops->unset_vdev_id(vdev_id); kfree(vdev_id); out_unlock_igroup: mutex_unlock(&idev->igroup->lock); @@ -185,6 +197,9 @@ int iommufd_viommu_unset_vdev_id(struct iommufd_ucmd *ucmd) rc = xa_err(old); goto out_unlock_igroup; } + + if (viommu->ops && viommu->ops->unset_vdev_id) + viommu->ops->unset_vdev_id(old); kfree(old); idev->vdev_id = NULL; diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index f7c265c6de7c1..c89583c7c7924 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -56,8 +56,18 @@ void iommufd_access_detach(struct iommufd_access *access); void iommufd_ctx_get(struct iommufd_ctx *ictx); +struct iommufd_vdev_id { + struct iommufd_viommu *viommu; + struct iommufd_device *idev; + u64 id; +}; + /** * struct iommufd_viommu_ops - viommu specific operations + * @set_vdev_id: Set a virtual device id for a device assigned to a viommu. + * Driver allocates an iommufd_vdev_id and return its pointer. + * @unset_vdev_id: Unset a virtual device id for a device assigned to a viommu. 
+ *		  iommufd core frees the memory pointed to by an iommufd_vdev_id.
 * @cache_invalidate: Flush hardware cache used by a viommu. It can be used for
 *                    any IOMMU hardware specific cache as long as a viommu has
 *                    enough information to identify it: for example, a VMID or
@@ -69,6 +79,9 @@ void iommufd_ctx_get(struct iommufd_ctx *ictx);
 *                    include/uapi/linux/iommufd.h
 */
 struct iommufd_viommu_ops {
+	struct iommufd_vdev_id *(*set_vdev_id)(struct iommufd_viommu *viommu,
+					       struct device *dev, u64 id);
+	void (*unset_vdev_id)(struct iommufd_vdev_id *vdev_id);
 	int (*cache_invalidate)(struct iommufd_viommu *viommu,
 				struct iommu_user_data_array *array);
 };

From c301d0865c7e1a335c2b29278edcba4ae73df06a Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Wed, 21 Aug 2024 20:38:43 +0000
Subject: [PATCH 292/352] iommufd/viommu: Add iommufd_vdev_id_to_dev helper

This helps drivers to get the dev pointer held by the vdev_id structure.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/iommu/iommufd/viommu_api.c | 14 ++++++++++++++
 include/linux/iommufd.h | 7 +++++++
 2 files changed, 21 insertions(+)

diff --git a/drivers/iommu/iommufd/viommu_api.c b/drivers/iommu/iommufd/viommu_api.c
index 3772a5892a6c8..82eb33e047cf8 100644
--- a/drivers/iommu/iommufd/viommu_api.c
+++ b/drivers/iommu/iommufd/viommu_api.c
@@ -51,3 +51,17 @@ iommufd_viommu_to_parent_domain(struct iommufd_viommu *viommu)
 	return viommu->hwpt->common.domain;
 }
 EXPORT_SYMBOL_NS_GPL(iommufd_viommu_to_parent_domain, IOMMUFD);
+
+/*
+ * Fetch the dev pointer in the vdev_id structure. Caller must ensure the
+ * lifecycle of the vdev_id structure, likely by adding a driver-level lock to
+ * protect the passed-in vdev_id from any race against a potential
+ * unset_vdev_id callback.
+ */
+struct device *iommufd_vdev_id_to_dev(struct iommufd_vdev_id *vdev_id)
+{
+	if (!vdev_id || !vdev_id->viommu)
+		return NULL;
+	return vdev_id->idev->dev;
+}
+EXPORT_SYMBOL_NS_GPL(iommufd_vdev_id_to_dev, IOMMUFD);
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index c89583c7c7924..88d6586a424f1 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -99,6 +99,7 @@ void iommufd_access_unpin_pages(struct iommufd_access *access,
 				unsigned long iova, unsigned long length);
 int iommufd_access_rw(struct iommufd_access *access, unsigned long iova,
 		      void *data, size_t len, unsigned int flags);
+struct device *iommufd_vdev_id_to_dev(struct iommufd_vdev_id *vdev_id);
 int iommufd_vfio_compat_ioas_get_id(struct iommufd_ctx *ictx, u32 *out_ioas_id);
 int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx);
 int iommufd_vfio_compat_set_no_iommu(struct iommufd_ctx *ictx);
@@ -138,6 +139,12 @@ static inline int iommufd_access_rw(struct iommufd_access *access, unsigned long
 	return -EOPNOTSUPP;
 }
 
+static inline struct device *
+iommufd_vdev_id_to_dev(struct iommufd_vdev_id *vdev_id)
+{
+	return NULL;
+}
+
 static inline int iommufd_vfio_compat_ioas_create(struct iommufd_ctx *ictx)
 {
 	return -EOPNOTSUPP;

From c760eaa9b0a79cde79fe468595b9e2175737f52c Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Wed, 21 Aug 2024 22:53:05 +0000
Subject: [PATCH 293/352] iommufd/viommu: Add iommufd_viommu_report_irq helper

This allows IOMMU drivers to report IRQs/events that belong to a viommu
back to user space hypervisors.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/viommu_api.c | 40 ++++++++++++++++++++++++++++++ include/linux/iommufd.h | 8 ++++++ 2 files changed, 48 insertions(+) diff --git a/drivers/iommu/iommufd/viommu_api.c b/drivers/iommu/iommufd/viommu_api.c index 82eb33e047cf8..d075727a1b38f 100644 --- a/drivers/iommu/iommufd/viommu_api.c +++ b/drivers/iommu/iommufd/viommu_api.c @@ -65,3 +65,43 @@ struct device *iommufd_vdev_id_to_dev(struct iommufd_vdev_id *vdev_id) return vdev_id->idev->dev; } EXPORT_SYMBOL_NS_GPL(iommufd_vdev_id_to_dev, IOMMUFD); + +/** + * IOMMU drivers can call this helper to report a per-VIOMMU virtual IRQ. Caller + * must ensure the lifecycle of the viommu object, likely by passing it from a + * vdev_id structure that was set via a set_vdev_id callback and by holding the + * same driver-level lock to protect the passed-in vdev_id from any race against + * a potential unset_vdev_id callback. + */ +void iommufd_viommu_report_irq(struct iommufd_viommu *viommu, unsigned int type, + void *irq_ptr, size_t irq_len) +{ + struct iommufd_event_virq *event_virq; + struct iommufd_viommu_irq *virq; + void *irq_data; + + might_sleep(); + + if (!viommu) + return; + + down_read(&viommu->virqs_rwsem); + + event_virq = iommufd_viommu_find_event_virq(viommu, type); + if (!event_virq) + goto out_unlock_vdev_ids; + + virq = kzalloc(sizeof(*virq) + irq_len, GFP_KERNEL); + if (!virq) + goto out_unlock_vdev_ids; + irq_data = (void *)virq + sizeof(*virq); + memcpy(irq_data, irq_ptr, irq_len); + + virq->event_virq = event_virq; + virq->irq_len = irq_len; + + iommufd_event_virq_handler(virq); +out_unlock_vdev_ids: + up_read(&viommu->virqs_rwsem); +} +EXPORT_SYMBOL_NS_GPL(iommufd_viommu_report_irq, IOMMUFD); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 88d6586a424f1..346a6257ed0ca 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -108,6 +108,8 @@ void iommufd_viommu_unlock_vdev_id(struct iommufd_viommu *viommu); struct device *iommufd_viommu_find_device(struct iommufd_viommu *viommu, u64 id); struct iommu_domain * iommufd_viommu_to_parent_domain(struct iommufd_viommu *viommu); +void iommufd_viommu_report_irq(struct iommufd_viommu *viommu, unsigned int type, + void *irq_ptr, size_t irq_len); #else /* !CONFIG_IOMMUFD */ static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file) { @@ -173,5 +175,11 @@ iommufd_viommu_to_parent_domain(struct iommufd_viommu *viommu) { return NULL; } + +static inline void +iommufd_viommu_report_irq(struct iommufd_viommu *viommu, unsigned int type, + void *irq_ptr, size_t irq_len) +{ +} #endif /* CONFIG_IOMMUFD */ #endif From 002511d7459fb5a9e1cc0bfef61794f2f1366504 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Wed, 21 Aug 2024 20:46:01 +0000 Subject: [PATCH 294/352] iommufd/selftest: Implement mock_viommu_set/unset_vdev_id So that the driver can take the control of vdev_id's lifecycle. This will be used by the VIRQ feature in the following patches. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/iommufd/selftest.c | 36 ++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 5298d9c11d3af..d7805d9e6d5c3 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -138,6 +138,8 @@ enum selftest_obj_type { struct mock_dev { struct device dev; + struct mutex lock; + struct iommufd_vdev_id *vdev_id; unsigned long flags; int id; u32 cache[MOCK_DEV_CACHE_NUM]; @@ -542,6 +544,36 @@ static int mock_dev_disable_feat(struct device *dev, enum iommu_dev_features fea return 0; } +static struct iommufd_vdev_id * +mock_viommu_set_vdev_id(struct iommufd_viommu *viommu, struct device *dev, + u64 id) +{ + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + struct iommufd_vdev_id *vdev_id; + + vdev_id = kzalloc(sizeof(*vdev_id), GFP_KERNEL); + if (!vdev_id) + return ERR_PTR(-ENOMEM); + + mutex_lock(&mdev->lock); + mdev->vdev_id = vdev_id; + mutex_unlock(&mdev->lock); + + return vdev_id; +} + +static void mock_viommu_unset_vdev_id(struct iommufd_vdev_id *vdev_id) +{ + struct device *dev = iommufd_vdev_id_to_dev(vdev_id); + struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); + + mutex_lock(&mdev->lock); + mdev->vdev_id = NULL; + mutex_unlock(&mdev->lock); + + /* IOMMUFD core frees the memory of vdev_id */ +} + static int mock_viommu_cache_invalidate(struct iommufd_viommu *viommu, struct iommu_user_data_array *array) { @@ -637,6 +669,8 @@ static const struct iommu_ops mock_ops = { .unmap_pages = mock_domain_unmap_pages, .iova_to_phys = mock_domain_iova_to_phys, .default_viommu_ops = &(struct iommufd_viommu_ops){ + .set_vdev_id = mock_viommu_set_vdev_id, + .unset_vdev_id = mock_viommu_unset_vdev_id, .cache_invalidate = mock_viommu_cache_invalidate, }, }, @@ -758,6 +792,7 @@ static void mock_dev_release(struct device *dev) struct mock_dev *mdev = container_of(dev, struct mock_dev, dev); ida_free(&mock_dev_ida, mdev->id); + mutex_destroy(&mdev->lock); kfree(mdev); } @@ -774,6 +809,7 @@ static struct mock_dev *mock_dev_create(unsigned long dev_flags) if (!mdev) return ERR_PTR(-ENOMEM); + mutex_init(&mdev->lock); device_initialize(&mdev->dev); mdev->flags = dev_flags; mdev->dev.release = mock_dev_release; From 670d5ef5a4e4fd1c2cc2621231bd1260e12c6cf9 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 18 Jul 2024 23:03:45 +0000 Subject: [PATCH 295/352] iommufd/selftest: Add IOMMU_TEST_OP_TRIGGER_VIRQ for VIRQ coverage The handler will get vdev_id structure from the given mdev and convert it to its per-viommu virtual device ID to mimic a real IOMMU driver. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs
---
 drivers/iommu/iommufd/iommufd_test.h | 10 ++++++++++
 drivers/iommu/iommufd/selftest.c | 30 ++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/drivers/iommu/iommufd/iommufd_test.h b/drivers/iommu/iommufd/iommufd_test.h
index 56bade6146ff9..736ae5f8152e8 100644
--- a/drivers/iommu/iommufd/iommufd_test.h
+++ b/drivers/iommu/iommufd/iommufd_test.h
@@ -24,6 +24,7 @@ enum {
 	IOMMU_TEST_OP_MD_CHECK_IOTLB,
 	IOMMU_TEST_OP_TRIGGER_IOPF,
 	IOMMU_TEST_OP_DEV_CHECK_CACHE,
+	IOMMU_TEST_OP_TRIGGER_VIRQ,
 };
 
 enum {
@@ -145,6 +146,9 @@ struct iommu_test_cmd {
 			__u32 id;
 			__u32 cache;
 		} check_dev_cache;
+		struct {
+			__u32 dev_id;
+		} trigger_virq;
 	};
 	__u32 last;
 };
@@ -210,4 +214,10 @@ struct iommu_viommu_invalidate_selftest {
 	__u32 cache_id;
 };
 
+#define IOMMU_VIRQ_TYPE_SELFTEST 0xbeefbeef
+
+struct iommu_viommu_irq_selftest {
+	__u32 vdev_id;
+};
+
 #endif
diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c
index d7805d9e6d5c3..469b9863be78a 100644
--- a/drivers/iommu/iommufd/selftest.c
+++ b/drivers/iommu/iommufd/selftest.c
@@ -1560,6 +1560,34 @@ static int iommufd_test_trigger_iopf(struct iommufd_ucmd *ucmd,
 	return 0;
 }
 
+static int iommufd_test_trigger_virq(struct iommufd_ucmd *ucmd,
+				     struct iommu_test_cmd *cmd)
+{
+	struct iommufd_device *idev;
+	struct mock_dev *mdev;
+
+	idev = iommufd_get_device(ucmd, cmd->trigger_virq.dev_id);
+	if (IS_ERR(idev))
+		return PTR_ERR(idev);
+	mdev = container_of(idev->dev, struct mock_dev, dev);
+
+	mutex_lock(&mdev->lock);
+	if (mdev->vdev_id) {
+		struct iommu_viommu_irq_selftest test = {
+			.vdev_id = mdev->vdev_id->id,
+		};
+
+		iommufd_viommu_report_irq(mdev->vdev_id->viommu,
+					  IOMMU_VIRQ_TYPE_SELFTEST,
+					  &test, sizeof(test));
+	}
+	mutex_unlock(&mdev->lock);
+
+	iommufd_put_object(ucmd->ictx, &idev->obj);
+
+	return 0;
+}
+
 void iommufd_selftest_destroy(struct iommufd_object *obj)
 {
 	struct selftest_obj *sobj = container_of(obj, struct selftest_obj, obj);
@@ -1641,6 +1669,8 @@ int iommufd_test(struct iommufd_ucmd *ucmd)
 						  cmd->dirty.flags);
 	case IOMMU_TEST_OP_TRIGGER_IOPF:
 		return iommufd_test_trigger_iopf(ucmd, cmd);
+	case IOMMU_TEST_OP_TRIGGER_VIRQ:
+		return iommufd_test_trigger_virq(ucmd, cmd);
 	default:
 		return -EOPNOTSUPP;
 	}

From 26788033434eae63bdadc21c72932aa782696574 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Thu, 18 Jul 2024 23:04:02 +0000
Subject: [PATCH 296/352] iommufd/selftest: Add EVENT_VIRQ test coverage

Trigger an IRQ for a given idev ID, to test whether the loopback delivers
the vdev_id that was previously set for that idev.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs --- tools/testing/selftests/iommu/iommufd.c | 11 ++++ tools/testing/selftests/iommu/iommufd_utils.h | 64 +++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index 6f1014cc208b4..11208f53fdce0 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -564,6 +564,8 @@ TEST_F(iommufd_ioas, viommu_default) uint32_t nested_hwpt_id = 0, hwpt_id = 0; uint32_t dev_id = self->device_id; uint32_t viommu_id = 0; + uint32_t virq_id; + uint32_t virq_fd; if (dev_id) { /* Negative test -- invalid hwpt */ @@ -595,16 +597,25 @@ TEST_F(iommufd_ioas, viommu_default) sizeof(data)); test_cmd_mock_domain_replace(self->stdev_id, nested_hwpt_id); + test_cmd_virq_alloc(viommu_id, IOMMU_VIRQ_TYPE_SELFTEST, + &virq_id, &virq_fd); + test_err_virq_alloc(EEXIST, viommu_id, IOMMU_VIRQ_TYPE_SELFTEST, + &virq_id, &virq_fd); + /* Set vdev_id to 0x99, unset it, and set to 0x88 */ test_cmd_viommu_set_vdev_id(viommu_id, dev_id, 0x99); + test_cmd_trigger_virq(dev_id, virq_fd, 0x99); test_err_viommu_set_vdev_id(EEXIST, viommu_id, dev_id, 0x99); test_err_viommu_unset_vdev_id(EINVAL, viommu_id, dev_id, 0x88); test_cmd_viommu_unset_vdev_id(viommu_id, dev_id, 0x99); test_cmd_viommu_set_vdev_id(viommu_id, dev_id, 0x88); + test_cmd_trigger_virq(dev_id, virq_fd, 0x88); + close(virq_fd); test_cmd_mock_domain_replace(self->stdev_id, hwpt_id); test_ioctl_destroy(nested_hwpt_id); test_cmd_mock_domain_replace(self->stdev_id, self->ioas_id); + test_ioctl_destroy(virq_id); test_ioctl_destroy(viommu_id); test_ioctl_destroy(hwpt_id); } else { diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 0a81827b903f7..9fec38f45e0ee 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "../kselftest_harness.h" #include "../../../../drivers/iommu/iommufd/iommufd_test.h" @@ -888,3 +889,66 @@ static int _test_cmd_viommu_unset_vdev_id(int fd, __u32 viommu_id, EXPECT_ERRNO(_errno, \ _test_cmd_viommu_unset_vdev_id(self->fd, viommu_id, \ idev_id, vdev_id)) + +static int _test_ioctl_virq_alloc(int fd, __u32 viommu_id, __u32 type, + __u32 *virq_id, __u32 *virq_fd) +{ + struct iommu_virq_alloc cmd = { + .size = sizeof(cmd), + .type = type, + .viommu_id = viommu_id, + }; + int ret; + + ret = ioctl(fd, IOMMU_VIRQ_ALLOC, &cmd); + if (ret) + return ret; + if (virq_id) + *virq_id = cmd.out_virq_id; + if (virq_fd) + *virq_fd = cmd.out_virq_fd; + return 0; +} + +#define test_cmd_virq_alloc(viommu_id, type, virq_id, virq_fd) \ + ASSERT_EQ(0, _test_ioctl_virq_alloc(self->fd, viommu_id, type, \ + virq_id, virq_fd)) +#define test_err_virq_alloc(_errno, viommu_id, type, virq_id, virq_fd) \ + EXPECT_ERRNO(_errno, \ + _test_ioctl_virq_alloc(self->fd, viommu_id, type, \ + virq_id, virq_fd)) + +static int _test_cmd_trigger_virq(int fd, __u32 dev_id, + __u32 event_fd, __u32 vdev_id) +{ + struct iommu_test_cmd trigger_virq_cmd = { + .size = sizeof(trigger_virq_cmd), + .op = IOMMU_TEST_OP_TRIGGER_VIRQ, + .trigger_virq = { + .dev_id = dev_id, + }, + }; + struct pollfd pollfd = { .fd = event_fd, .events = POLLIN }; + struct iommu_viommu_irq_selftest irq; + ssize_t bytes; + int ret; + + ret = ioctl(fd, _IOMMU_TEST_CMD(IOMMU_TEST_OP_TRIGGER_VIRQ), + &trigger_virq_cmd); + if (ret) + return ret; + + ret = poll(&pollfd, 1, 1000); + if (ret < 0) + return ret; + + bytes = 
read(event_fd, &irq, sizeof(irq)); + if (bytes <= 0) + return -EIO; + + return irq.vdev_id == vdev_id ? 0 : -EINVAL; +} + +#define test_cmd_trigger_virq(dev_id, event_fd, vdev_id) \ + ASSERT_EQ(0, _test_cmd_trigger_virq(self->fd, dev_id, \ + event_fd, vdev_id)) From 6f7f44162a77b8e5711fbf8a60903c7e00169697 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Tue, 13 Aug 2024 17:59:08 +0000 Subject: [PATCH 297/352] iommu/arm-smmu-v3: Report virtual IRQ for device in user space Aside from the IOPF framework, iommufd provides an additional pathway to report a hardware event or IRQ, via the VIRQ of VIOMMU infrastructure. Implement the set/unset_vdev_id viommu ops, to take control of vdev_id's lifecycle. Lock it properly so the threaded IRQ handler can read out the viommu pointer and the virtual SID, to call iommufd_viommu_report_irq(). Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 109 +++++++++++++++----- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 2 + include/uapi/linux/iommufd.h | 14 +++ 3 files changed, 97 insertions(+), 28 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index a5e44c274351f..8e07331f95eee 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -1815,6 +1815,7 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) { int ret = 0; u32 perm = 0; + struct iommu_domain *domain; struct arm_smmu_master *master; bool ssid_valid = evt[0] & EVTQ_0_SSV; u32 sid = FIELD_GET(EVTQ_0_SID, evt[0]); @@ -1835,41 +1836,59 @@ static int arm_smmu_handle_evt(struct arm_smmu_device *smmu, u64 *evt) if (evt[1] & EVTQ_1_S2) return -EFAULT; - if (!(evt[1] & EVTQ_1_STALL)) - return -EOPNOTSUPP; - - if (evt[1] & EVTQ_1_RnW) - perm |= IOMMU_FAULT_PERM_READ; - else - perm |= IOMMU_FAULT_PERM_WRITE; - - if (evt[1] & EVTQ_1_InD) - perm |= IOMMU_FAULT_PERM_EXEC; - - if (evt[1] & EVTQ_1_PnU) - perm |= IOMMU_FAULT_PERM_PRIV; - - flt->type = IOMMU_FAULT_PAGE_REQ; - flt->prm = (struct iommu_fault_page_request) { - .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, - .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]), - .perm = perm, - .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), - }; - - if (ssid_valid) { - flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; - flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); - } - mutex_lock(&smmu->streams_mutex); master = arm_smmu_find_master(smmu, sid); if (!master) { ret = -EINVAL; goto out_unlock; } + domain = iommu_get_domain_for_dev(master->dev); + + if (evt[1] & EVTQ_1_STALL) { + if (evt[1] & EVTQ_1_RnW) + perm |= IOMMU_FAULT_PERM_READ; + else + perm |= IOMMU_FAULT_PERM_WRITE; + + if (evt[1] & EVTQ_1_InD) + perm |= IOMMU_FAULT_PERM_EXEC; + + if (evt[1] & EVTQ_1_PnU) + perm |= IOMMU_FAULT_PERM_PRIV; + + flt->type = IOMMU_FAULT_PAGE_REQ; + flt->prm = (struct iommu_fault_page_request) { + .flags = IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE, + .grpid = FIELD_GET(EVTQ_1_STAG, evt[1]), + .perm = perm, + .addr = FIELD_GET(EVTQ_2_ADDR, evt[2]), + }; + + if (ssid_valid) { + flt->prm.flags |= IOMMU_FAULT_PAGE_REQUEST_PASID_VALID; + flt->prm.pasid = FIELD_GET(EVTQ_0_SSID, evt[0]); + } + + ret = iommu_report_device_fault(master->dev, &fault_evt); + } else if (domain && domain->type == IOMMU_DOMAIN_NESTED) { + mutex_lock(&master->lock); + if (master->vdev_id) { + struct iommu_virq_arm_smmuv3 virq_data = + *(struct iommu_virq_arm_smmuv3 
*)evt; - ret = iommu_report_device_fault(master->dev, &fault_evt); + virq_data.evt[0] &= ~EVTQ_0_SID; + virq_data.evt[0] |= + FIELD_PREP(EVTQ_0_SID, master->vdev_id->id); + + iommufd_viommu_report_irq(master->vdev_id->viommu, + IOMMU_VIRQ_TYPE_ARM_SMMUV3, + &virq_data, sizeof(virq_data)); + } + mutex_unlock(&master->lock); + } else { + /* Unhandled events should be pinned */ + ret = -EFAULT; + } out_unlock: mutex_unlock(&smmu->streams_mutex); return ret; @@ -3744,6 +3763,7 @@ static struct iommu_device *arm_smmu_probe_device(struct device *dev) master->dev = dev; master->smmu = smmu; + mutex_init(&master->lock); dev_iommu_priv_set(dev, master); ret = arm_smmu_insert_master(smmu, master); @@ -3796,6 +3816,7 @@ static void arm_smmu_release_device(struct device *dev) arm_smmu_remove_master(master); if (arm_smmu_cdtab_allocated(&master->cd_table)) arm_smmu_free_cd_tables(master); + mutex_destroy(&master->lock); kfree(master); } @@ -3931,6 +3952,36 @@ static int arm_smmu_def_domain_type(struct device *dev) return 0; } +static struct iommufd_vdev_id * +arm_smmu_viommu_set_vdev_id(struct iommufd_viommu *viommu, struct device *dev, + u64 id) +{ + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct iommufd_vdev_id *vdev_id; + + vdev_id = kzalloc(sizeof(*vdev_id), GFP_KERNEL); + if (!vdev_id) + return ERR_PTR(-ENOMEM); + + mutex_lock(&master->lock); + master->vdev_id = vdev_id; + mutex_unlock(&master->lock); + + return vdev_id; +} + +static void arm_smmu_viommu_unset_vdev_id(struct iommufd_vdev_id *vdev_id) +{ + struct device *dev = iommufd_vdev_id_to_dev(vdev_id); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + + mutex_lock(&master->lock); + master->vdev_id = NULL; + mutex_unlock(&master->lock); + + /* IOMMUFD core frees the memory of vdev_id */ +} + static int arm_smmu_viommu_cache_invalidate(struct iommufd_viommu *viommu, struct iommu_user_data_array *array) { @@ -3971,6 +4022,8 @@ static struct iommu_ops arm_smmu_ops = { .iova_to_phys = arm_smmu_iova_to_phys, .free = arm_smmu_domain_free_paging, .default_viommu_ops = &(const struct iommufd_viommu_ops) { + .set_vdev_id = arm_smmu_viommu_set_vdev_id, + .unset_vdev_id = arm_smmu_viommu_unset_vdev_id, .cache_invalidate = arm_smmu_viommu_cache_invalidate, } } diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 6b0c068184ba0..04535e39b78f4 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -793,6 +793,8 @@ struct arm_smmu_master { struct arm_smmu_device *smmu; struct device *dev; struct arm_smmu_stream *streams; + struct mutex lock; + struct iommufd_vdev_id *vdev_id; /* Locked by the iommu core using the group mutex */ struct arm_smmu_ctx_desc_cfg cd_table; unsigned int num_streams; diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index f9ec07efed8d6..1dc2c0b05af71 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -956,9 +956,23 @@ struct iommu_viommu_unset_vdev_id { /** * enum iommu_virq_type - Virtual IRQ Type * @IOMMU_VIRQ_TYPE_NONE: INVALID type + * @IOMMU_VIRQ_TYPE_ARM_SMMUV3: ARM SMMUv3 Virtual Event */ enum iommu_virq_type { IOMMU_VIRQ_TYPE_NONE = 0, + IOMMU_VIRQ_TYPE_ARM_SMMUV3 = 1, +}; + +/** + * struct iommu_virq_arm_smmuv3 - ARM SMMUv3 Virtual IRQ + * (IOMMU_VIRQ_TYPE_ARM_SMMUV3) + * @evt: 256-bit ARM SMMUv3 Event record, little-endian. + * + * StreamID field reports a virtual device ID. 
To receive a virtual IRQ for a
+ * device, it must set its virtual device ID via IOMMU_VIOMMU_SET_VDEV_ID.
+ */
+struct iommu_virq_arm_smmuv3 {
+	__aligned_u64 evt[4];
+};
+
 /**

From 91e4b61995ddf0e53d03ea3f48148adcc4f55088 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 13 Aug 2024 17:20:17 +0000
Subject: [PATCH 298/352] cover-letter: iommufd: Add VIOMMU infrastructure
 (Part-2 VIRQ)

As part 2 of the VIOMMU infrastructure, this series introduces a VIRQ
object after repurposing the existing FAULT object, which already provides
a nice notification pathway to the user space.

So, the first thing to do is reworking the FAULT object. Mimicking the
HWPT structures, add a common EVENT structure to support its derivatives:
EVENT_IOPF (the prior FAULT object) and EVENT_VIRQ (a new one).

IOMMUFD_CMD_VIRQ_ALLOC is introduced to allocate an EVENT_VIRQ for a
VIOMMU. One VIOMMU can have multiple VIRQs of different types, but cannot
have multiple VIRQs of the same type.

Drivers might need the VIOMMU's vdev_id list or the exact vdev_id link of
the passthrough device to forward IRQs/events via the VIOMMU framework.
Thus, extend the set/unset_vdev_id ioctls down to the driver using VIOMMU
ops. This allows drivers to take control of a vdev_id's lifecycle.

The forwarding part is fairly simple, but might need to replace a physical
device ID with a virtual device ID. So, this series also comes with some
helpers for drivers to use.

As usual, this series comes with selftest coverage for this new VIRQ, and
with a real-world use case in the ARM SMMUv3 driver.

This must be based on the VIOMMU Part-1 series. It's on Github:
https://github.com/nicolinc/iommufd/commits/iommufd_virq-v1
Pairing QEMU branch for testing:
https://github.com/nicolinc/qemu/commits/wip/for_iommufd_virq-v1

Thanks!
Nicolin

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs

From 3eae79c3476502b64e0d28cc2a421f02b296ea6a Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Fri, 2 Aug 2024 04:24:44 +0000
Subject: [PATCH 299/352] iommufd/device: Enforce reserved IOVA also when
 attached to hwpt_nested

Currently, device reserved regions are only enforced when the device is
attached to an hwpt_paging. In other words, if the device gets attached to
an hwpt_nested directly, the parent hwpt_paging of the hwpt_nested would
not enforce those reserved IOVAs. This works for most reserved region
types, but not for IOMMU_RESV_SW_MSI, which is a unique software-defined
window, required in the nesting case too, to set up an MSI doorbell on the
parent stage-2 hwpt/domain.

Kevin pointed out that:
1) no usage comes close to using up the entire IOVA space yet,
2) a guest may change the viommu mode to switch between nested and paging,
   so the VMM has to take all devices' reserved regions into consideration
   anyway when composing the GPA space.

Link: https://lore.kernel.org/all/BN9PR11MB5276497781C96415272E6FED8CB12@BN9PR11MB5276.namprd11.prod.outlook.com/

So it is actually convenient for us to also enforce the reserved IOVAs
onto the parent hwpt_paging when attaching a device to an hwpt_nested.

Repurpose the existing attach/replace_paging helpers to attach the
device's reserved IOVAs exclusively.

Add a new find_hwpt_paging helper, which is only used by these reserved
IOVA functions, to allow an IOMMUFD_OBJ_HWPT_NESTED hwpt to redirect to
its parent hwpt_paging. Return NULL in these two helpers for any new HWPT
type in the future.

Suggested-by: Tian, Kevin
Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R.
Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/device.c | 52 ++++++++++++------------- drivers/iommu/iommufd/iommufd_private.h | 19 +++++++++ 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/drivers/iommu/iommufd/device.c b/drivers/iommu/iommufd/device.c index dfe7a37053013..01bb5c9f415be 100644 --- a/drivers/iommu/iommufd/device.c +++ b/drivers/iommu/iommufd/device.c @@ -341,8 +341,9 @@ static int iommufd_group_setup_msi(struct iommufd_group *igroup, return 0; } -static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, - struct iommufd_device *idev) +static int +iommufd_device_attach_reserved_iova(struct iommufd_device *idev, + struct iommufd_hwpt_paging *hwpt_paging) { int rc; @@ -368,6 +369,7 @@ static int iommufd_hwpt_paging_attach(struct iommufd_hwpt_paging *hwpt_paging, int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, struct iommufd_device *idev) { + struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); int rc; mutex_lock(&idev->igroup->lock); @@ -377,8 +379,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, goto err_unlock; } - if (hwpt_is_paging(hwpt)) { - rc = iommufd_hwpt_paging_attach(to_hwpt_paging(hwpt), idev); + if (hwpt_paging) { + rc = iommufd_device_attach_reserved_iova(idev, hwpt_paging); if (rc) goto err_unlock; } @@ -401,9 +403,8 @@ int iommufd_hw_pagetable_attach(struct iommufd_hw_pagetable *hwpt, mutex_unlock(&idev->igroup->lock); return 0; err_unresv: - if (hwpt_is_paging(hwpt)) - iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, - idev->dev); + if (hwpt_paging) + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); err_unlock: mutex_unlock(&idev->igroup->lock); return rc; @@ -413,6 +414,7 @@ struct iommufd_hw_pagetable * iommufd_hw_pagetable_detach(struct iommufd_device *idev) { struct iommufd_hw_pagetable *hwpt = idev->igroup->hwpt; + struct iommufd_hwpt_paging *hwpt_paging = find_hwpt_paging(hwpt); mutex_lock(&idev->igroup->lock); list_del(&idev->group_item); @@ -420,9 +422,8 @@ iommufd_hw_pagetable_detach(struct iommufd_device *idev) iommufd_hwpt_detach_device(hwpt, idev); idev->igroup->hwpt = NULL; } - if (hwpt_is_paging(hwpt)) - iopt_remove_reserved_iova(&to_hwpt_paging(hwpt)->ioas->iopt, - idev->dev); + if (hwpt_paging) + iopt_remove_reserved_iova(&hwpt_paging->ioas->iopt, idev->dev); mutex_unlock(&idev->igroup->lock); /* Caller must destroy hwpt */ @@ -454,17 +455,17 @@ iommufd_group_remove_reserved_iova(struct iommufd_group *igroup, } static int -iommufd_group_do_replace_paging(struct iommufd_group *igroup, - struct iommufd_hwpt_paging *hwpt_paging) +iommufd_group_do_replace_reserved_iova(struct iommufd_group *igroup, + struct iommufd_hwpt_paging *hwpt_paging) { - struct iommufd_hw_pagetable *old_hwpt = igroup->hwpt; + struct iommufd_hwpt_paging *old_hwpt_paging; struct iommufd_device *cur; int rc; lockdep_assert_held(&igroup->lock); - if (!hwpt_is_paging(old_hwpt) || - hwpt_paging->ioas != to_hwpt_paging(old_hwpt)->ioas) { + old_hwpt_paging = find_hwpt_paging(igroup->hwpt); + if (!old_hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas) { list_for_each_entry(cur, &igroup->device_list, group_item) { rc = iopt_table_enforce_dev_resv_regions( &hwpt_paging->ioas->iopt, cur->dev, NULL); @@ -487,6 +488,8 @@ static struct iommufd_hw_pagetable * iommufd_device_do_replace(struct iommufd_device *idev, struct iommufd_hw_pagetable *hwpt) { + struct iommufd_hwpt_paging *hwpt_paging = 
find_hwpt_paging(hwpt); + struct iommufd_hwpt_paging *old_hwpt_paging; struct iommufd_group *igroup = idev->igroup; struct iommufd_hw_pagetable *old_hwpt; unsigned int num_devices; @@ -505,9 +508,8 @@ iommufd_device_do_replace(struct iommufd_device *idev, } old_hwpt = igroup->hwpt; - if (hwpt_is_paging(hwpt)) { - rc = iommufd_group_do_replace_paging(igroup, - to_hwpt_paging(hwpt)); + if (hwpt_paging) { + rc = iommufd_group_do_replace_reserved_iova(igroup, hwpt_paging); if (rc) goto err_unlock; } @@ -516,11 +518,10 @@ iommufd_device_do_replace(struct iommufd_device *idev, if (rc) goto err_unresv; - if (hwpt_is_paging(old_hwpt) && - (!hwpt_is_paging(hwpt) || - to_hwpt_paging(hwpt)->ioas != to_hwpt_paging(old_hwpt)->ioas)) - iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(old_hwpt)); + old_hwpt_paging = find_hwpt_paging(old_hwpt); + if (old_hwpt_paging && + (!hwpt_paging || hwpt_paging->ioas != old_hwpt_paging->ioas)) + iommufd_group_remove_reserved_iova(igroup, old_hwpt_paging); igroup->hwpt = hwpt; @@ -538,9 +539,8 @@ iommufd_device_do_replace(struct iommufd_device *idev, /* Caller must destroy old_hwpt */ return old_hwpt; err_unresv: - if (hwpt_is_paging(hwpt)) - iommufd_group_remove_reserved_iova(igroup, - to_hwpt_paging(hwpt)); + if (hwpt_paging) + iommufd_group_remove_reserved_iova(igroup, hwpt_paging); err_unlock: mutex_unlock(&idev->igroup->lock); return ERR_PTR(rc); diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 299ee05054f0c..4cb1555991b89 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -330,6 +330,25 @@ to_hwpt_paging(struct iommufd_hw_pagetable *hwpt) return container_of(hwpt, struct iommufd_hwpt_paging, common); } +static inline struct iommufd_hwpt_nested * +to_hwpt_nested(struct iommufd_hw_pagetable *hwpt) +{ + return container_of(hwpt, struct iommufd_hwpt_nested, common); +} + +static inline struct iommufd_hwpt_paging * +find_hwpt_paging(struct iommufd_hw_pagetable *hwpt) +{ + switch (hwpt->obj.type) { + case IOMMUFD_OBJ_HWPT_PAGING: + return to_hwpt_paging(hwpt); + case IOMMUFD_OBJ_HWPT_NESTED: + return to_hwpt_nested(hwpt)->parent; + default: + return NULL; + } +} + static inline struct iommufd_hwpt_paging * iommufd_get_hwpt_paging(struct iommufd_ucmd *ucmd, u32 id) { From d096dab86181104f926164b515609441683dca42 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 16 Mar 2023 11:42:16 +0000 Subject: [PATCH 300/352] iommu/dma: Support MSIs through nested domains Currently, iommu-dma is the only place outside of IOMMUFD and drivers which might need to be aware of the stage 2 domain encapsulated within a nested domain. This would be still the RMR solution where we're using host-managed MSIs with an identity mapping at stage 1, where it is the underlying stage 2 domain which owns an MSI cookie and holds the corresponding dynamic mappings. Hook up the new op to resolve what we need from a nested domain. Signed-off-by: Robin Murphy Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/dma-iommu.c | 18 ++++++++++++++++-- include/linux/iommu.h | 4 ++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 19537aef13ffd..d1f64ce5b1b6c 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1810,6 +1810,20 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, return NULL; } +/* + * Nested domains may not have an MSI cookie or accept mappings, but they may + * be related to a domain which does, so we let them tell us what they need. + */ +static struct iommu_domain *iommu_dma_get_msi_mapping_domain(struct device *dev) +{ + struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + + if (domain && domain->type == IOMMU_DOMAIN_NESTED && + domain->ops && domain->ops->get_msi_mapping_domain) + domain = domain->ops->get_msi_mapping_domain(domain); + return domain; +} + /** * iommu_dma_prepare_msi() - Map the MSI page in the IOMMU domain * @desc: MSI descriptor, will store the MSI page @@ -1820,7 +1834,7 @@ static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev, int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) { struct device *dev = msi_desc_to_dev(desc); - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev); struct iommu_dma_msi_page *msi_page; static DEFINE_MUTEX(msi_prepare_lock); /* see below */ @@ -1853,7 +1867,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) { struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + const struct iommu_domain *domain = iommu_dma_get_msi_mapping_domain(dev); const struct iommu_dma_msi_page *msi_page; msi_page = msi_desc_get_iommu_cookie(desc); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index eabab0830be1c..02e5a543c79a6 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -690,6 +690,8 @@ struct iommu_ops { * specific mechanisms. * @set_pgtable_quirks: Set io page table quirks (IO_PGTABLE_QUIRK_*) * @free: Release the domain after use. + * @get_msi_mapping_domain: Return the related iommu_domain that should hold the + * MSI cookie and accept mapping(s). */ struct iommu_domain_ops { int (*attach_dev)(struct iommu_domain *domain, struct device *dev); @@ -721,6 +723,8 @@ struct iommu_domain_ops { unsigned long quirks); void (*free)(struct iommu_domain *domain); + struct iommu_domain * + (*get_msi_mapping_domain)(struct iommu_domain *domain); }; /** From e2bbd702bdce5169e09a474611c5b1d3ed279f22 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 7 Jul 2022 00:15:58 -0700 Subject: [PATCH 301/352] iommu/arm-smmu-v3: Implement arm_smmu_get_msi_mapping_domain In a 1-stage translation setup, a device is attached to a paging domain. In a 2-stage translation setup, a device is attached to a nested domain, which does not have the mappings for the MSI page but only an s2_parent paging domain pointer that holds the mappings. Add arm_smmu_get_msi_mapping_domain in arm_smmu_nested_ops to return the correct paging domain. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 8e07331f95eee..28a4de6892ac8 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -3249,6 +3249,15 @@ static struct iommu_domain arm_smmu_blocked_domain = { .ops = &arm_smmu_blocked_ops, }; +static struct iommu_domain * +arm_smmu_get_msi_mapping_domain(struct iommu_domain *domain) +{ + struct arm_smmu_nested_domain *nested_domain = + container_of(domain, struct arm_smmu_nested_domain, domain); + + return &nested_domain->s2_parent->domain; +} + static int arm_smmu_attach_dev_nested(struct iommu_domain *domain, struct device *dev) { @@ -3461,6 +3470,7 @@ static int arm_smmu_cache_invalidate_user(struct iommu_domain *domain, } static const struct iommu_domain_ops arm_smmu_nested_ops = { + .get_msi_mapping_domain = arm_smmu_get_msi_mapping_domain, .attach_dev = arm_smmu_attach_dev_nested, .free = arm_smmu_domain_nested_free, .cache_invalidate_user = arm_smmu_cache_invalidate_user, From da8338fefef3a4c2e9ef3e9258ad17cc470da5a5 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 22 Aug 2024 06:02:34 +0000 Subject: [PATCH 302/352] cover-letter: Apply RMR solution for MSI mappings An alternative solution is under development but not finished yet. Till that, use the RMR solution to test devices using MSIs. Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs From 1289a582c343784d98b6cb8e56f87a9cb9e2845a Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Wed, 15 Nov 2023 10:27:43 +0000 Subject: [PATCH 303/352] WAR: iommufd/pages: Bypass PFNMAP This is used for GPU memory mapping. The solution is a WAR while waiting for the upstream solution that would use dmabuf to map the entire range in a single sequence. Related topics: https://lore.kernel.org/kvm/20240624065552.1572580-1-vivek.kasireddy@intel.com/ https://lore.kernel.org/kvm/cover.1719909395.git.leon@kernel.org/ Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/pages.c | 87 ++++++++++++++++++++++++++++++++--- 1 file changed, 80 insertions(+), 7 deletions(-) diff --git a/drivers/iommu/iommufd/pages.c b/drivers/iommu/iommufd/pages.c index 93d806c9c0731..904432b811e8e 100644 --- a/drivers/iommu/iommufd/pages.c +++ b/drivers/iommu/iommufd/pages.c @@ -638,9 +638,10 @@ static void batch_unpin(struct pfn_batch *batch, struct iopt_pages *pages, size_t to_unpin = min_t(size_t, npages, batch->npfns[cur] - first_page_off); - unpin_user_page_range_dirty_lock( - pfn_to_page(batch->pfns[cur] + first_page_off), - to_unpin, pages->writable); + if (pfn_valid(batch->pfns[cur] + first_page_off)) + unpin_user_page_range_dirty_lock( + pfn_to_page(batch->pfns[cur] + first_page_off), + to_unpin, pages->writable); iopt_pages_sub_npinned(pages, to_unpin); cur++; first_page_off = 0; @@ -733,6 +734,42 @@ static void pfn_reader_user_destroy(struct pfn_reader_user *user, user->upages = NULL; } +static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm, + unsigned long vaddr, unsigned long *pfn, + bool write_fault) +{ + pte_t *ptep; + spinlock_t *ptl; + int ret; + + ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + if (ret) { + bool unlocked = false; + + ret = fixup_user_fault(mm, vaddr, + FAULT_FLAG_REMOTE | + (write_fault ? 
FAULT_FLAG_WRITE : 0), + &unlocked); + if (unlocked) + return -EAGAIN; + + if (ret) + return ret; + + ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl); + if (ret) + return ret; + } + + if (write_fault && !pte_write(*ptep)) + ret = -EFAULT; + else + *pfn = pte_pfn(*ptep); + + pte_unmap_unlock(ptep, ptl); + return ret; +} + static int pfn_reader_user_pin(struct pfn_reader_user *user, struct iopt_pages *pages, unsigned long start_index, @@ -789,6 +826,42 @@ static int pfn_reader_user_pin(struct pfn_reader_user *user, user->gup_flags, user->upages, &user->locked); } + + if (rc < 0) { + struct vm_area_struct *vma; + unsigned long vaddr; + unsigned long pfn; + int pinned = 0; + + /* fast path above doesn't hold the lock */ + if (!user->locked) + mmap_read_lock(pages->source_mm); + vaddr = untagged_addr_remote(pages->source_mm, uptr); +retry: + vma = vma_lookup(pages->source_mm, vaddr); + if (vma && vma->vm_flags & VM_PFNMAP) { + do { + rc = follow_fault_pfn(vma, pages->source_mm, vaddr, + &pfn, pages->writable); + if (rc == -EAGAIN) + goto retry; + if (!rc) { + if (!pfn_valid(pfn)) { + user->upages[pinned] = pfn_to_page(pfn); + pinned += 1; + vaddr += PAGE_SIZE; + } else { + rc = -EFAULT; + } + } + } while (pinned < npages && vaddr < vma->vm_end && !rc); + } + if (pinned) + rc = pinned; + if (!user->locked) + mmap_read_unlock(pages->source_mm); + } + if (rc <= 0) { if (WARN_ON(!rc)) return -EFAULT; @@ -1096,10 +1169,10 @@ static void pfn_reader_release_pins(struct pfn_reader *pfns) if (pfns->user.upages_end > pfns->batch_end_index) { size_t npages = pfns->user.upages_end - pfns->batch_end_index; - /* Any pages not transferred to the batch are just unpinned */ - unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - - pfns->user.upages_start), - npages); + if (pfn_valid(page_to_pfn(pfns->user.upages[0]))) + /* Any pages not transferred to the batch are just unpinned */ + unpin_user_pages(pfns->user.upages + (pfns->batch_end_index - + pfns->user.upages_start), npages); iopt_pages_sub_npinned(pages, npages); pfns->user.upages_end = pfns->batch_end_index; } From b50768a4d39e029fa912fca089b24d06184b451a Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Wed, 15 Nov 2023 10:28:05 +0000 Subject: [PATCH 304/352] WAR: vfio/pci: Report PASID capability The upstream solution is under discussion: https://lore.kernel.org/kvm/20240412082121.33382-1-yi.l.liu@intel.com/ Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/vfio/pci/vfio_pci_config.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 97422aafaa7b5..971e32bc0bb4e 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -95,7 +95,7 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { [PCI_EXT_CAP_ID_LTR] = PCI_EXT_CAP_LTR_SIZEOF, [PCI_EXT_CAP_ID_SECPCI] = 0, /* not yet */ [PCI_EXT_CAP_ID_PMUX] = 0, /* not yet */ - [PCI_EXT_CAP_ID_PASID] = 0, /* not yet */ + [PCI_EXT_CAP_ID_PASID] = PCI_EXT_CAP_PASID_SIZEOF, /* not yet */ [PCI_EXT_CAP_ID_DVSEC] = 0xFF, }; From 873c71f157f590a99313a5fd2e124ae97da4157b Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Wed, 6 Dec 2023 04:32:07 +0000 Subject: [PATCH 305/352] mm: handle poisoning of pfn without struct pages The kernel MM currently does not handle ECC errors / poison on a memory region that is not backed by struct pages. 
If a memory region is mapped using remap_pfn_range(), but not added to the
kernel, MM will not have associated struct pages. Add a new mechanism to
handle memory failure on such memory.

Make the kernel MM expose a function to allow modules managing the device
memory to register a failure function and the physical address space
associated with the device memory. MM maintains this information as an
interval tree. The registered memory failure function is used by MM to
notify the kernel module managing the PFN, so that the module may take any
required action. The module, for example, may use the information to track
the poisoned pages.

In this implementation, the kernel MM follows a sequence (mostly) similar
to the memory_failure() handler for struct-page-backed memory:
1. memory_failure() is triggered on reception of a poison error. An
   absence of struct page is detected and consequently memory_failure_pfn()
   is executed.
2. memory_failure_pfn() calls the newly introduced failure handler exposed
   by the module managing the poisoned memory to notify it of the
   problematic PFN.
3. memory_failure_pfn() unmaps the stage-2 mapping to the PFN.
4. memory_failure_pfn() collects the processes mapped to the PFN.
5. memory_failure_pfn() sends SIGBUS (BUS_MCEERR_AO) to all the processes
   mapping the faulty PFN using kill_procs().
6. A later access to the faulty PFN by an operation in the VM is trapped
   and user_mem_abort() is called.
7. The vma ops fault function gets called due to the absence of a stage-2
   mapping. It is expected to return VM_FAULT_HWPOISON on the PFN.
8. __gfn_to_pfn_memslot() then returns KVM_PFN_ERR_HWPOISON, which causes
   the poison to be reported with SIGBUS (BUS_MCEERR_AR) sent to the QEMU
   process through kvm_send_hwpoison_signal().

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
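For instance, a module that maps struct-page-less device memory to user
space could hook into this mechanism roughly as follows. This is an
illustrative sketch only: the example_* names are hypothetical, though the
struct fields and the registration call match the API added by this patch
(the nvgrace-gpu patch later in this series implements the real version):

	#include <linux/memory-failure.h>
	#include <linux/mm.h>

	struct example_dev {
		struct pfn_address_space pfn_space;
	};

	static void example_pfn_failure(struct pfn_address_space *pfn_space,
					unsigned long pfn)
	{
		/*
		 * pfn lies inside the registered range; record it so that a
		 * later fault on this offset can return VM_FAULT_HWPOISON.
		 */
	}

	static const struct pfn_address_space_ops example_pas_ops = {
		.failure = example_pfn_failure,
	};

	/* Called from the driver's mmap path, once remap_pfn_range() succeeded */
	static int example_register_poison_handling(struct example_dev *edev,
						    struct vm_area_struct *vma,
						    unsigned long nr_pages)
	{
		edev->pfn_space.node.start = vma->vm_pgoff;
		edev->pfn_space.node.last = vma->vm_pgoff + nr_pages - 1;
		edev->pfn_space.ops = &example_pas_ops;
		edev->pfn_space.mapping = vma->vm_file->f_mapping;

		return register_pfn_address_space(&edev->pfn_space);
	}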
---
 include/linux/memory-failure.h | 22 +++
 include/linux/mm.h | 1 +
 include/ras/ras_event.h | 1 +
 mm/Kconfig | 1 +
 mm/memory-failure.c | 147 +++++++++++++++++++++++------
 5 files changed, 147 insertions(+), 25 deletions(-)
 create mode 100644 include/linux/memory-failure.h

diff --git a/include/linux/memory-failure.h b/include/linux/memory-failure.h
new file mode 100644
index 0000000000000..9a579960972aa
--- /dev/null
+++ b/include/linux/memory-failure.h
@@ -0,0 +1,22 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_MEMORY_FAILURE_H
+#define _LINUX_MEMORY_FAILURE_H
+
+#include <linux/interval_tree.h>
+
+struct pfn_address_space;
+
+struct pfn_address_space_ops {
+	void (*failure)(struct pfn_address_space *pfn_space, unsigned long pfn);
+};
+
+struct pfn_address_space {
+	struct interval_tree_node node;
+	const struct pfn_address_space_ops *ops;
+	struct address_space *mapping;
+};
+
+int register_pfn_address_space(struct pfn_address_space *pfn_space);
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space);
+
+#endif /* _LINUX_MEMORY_FAILURE_H */
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 28fca5d2d9193..bddda315ef6f6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4061,6 +4061,7 @@ enum mf_action_page_type {
 	MF_MSG_BUDDY,
 	MF_MSG_DAX,
 	MF_MSG_UNSPLIT_THP,
+	MF_MSG_PFN_MAP,
 	MF_MSG_UNKNOWN,
 };
diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h
index cbd3ddd7c33d4..05c3e6f6bd020 100644
--- a/include/ras/ras_event.h
+++ b/include/ras/ras_event.h
@@ -373,6 +373,7 @@ TRACE_EVENT(aer_event,
 	EM ( MF_MSG_BUDDY, "free buddy page" )	\
 	EM ( MF_MSG_DAX, "dax page" )	\
 	EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" )	\
+	EM ( MF_MSG_PFN_MAP, "non struct page pfn" )	\
 	EMe ( MF_MSG_UNKNOWN, "unknown page" )

 /*
diff --git a/mm/Kconfig b/mm/Kconfig
index 4914eb3216fcc..873d84ccf1dce 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -749,6 +749,7 @@ config MEMORY_FAILURE
 	depends on ARCH_SUPPORTS_MEMORY_FAILURE
 	bool "Enable recovery from hardware memory errors"
 	select MEMORY_ISOLATION
+	select INTERVAL_TREE
 	select RAS
 	help
 	  Enables code to recover from some memory failures on systems
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index e6c957f5128eb..3d79d283cf263 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -38,6 +38,7 @@
 #include
 #include
+#include <linux/memory-failure.h>
 #include
 #include
 #include
@@ -60,6 +61,7 @@
 #include
 #include
 #include
+#include
 #include "swap.h"
 #include "internal.h"
 #include "ras/ras_event.h"
@@ -144,6 +146,10 @@ static struct ctl_table memory_failure_table[] = {
 	{ }
 };
 
+static struct rb_root_cached pfn_space_itree = RB_ROOT_CACHED;
+
+static DEFINE_MUTEX(pfn_space_lock);
+
 /*
  * Return values:
  *   1: the page is dissolved (if needed) and taken off from buddy,
@@ -440,15 +446,16 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
  * Schedule a process for later kill.
  * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
  *
- * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
- * filesystem with a memory failure handler has claimed the
- * memory_failure event. In all other cases, page->index and
- * page->mapping are sufficient for mapping the page back to its
- * corresponding user virtual address.
+ * Note: @pgoff is used when:
+ * a. @p is a fsdax page and a filesystem with a memory failure handler
+ *    has claimed the memory_failure event.
+ * b. pgoff is not backed by struct page.
+ * In all other cases, page->index and page->mapping are sufficient
+ * for mapping the page back to its corresponding user virtual address.
 */
 static void __add_to_kill(struct task_struct *tsk, struct page *p,
			   struct vm_area_struct *vma, struct list_head *to_kill,
-			   unsigned long ksm_addr, pgoff_t fsdax_pgoff)
+			   unsigned long ksm_addr, pgoff_t pgoff)
 {
 	struct to_kill *tk;
@@ -458,13 +465,20 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
 		return;
 	}
 
-	tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
-	if (is_zone_device_page(p)) {
-		if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
-			tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
-		tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
-	} else
-		tk->size_shift = page_shift(compound_head(p));
+	/* Check for pgoff not backed by struct page */
+	if (!pfn_valid(pgoff) && (vma->vm_flags & VM_PFNMAP)) {
+		tk->addr = vma_pgoff_address(pgoff, 1, vma);
+		tk->size_shift = PAGE_SHIFT;
+	} else {
+		tk->addr = ksm_addr ? ksm_addr : page_address_in_vma(p, vma);
+		if (is_zone_device_page(p)) {
+			if (pgoff != FSDAX_INVALID_PGOFF)
+				tk->addr = vma_pgoff_address(pgoff, 1, vma);
+			tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
+		} else {
+			tk->size_shift = page_shift(compound_head(p));
+		}
+	}
 
 	/*
 	 * Send SIGKILL if "tk->addr == -EFAULT". Also, as
@@ -477,8 +491,8 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p,
 	 * has a mapping for the page.
 	 */
 	if (tk->addr == -EFAULT) {
-		pr_info("Unable to find user space address %lx in %s\n",
-			page_to_pfn(p), tsk->comm);
+		pr_info("Unable to find address %lx in %s\n",
+			pfn_valid(pgoff) ? page_to_pfn(p) : pgoff, tsk->comm);
 	} else if (tk->size_shift == 0) {
 		kfree(tk);
 		return;
@@ -683,8 +697,7 @@ static void collect_procs_file(struct folio *folio, struct page *page,
 	i_mmap_unlock_read(mapping);
 }
 
-#ifdef CONFIG_FS_DAX
-static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
+static void add_to_kill_pgoff(struct task_struct *tsk, struct page *p,
			       struct vm_area_struct *vma,
			       struct list_head *to_kill, pgoff_t pgoff)
 {
@@ -692,11 +705,12 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p,
 }
 
 /*
- * Collect processes when the error hit a fsdax page.
+ * Collect processes when the error hit a fsdax page or a PFN not backed by
+ * struct page.
 */
-static void collect_procs_fsdax(struct page *page,
-		struct address_space *mapping, pgoff_t pgoff,
-		struct list_head *to_kill, bool pre_remove)
+static void collect_procs_pgoff(struct page *page,
+		struct address_space *mapping, pgoff_t pgoff,
+		struct list_head *to_kill, bool pre_remove)
 {
 	struct vm_area_struct *vma;
 	struct task_struct *tsk;
@@ -717,13 +731,12 @@ static void collect_procs_fsdax(struct page *page,
 			continue;
 		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 			if (vma->vm_mm == t->mm)
-				add_to_kill_fsdax(t, page, vma, to_kill, pgoff);
+				add_to_kill_pgoff(t, page, vma, to_kill, pgoff);
 		}
 	}
 	rcu_read_unlock();
 	i_mmap_unlock_read(mapping);
 }
-#endif /* CONFIG_FS_DAX */
 
 /*
 * Collect the processes who have the corrupted page mapped to kill.
@@ -917,6 +930,7 @@ static const char * const action_page_types[] = {
 	[MF_MSG_BUDDY] = "free buddy page",
 	[MF_MSG_DAX] = "dax page",
 	[MF_MSG_UNSPLIT_THP] = "unsplit thp",
+	[MF_MSG_PFN_MAP] = "non struct page pfn",
 	[MF_MSG_UNKNOWN] = "unknown page",
 };
 
@@ -1348,7 +1362,8 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
 
 	num_poisoned_pages_inc(pfn);
 
-	update_per_node_mf_stats(pfn, result);
+	if (type != MF_MSG_PFN_MAP)
+		update_per_node_mf_stats(pfn, result);
 
 	pr_err("%#lx: recovery action for %s: %s\n",
 	       pfn, action_page_types[type], action_name[result]);
@@ -1841,7 +1856,7 @@ int mf_dax_kill_procs(struct address_space *mapping, pgoff_t index,
 	 * The pre_remove case is revoking access, the memory is still
 	 * good and could theoretically be put back into service.
 	 */
-	collect_procs_fsdax(page, mapping, index, &to_kill, pre_remove);
+	collect_procs_pgoff(page, mapping, index, &to_kill, pre_remove);
 	unmap_and_kill(&to_kill, page_to_pfn(page), mapping,
 		       index, mf_flags);
 unlock:
@@ -2180,6 +2195,83 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
 	return rc;
 }
 
+int register_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	if (!pfn_space)
+		return -EINVAL;
+
+	if (!request_mem_region(pfn_space->node.start << PAGE_SHIFT,
+			(pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT, ""))
+		return -EBUSY;
+
+	mutex_lock(&pfn_space_lock);
+	interval_tree_insert(&pfn_space->node, &pfn_space_itree);
+	mutex_unlock(&pfn_space_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(register_pfn_address_space);
+
+void unregister_pfn_address_space(struct pfn_address_space *pfn_space)
+{
+	if (!pfn_space)
+		return;
+
+	mutex_lock(&pfn_space_lock);
+	interval_tree_remove(&pfn_space->node, &pfn_space_itree);
+	mutex_unlock(&pfn_space_lock);
+	release_mem_region(pfn_space->node.start << PAGE_SHIFT,
+		(pfn_space->node.last - pfn_space->node.start + 1) << PAGE_SHIFT);
+}
+EXPORT_SYMBOL_GPL(unregister_pfn_address_space);
+
+static int memory_failure_pfn(unsigned long pfn, int flags)
+{
+	struct interval_tree_node *node;
+	int res = MF_FAILED;
+	LIST_HEAD(tokill);
+
+	mutex_lock(&pfn_space_lock);
+	/*
+	 * Modules register with MM the address spaces mapping the device
+	 * memory they manage. Iterate to identify exactly which address space
+	 * has mapped to this failing PFN.
+	 */
+	for (node = interval_tree_iter_first(&pfn_space_itree, pfn, pfn); node;
+	     node = interval_tree_iter_next(node, pfn, pfn)) {
+		struct pfn_address_space *pfn_space =
+			container_of(node, struct pfn_address_space, node);
+		/*
+		 * Modules managing the device memory need to be notified of
+		 * the memory failure so that the poisoned PFN can be tracked.
+		 */
+		if (pfn_space->ops)
+			pfn_space->ops->failure(pfn_space, pfn);
+
+		collect_procs_pgoff(NULL, pfn_space->mapping, pfn, &tokill, false);
+
+		unmap_mapping_range(pfn_space->mapping, pfn << PAGE_SHIFT,
+				    PAGE_SIZE, 0);
+
+		res = MF_RECOVERED;
+	}
+	mutex_unlock(&pfn_space_lock);
+
+	if (res == MF_FAILED)
+		return action_result(pfn, MF_MSG_PFN_MAP, res);
+
+	/*
+	 * Unlike System-RAM there is no possibility to swap in a different
+	 * physical page at a given virtual address, so all userspace
+	 * consumption of direct PFN memory necessitates SIGBUS (i.e.
+	 * MF_MUST_KILL)
+	 */
+	flags |= MF_ACTION_REQUIRED | MF_MUST_KILL;
+	kill_procs(&tokill, true, false, pfn, flags);
+
+	return action_result(pfn, MF_MSG_PFN_MAP, MF_RECOVERED);
+}
+
 /**
 * memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page @@ -2219,6 +2311,11 @@ int memory_failure(unsigned long pfn, int flags) if (!(flags & MF_SW_SIMULATED)) hw_memory_failure = true; + if (!pfn_valid(pfn) && !arch_is_platform_page(PFN_PHYS(pfn))) { + res = memory_failure_pfn(pfn, flags); + goto unlock_mutex; + } + p = pfn_to_online_page(pfn); if (!p) { res = arch_memory_failure(pfn, flags); From f264b2a14ee4f89aa3e04c257517460bc9555844 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Wed, 6 Dec 2023 04:32:33 +0000 Subject: [PATCH 306/352] mm: Add poison error check in fixup_user_fault() for mapped pfn The fixup_user_fault() currently does not expect a VM_FAULT_HWPOISON and hence does not check for it while calling vm_fault_to_errno(). Since we now have a new code path which can trigger such case, change fixup_user_fault to look for VM_FAULT_HWPOISON. Also make hva_to_pfn_remapped check for -EHWPOISON and communicate the poison fault up to the user_mem_abort(). Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- mm/gup.c | 2 +- virt/kvm/kvm_main.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/gup.c b/mm/gup.c index f6d55635742f5..9f1f2a8eb6bd3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1432,7 +1432,7 @@ int fixup_user_fault(struct mm_struct *mm, } if (ret & VM_FAULT_ERROR) { - int err = vm_fault_to_errno(ret, 0); + int err = vm_fault_to_errno(ret, FOLL_HWPOISON); if (err) return err; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6a56de7ff82e7..dd34af6500eb8 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3016,6 +3016,12 @@ kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible, r = hva_to_pfn_remapped(vma, addr, write_fault, writable, &pfn); if (r == -EAGAIN) goto retry; + + if (r == -EHWPOISON) { + pfn = KVM_PFN_ERR_HWPOISON; + goto exit; + } + if (r < 0) pfn = KVM_PFN_ERR_FAULT; } else { From 0eaffdf3c35fa4efdedf3c7ea83653838f8488b0 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Wed, 6 Dec 2023 04:33:27 +0000 Subject: [PATCH 307/352] mm: Change ghes code to allow poison of non-struct pfn The GHES code allows calling of memory_failure() on the PFNs that pass the pfn_valid() check. This contract is broken for the remapped PFNs which fails the check and ghes_do_memory_failure() returns without triggering memory_failure(). Update code to allow memory_failure() call on PFNs failing pfn_valid(). Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs
---
 drivers/acpi/apei/ghes.c | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index ab2a82cb1b0b4..66e037af9592c 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -485,20 +485,10 @@ static void ghes_kick_task_work(struct callback_head *head)
 
 static bool ghes_do_memory_failure(u64 physical_addr, int flags)
 {
-	unsigned long pfn;
-
 	if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
 		return false;
 
-	pfn = PHYS_PFN(physical_addr);
-	if (!pfn_valid(pfn) && !arch_is_platform_page(physical_addr)) {
-		pr_warn_ratelimited(FW_WARN GHES_PFX
-		"Invalid address in generic error data: %#llx\n",
-		physical_addr);
-		return false;
-	}
-
-	memory_failure_queue(pfn, flags);
+	memory_failure_queue(PHYS_PFN(physical_addr), flags);
 	return true;
 }

From ac4ba54693290ebdb04ebcecfa816925fe892795 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Sun, 25 Feb 2024 12:35:06 +0000
Subject: [PATCH 308/352] vfio/nvgrace-gpu: register device memory for poison handling

The nvgrace-gpu-vfio-pci module [1] maps the device memory to the user VA
(Qemu) using remap_pfn_range() without adding the memory to the kernel.
The device memory pages are not backed by struct page.

Patches 1-3 implement the mechanism to handle ECC/poison on memory pages
without struct page and expose a registration function. This new
mechanism is leveraged here.

The module registers its memory region with the kernel MM for ECC
handling using the register_pfn_address_space() registration API exposed
by the kernel. It also defines a failure callback function
pfn_memory_failure() to get the poisoned PFN from the MM.

The module tracks poisoned PFNs using a hashtable. The PFN is
communicated by the kernel MM to the module through the failure function,
which pushes the appropriate memory offset into the hashtable.

The module also defines VMA fault ops; the fault handler returns
VM_FAULT_HWPOISON when the memory offset is found in the hashtable.

[1] https://lore.kernel.org/all/20231114081611.30550-1-ankita@nvidia.com/

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 147 +++++++++++++++++++++++++-
 drivers/vfio/vfio_main.c            |   3 +-
 2 files changed, 148 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index a7fd018aa5483..0853c66989948 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -5,6 +5,18 @@
 #include
 #include
+#include
+
+#ifdef CONFIG_MEMORY_FAILURE
+#include
+#include
+#include
+#endif
+
+struct h_node {
+	unsigned long mem_offset;
+	struct hlist_node node;
+};
 
 /*
  * The device memory usable to the workloads running in the VM is cached
@@ -36,6 +48,10 @@ struct mem_region {
 		void *memaddr;
 		void __iomem *ioaddr;
 	}; /* Base virtual address of the region */
+#ifdef CONFIG_MEMORY_FAILURE
+	struct pfn_address_space pfn_address_space;
+	DECLARE_HASHTABLE(htbl, 8);
+#endif
 };
 
 struct nvgrace_gpu_pci_core_device {
@@ -48,6 +64,97 @@ struct nvgrace_gpu_pci_core_device {
 	struct mutex remap_lock;
 };
 
+#ifdef CONFIG_MEMORY_FAILURE
+static void
+nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space,
+					unsigned long pfn)
+{
+	struct mem_region *region = container_of(pfn_space,
+			struct mem_region, pfn_address_space);
+	unsigned long mem_offset = pfn - pfn_space->node.start;
+	struct h_node *ecc;
+
+	if (mem_offset >= region->memlength)
+		return;
+
+	/*
+	 * MM has called to report a poisoned page. Track it in the hashtable.
+	 */
+	ecc = vzalloc(sizeof(struct h_node));
+	if (!ecc)
+		return;
+	ecc->mem_offset = mem_offset;
+	hash_add(region->htbl, &ecc->node, ecc->mem_offset);
+}
+
+struct pfn_address_space_ops nvgrace_gpu_vfio_pci_pas_ops = {
+	.failure = nvgrace_gpu_vfio_pci_pfn_memory_failure,
+};
+
+static int
+nvgrace_gpu_vfio_pci_register_pfn_range(struct mem_region *region,
+					struct vm_area_struct *vma)
+{
+	unsigned long nr_pages;
+	int ret = 0;
+
+	nr_pages = region->memlength >> PAGE_SHIFT;
+
+	region->pfn_address_space.node.start = vma->vm_pgoff;
+	region->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1;
+	region->pfn_address_space.ops = &nvgrace_gpu_vfio_pci_pas_ops;
+	region->pfn_address_space.mapping = vma->vm_file->f_mapping;
+
+	ret = register_pfn_address_space(&region->pfn_address_space);
+
+	return ret;
+}
+
+extern struct vfio_device *vfio_device_from_file(struct file *file);
+
+static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
+{
+	unsigned long mem_offset = vmf->pgoff - vmf->vma->vm_pgoff;
+	struct vfio_device *core_vdev;
+	struct nvgrace_gpu_pci_core_device *nvdev;
+	struct h_node *cur;
+
+	if (!(vmf->vma->vm_file))
+		goto error_exit;
+
+	core_vdev = vfio_device_from_file(vmf->vma->vm_file);
+
+	if (!core_vdev)
+		goto error_exit;
+
+	nvdev = container_of(core_vdev,
+			struct nvgrace_gpu_pci_core_device,
+			core_device.vdev);
+
+	/*
+	 * Check if the page is poisoned.
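+	 * The offset is looked up in the per-region hashtable populated by
+	 * the failure callback; a hit returns VM_FAULT_HWPOISON so the
+	 * consumer gets a SIGBUS instead of a mapping to poisoned memory.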
+ */ + if (mem_offset < (nvdev->resmem.memlength >> PAGE_SHIFT)) { + hash_for_each_possible(nvdev->resmem.htbl, cur, node, mem_offset) { + if (cur->mem_offset == mem_offset) + return VM_FAULT_HWPOISON; + } + } + + if (mem_offset < (nvdev->usemem.memlength >> PAGE_SHIFT)) { + hash_for_each_possible(nvdev->usemem.htbl, cur, node, mem_offset) { + if (cur->mem_offset == mem_offset) + return VM_FAULT_HWPOISON; + } + } + +error_exit: + return VM_FAULT_ERROR; +} + +static const struct vm_operations_struct nvgrace_gpu_vfio_pci_mmap_ops = { + .fault = nvgrace_gpu_vfio_pci_fault, +}; +#endif + static void nvgrace_gpu_init_fake_bar_emu_regs(struct vfio_device *core_vdev) { struct nvgrace_gpu_pci_core_device *nvdev = @@ -115,6 +222,10 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) mutex_destroy(&nvdev->remap_lock); +#ifdef CONFIG_MEMORY_FAILURE + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); + unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); +#endif vfio_pci_core_close_device(core_vdev); } @@ -190,7 +301,16 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, vma->vm_pgoff = start_pfn; - return 0; +#ifdef CONFIG_MEMORY_FAILURE + vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; + + if (index == VFIO_PCI_BAR2_REGION_INDEX) + ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->resmem, vma); + else + ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->usemem, vma); +#endif + + return ret; } static long @@ -846,6 +966,13 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, if (ret) goto out_put_vdev; +#ifdef CONFIG_MEMORY_FAILURE + /* + * Initialize the hashtable tracking the poisoned pages. + */ + hash_init(nvdev->resmem.htbl); + hash_init(nvdev->usemem.htbl); +#endif return ret; out_put_vdev: @@ -857,6 +984,24 @@ static void nvgrace_gpu_remove(struct pci_dev *pdev) { struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); +#ifdef CONFIG_MEMORY_FAILURE + struct h_node *cur; + unsigned long bkt; + struct hlist_node *tmp_node; + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); + hash_for_each_safe(nvdev->resmem.htbl, bkt, tmp_node, cur, node) { + hash_del(&cur->node); + vfree(cur); + } + + hash_for_each_safe(nvdev->usemem.htbl, bkt, tmp_node, cur, node) { + hash_del(&cur->node); + vfree(cur); + } +#endif + vfio_pci_core_unregister_device(core_device); vfio_put_device(&core_device->vdev); } diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index e97d796a54fba..c2e798579eea9 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -1321,7 +1321,7 @@ const struct file_operations vfio_device_fops = { .mmap = vfio_device_fops_mmap, }; -static struct vfio_device *vfio_device_from_file(struct file *file) +struct vfio_device *vfio_device_from_file(struct file *file) { struct vfio_device_file *df = file->private_data; @@ -1329,6 +1329,7 @@ static struct vfio_device *vfio_device_from_file(struct file *file) return NULL; return df->device; } +EXPORT_SYMBOL_GPL(vfio_device_from_file); /** * vfio_file_is_valid - True if the file is valid vfio file From a556373fda0384a991a0b00ce6d98ba3917582bd Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Mon, 26 Feb 2024 08:52:18 +0000 Subject: [PATCH 309/352] KVM: arm64: determine memory type from VMA Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- arch/arm64/include/asm/kvm_pgtable.h | 8 +++++ arch/arm64/kvm/hyp/pgtable.c | 2 +- arch/arm64/kvm/mmu.c | 44 +++++++++++++++++++++++----- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 19278dfe79782..b3464da626fc8 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -460,6 +460,14 @@ u64 kvm_pgtable_hyp_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size); */ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); +/** + * stage2_has_fwb() - Determine whether FWB is supported + * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*() + * + * Return: True if FWB is supported. + */ +bool stage2_has_fwb(struct kvm_pgtable *pgt); + /** * kvm_pgtable_stage2_pgd_size() - Helper to compute size of a stage-2 PGD * @vtcr: Content of the VTCR register. diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index b7cb9b67a063f..e34cce31bd65f 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -684,7 +684,7 @@ u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) return vtcr; } -static bool stage2_has_fwb(struct kvm_pgtable *pgt) +bool stage2_has_fwb(struct kvm_pgtable *pgt) { if (!cpus_have_final_cap(ARM64_HAS_STAGE2_FWB)) return false; diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 29db0d71cdd7e..f727ea1ceb09c 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -1374,6 +1374,15 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma) return vma->vm_flags & VM_MTE_ALLOWED; } +/* + * Determine the memory region cacheability from VMA's pgprot. This + * is used to set the stage 2 PTEs. + */ +static unsigned long mapping_type(pgprot_t page_prot) +{ + return FIELD_GET(PTE_ATTRINDX_MASK, pgprot_val(page_prot)); +} + static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, struct kvm_memory_slot *memslot, unsigned long hva, bool fault_is_perm) @@ -1474,6 +1483,22 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED; + /* + * Figure out the memory type based on the user va mapping properties + * Only MT_DEVICE_nGnRE and MT_DEVICE_nGnRnE will be set using + * pgprot_device() and pgprot_noncached() respectively. + */ + if ((mapping_type(vma->vm_page_prot) == MT_DEVICE_nGnRE) || + (mapping_type(vma->vm_page_prot) == MT_DEVICE_nGnRnE) || + (mapping_type(vma->vm_page_prot) == MT_NORMAL_NC)) { + if (vfio_allow_any_uc) + prot |= KVM_PGTABLE_PROT_NORMAL_NC; + else + prot |= KVM_PGTABLE_PROT_DEVICE; + } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { + prot |= KVM_PGTABLE_PROT_X; + } + /* Don't use the VMA after the unlock -- it may have vanished */ vma = NULL; @@ -1559,13 +1584,18 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, if (exec_fault) prot |= KVM_PGTABLE_PROT_X; - if (device) { - if (vfio_allow_any_uc) - prot |= KVM_PGTABLE_PROT_NORMAL_NC; - else - prot |= KVM_PGTABLE_PROT_DEVICE; - } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) { - prot |= KVM_PGTABLE_PROT_X; + /* + * When FWB is unsupported KVM needs to do cache flushes + * (via dcache_clean_inval_poc()) of the underlying memory. This is + * only possible if the memory is already mapped into the kernel map + * at the usual spot. + * + * Validate that there is a struct page for the PFN which maps + * to the KVA that the flushing code expects. 
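+	 * With FWB those flushes are not needed, so a PFN without a
+	 * struct page (and hence without a linear-map KVA) is acceptable.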
+ */
+	if (!stage2_has_fwb(pgt) && !(pfn_valid(pfn))) {
+		ret = -EINVAL;
+		goto out_unlock;
 	}
 
 	/*

From ed0737902b57f8768931d96b1d082018d52248c3 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Mon, 4 Dec 2023 22:38:25 +0000
Subject: [PATCH 310/352] arm64: configs: Build NVGRACE_GPU_VFIO_PCI as LKM

Signed-off-by: Nicolin Chen
Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 arch/arm64/configs/defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index df6049a879683..f9af660236bcc 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1649,3 +1649,4 @@ CONFIG_CORESIGHT_STM=m
 CONFIG_CORESIGHT_CPU_DEBUG=m
 CONFIG_CORESIGHT_CTI=m
 CONFIG_MEMTEST=y
+CONFIG_NVGRACE_GPU_VFIO_PCI=m

From f8129e0de5c874dbb4697f9639e35c779e99f38d Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Tue, 7 Nov 2023 04:07:47 -0800
Subject: [PATCH 311/352] arm64: configs: Enable IOMMUFD and VFIO_DEVICE_CDEV

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 arch/arm64/configs/defconfig | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index f9af660236bcc..5e8d1c7a1c9f2 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1650,3 +1650,8 @@ CONFIG_CORESIGHT_CPU_DEBUG=m
 CONFIG_CORESIGHT_CTI=m
 CONFIG_MEMTEST=y
 CONFIG_NVGRACE_GPU_VFIO_PCI=m
+CONFIG_VFIO_DEVICE_CDEV=y
+CONFIG_FAULT_INJECTION=y
+CONFIG_IOMMUFD_DRIVER=y
+CONFIG_IOMMUFD=y
+CONFIG_IOMMUFD_TEST=y

From 4830d441e32065eb58e3bd6e2841bc8b34d0ae7d Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Mon, 22 Jan 2024 06:42:45 +0000
Subject: [PATCH 312/352] arm64: configs: Replace VFIO_CONTAINER with IOMMUFD_VFIO_CONTAINER

CONFIG_IOMMUFD_VFIO_CONTAINER is the VFIO compatible mode provided by the
iommufd core, to replace VFIO_IOMMU_TYPE1. Enable it instead. This might
be used by the VFIO mdev feature.

Signed-off-by: Nicolin Chen
Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 arch/arm64/configs/defconfig | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 5e8d1c7a1c9f2..335d7cd08dc7a 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1651,7 +1651,9 @@ CONFIG_CORESIGHT_CTI=m
 CONFIG_MEMTEST=y
 CONFIG_NVGRACE_GPU_VFIO_PCI=m
 CONFIG_VFIO_DEVICE_CDEV=y
+# CONFIG_VFIO_CONTAINER is not set
 CONFIG_FAULT_INJECTION=y
 CONFIG_IOMMUFD_DRIVER=y
 CONFIG_IOMMUFD=y
 CONFIG_IOMMUFD_TEST=y
+CONFIG_IOMMUFD_VFIO_CONTAINER=y

From 4aef9281f3515081298ecd816299e8a8ab276540 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Thu, 29 Feb 2024 21:05:40 +0000
Subject: [PATCH 313/352] cover-letter: Add GPU passthrough support

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs

From a4208d0d499bfc9b798a976b7f48aa40e84802a2 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Wed, 3 Apr 2024 04:27:44 +0000
Subject: [PATCH 314/352] iommufd: Move iommufd_viommu structs to public iommufd header

Prepare for an embedded structure design for driver-managed
iommufd_viommu objects:

    // include/linux/iommufd.h
    struct iommufd_viommu {
        struct iommufd_object obj;
        ....
}; // Some IOMMU driver struct iommu_driver_viommu { struct iommufd_viommu core; .... }; Move iommufd_viommu structures along with iommufd_object to the public header. Reviewed-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/iommu/iommufd/iommufd_private.h | 25 ----------------------- include/linux/iommufd.h | 27 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 4cb1555991b89..f38b009775c5d 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -141,14 +140,6 @@ enum iommufd_object_type { IOMMUFD_OBJ_MAX, }; -/* Base struct for all objects with a userspace ID handle. */ -struct iommufd_object { - refcount_t shortterm_users; - refcount_t users; - enum iommufd_object_type type; - unsigned int id; -}; - static inline bool iommufd_lock_obj(struct iommufd_object *obj) { if (!refcount_inc_not_zero(&obj->users)) @@ -605,22 +596,6 @@ static inline int iommufd_event_virq_handler(struct iommufd_viommu_irq *virq) return iommufd_event_notify(&virq->event_virq->common, &virq->node); } -struct iommufd_viommu { - struct iommufd_object obj; - struct iommufd_ctx *ictx; - struct iommufd_hwpt_paging *hwpt; - - /* The locking order is vdev_ids_rwsem -> igroup::lock */ - struct rw_semaphore vdev_ids_rwsem; - struct xarray vdev_ids; - struct rw_semaphore virqs_rwsem; - struct list_head virqs; - - const struct iommufd_viommu_ops *ops; - - unsigned int type; -}; - static inline struct iommufd_viommu * iommufd_get_viommu(struct iommufd_ucmd *ucmd, u32 id) { diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 346a6257ed0ca..6e4eb60435107 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -8,7 +8,9 @@ #include #include +#include #include +#include struct device; struct file; @@ -17,9 +19,18 @@ struct iommu_user_data_array; struct iommufd_access; struct iommufd_ctx; struct iommufd_device; +struct iommufd_hwpt_paging; struct iommufd_viommu; struct page; +/* Base struct for all objects with a userspace ID handle. 
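+ * Drivers embed it as the first member of their own objects and convert
+ * back with container_of(), as the viommu patches below do.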
*/ +struct iommufd_object { + refcount_t shortterm_users; + refcount_t users; + unsigned int type; /* enum iommufd_object_type in iommufd_private.h */ + unsigned int id; +}; + struct iommufd_device *iommufd_device_bind(struct iommufd_ctx *ictx, struct device *dev, u32 *id); void iommufd_device_unbind(struct iommufd_device *idev); @@ -56,6 +67,22 @@ void iommufd_access_detach(struct iommufd_access *access); void iommufd_ctx_get(struct iommufd_ctx *ictx); +struct iommufd_viommu { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_hwpt_paging *hwpt; + + /* The locking order is vdev_ids_rwsem -> igroup::lock */ + struct rw_semaphore vdev_ids_rwsem; + struct xarray vdev_ids; + struct rw_semaphore virqs_rwsem; + struct list_head virqs; + + const struct iommufd_viommu_ops *ops; + + unsigned int type; +}; + struct iommufd_vdev_id { struct iommufd_viommu *viommu; struct iommufd_device *idev; From b4d01ce4a0d5e43631c5f81119baac39fe445eef Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 4 Apr 2024 23:07:14 +0000 Subject: [PATCH 315/352] iommufd: Rename _iommufd_object_alloc to iommufd_object_alloc_elm Currently, the object allocation function calls: level-0: iommufd_object_alloc() level-1: __iommufd_object_alloc() level-2: _iommufd_object_alloc() So the level-1 and level-2 look inverted. The level-1 allocator is a container_of converter with a pointer sanity, backing the level-0 allocator. But the level-2 allocator does the actual object element allocations. Thus, rename the level-2 allocator, so those two inverted namings would be: level-0: iommufd_object_alloc() level-1: __iommufd_object_alloc() level-2: iommufd_object_alloc_elm() Suggested-by: Jason Gunthorpe Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs
---
 drivers/iommu/iommufd/iommufd_private.h | 8 ++++----
 drivers/iommu/iommufd/main.c            | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index f38b009775c5d..73d3bd240c4a0 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -220,12 +220,12 @@ iommufd_object_put_and_try_destroy(struct iommufd_ctx *ictx,
 	iommufd_object_remove(ictx, obj, obj->id, 0);
 }
 
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
-					     size_t size,
-					     enum iommufd_object_type type);
+struct iommufd_object *iommufd_object_alloc_elm(struct iommufd_ctx *ictx,
+						size_t size,
+						enum iommufd_object_type type);
 
 #define __iommufd_object_alloc(ictx, ptr, type, obj) \
-	container_of(_iommufd_object_alloc( \
+	container_of(iommufd_object_alloc_elm( \
 			     ictx, \
 			     sizeof(*(ptr)) + BUILD_BUG_ON_ZERO( \
 						      offsetof(typeof(*(ptr)), \
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 22381ba031b5e..596f103bd173c 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -29,9 +29,9 @@ struct iommufd_object_ops {
 static const struct iommufd_object_ops iommufd_object_ops[];
 static struct miscdevice vfio_misc_dev;
 
-struct iommufd_object *_iommufd_object_alloc(struct iommufd_ctx *ictx,
-					     size_t size,
-					     enum iommufd_object_type type)
+struct iommufd_object *iommufd_object_alloc_elm(struct iommufd_ctx *ictx,
+						size_t size,
+						enum iommufd_object_type type)
 {
 	struct iommufd_object *obj;
 	int rc;

From 910f937d1b94855c27ce75d91257209703614d45 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Sat, 1 Jun 2024 01:03:21 +0000
Subject: [PATCH 316/352] iommufd/viommu: Support driver-managed viommu allocation

A driver supporting a unique virtualization feature might want to
allocate its own viommu object to hold some info or to initialize some
HW feature.

Add a new viommu_alloc op to nested parent domains, for a
driver-allocated viommu use case. Also provide a helper to allocate a
structure bundle.

It's suggested that a driver should embed a core-level viommu structure
in its driver-level viommu struct and call the iommufd_viommu_alloc()
helper, in which case the driver should also add its own viommu ops for
the driver-managed viommu:

    struct my_driver_viommu {
        struct iommufd_viommu core;
        /* driver-owned properties/features */
        ....
    };

    static const struct iommufd_viommu_ops my_driver_viommu_ops = {
        .free = my_driver_viommu_free,
        /* future ops for virtualization features */
        ....
    };

    static struct iommufd_viommu *my_driver_viommu_alloc(...)
    {
        struct my_driver_viommu *my_viommu =
                iommufd_viommu_alloc(ictx, my_driver_viommu, core,
                                     my_driver_viommu_ops);
        /* Init my_viommu and related HW feature */
        ....
        return &my_viommu->core;
    }

    static struct iommu_domain_ops my_driver_domain_ops = {
        ....
        .viommu_alloc = my_driver_viommu_alloc,
    };

With this, a driver can now allocate and initialize a viommu object, so
it would need a free op for the viommu too.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/main.c | 32 ------------------ drivers/iommu/iommufd/viommu.c | 20 ++++++++---- drivers/iommu/iommufd/viommu_api.c | 52 ++++++++++++++++++++++++++++++ include/linux/iommu.h | 15 +++++++++ include/linux/iommufd.h | 25 ++++++++++++++ 5 files changed, 106 insertions(+), 38 deletions(-) diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 596f103bd173c..3fcb42506f0fd 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -29,38 +29,6 @@ struct iommufd_object_ops { static const struct iommufd_object_ops iommufd_object_ops[]; static struct miscdevice vfio_misc_dev; -struct iommufd_object *iommufd_object_alloc_elm(struct iommufd_ctx *ictx, - size_t size, - enum iommufd_object_type type) -{ - struct iommufd_object *obj; - int rc; - - obj = kzalloc(size, GFP_KERNEL_ACCOUNT); - if (!obj) - return ERR_PTR(-ENOMEM); - obj->type = type; - /* Starts out bias'd by 1 until it is removed from the xarray */ - refcount_set(&obj->shortterm_users, 1); - refcount_set(&obj->users, 1); - - /* - * Reserve an ID in the xarray but do not publish the pointer yet since - * the caller hasn't initialized it yet. Once the pointer is published - * in the xarray and visible to other threads we can't reliably destroy - * it anymore, so the caller must complete all errorable operations - * before calling iommufd_object_finalize(). - */ - rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, - xa_limit_31b, GFP_KERNEL_ACCOUNT); - if (rc) - goto out_free; - return obj; -out_free: - kfree(obj); - return ERR_PTR(rc); -} - /* * Allow concurrent access to the object. * diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index b1eb900b7fbf5..301cf3d1219d5 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -20,6 +20,8 @@ void iommufd_viommu_destroy(struct iommufd_object *obj) } xa_destroy(&viommu->vdev_ids); + if (viommu->ops && viommu->ops->free) + viommu->ops->free(viommu); refcount_dec(&viommu->hwpt->common.obj.users); } @@ -51,12 +53,19 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) } domain = hwpt_paging->common.domain; - if (cmd->type != IOMMU_VIOMMU_TYPE_DEFAULT) { - rc = -EOPNOTSUPP; - goto out_put_hwpt; - } + if (cmd->type == IOMMU_VIOMMU_TYPE_DEFAULT) { + viommu = __iommufd_viommu_alloc( + ucmd->ictx, sizeof(*viommu), + domain->ops->default_viommu_ops); + } else { + if (!domain->ops || !domain->ops->viommu_alloc) { + rc = -EOPNOTSUPP; + goto out_put_hwpt; + } - viommu = iommufd_object_alloc(ucmd->ictx, viommu, IOMMUFD_OBJ_VIOMMU); + viommu = domain->ops->viommu_alloc(domain, idev->dev, + ucmd->ictx, cmd->type); + } if (IS_ERR(viommu)) { rc = PTR_ERR(viommu); goto out_put_hwpt; @@ -65,7 +74,6 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd) viommu->type = cmd->type; viommu->ictx = ucmd->ictx; viommu->hwpt = hwpt_paging; - viommu->ops = domain->ops->default_viommu_ops; xa_init(&viommu->vdev_ids); init_rwsem(&viommu->vdev_ids_rwsem); diff --git a/drivers/iommu/iommufd/viommu_api.c b/drivers/iommu/iommufd/viommu_api.c index d075727a1b38f..74daa3209905e 100644 --- a/drivers/iommu/iommufd/viommu_api.c +++ b/drivers/iommu/iommufd/viommu_api.c @@ -105,3 +105,55 @@ void iommufd_viommu_report_irq(struct iommufd_viommu *viommu, unsigned int type, up_read(&viommu->virqs_rwsem); } EXPORT_SYMBOL_NS_GPL(iommufd_viommu_report_irq, IOMMUFD); + +struct iommufd_object *iommufd_object_alloc_elm(struct iommufd_ctx *ictx, + size_t size, + enum iommufd_object_type type) +{ + struct 
iommufd_object *obj; + int rc; + + obj = kzalloc(size, GFP_KERNEL_ACCOUNT); + if (!obj) + return ERR_PTR(-ENOMEM); + obj->type = type; + /* Starts out bias'd by 1 until it is removed from the xarray */ + refcount_set(&obj->shortterm_users, 1); + refcount_set(&obj->users, 1); + + /* + * Reserve an ID in the xarray but do not publish the pointer yet since + * the caller hasn't initialized it yet. Once the pointer is published + * in the xarray and visible to other threads we can't reliably destroy + * it anymore, so the caller must complete all errorable operations + * before calling iommufd_object_finalize(). + */ + rc = xa_alloc(&ictx->objects, &obj->id, XA_ZERO_ENTRY, + xa_limit_31b, GFP_KERNEL_ACCOUNT); + if (rc) + goto out_free; + return obj; +out_free: + kfree(obj); + return ERR_PTR(rc); +} +EXPORT_SYMBOL_NS_GPL(iommufd_object_alloc_elm, IOMMUFD); + +struct iommufd_viommu * +__iommufd_viommu_alloc(struct iommufd_ctx *ictx, size_t size, + const struct iommufd_viommu_ops *ops) +{ + struct iommufd_viommu *viommu; + struct iommufd_object *obj; + + if (WARN_ON(size < sizeof(*viommu))) + return ERR_PTR(-EINVAL); + obj = iommufd_object_alloc_elm(ictx, size, IOMMUFD_OBJ_VIOMMU); + if (IS_ERR(obj)) + return ERR_CAST(obj); + viommu = container_of(obj, struct iommufd_viommu, obj); + if (ops) + viommu->ops = ops; + return viommu; +} +EXPORT_SYMBOL_NS_GPL(__iommufd_viommu_alloc, IOMMUFD); diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 02e5a543c79a6..370fdc1f18721 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -42,6 +42,7 @@ struct notifier_block; struct iommu_sva; struct iommu_dma_cookie; struct iommu_fault_param; +struct iommufd_ctx; struct iommufd_viommu; struct iommufd_viommu_ops; @@ -684,6 +685,16 @@ struct iommu_ops { * must be defined in include/uapi/linux/iommufd.h * @default_viommu_ops: Driver can choose to use a default core-allocated core- * managed viommu object by providing a default viommu ops. + * Otherwise, i.e. for a driver-managed viommu, viommu_ops + * should be passed in via iommufd_viommu_alloc() helper in + * its own viommu_alloc op. + * @viommu_alloc: Allocate an iommufd_viommu associating to a nested parent + * @domain as a user space IOMMU instance for HW-accelerated + * features from the physical IOMMU behind the @dev. The + * @viommu_type must be defined in include/uapi/linux/iommufd.h + * It is suggested to call iommufd_viommu_alloc() helper for + * a bundled allocation of the core and the driver structures, + * using the given @ictx pointer. * @iova_to_phys: translate iova to physical address * @enforce_cache_coherency: Prevent any kind of DMA from bypassing IOMMU_CACHE, * including no-snoop TLPs on PCIe or other platform @@ -717,6 +728,10 @@ struct iommu_domain_ops { dma_addr_t iova); const struct iommufd_viommu_ops *default_viommu_ops; + struct iommufd_viommu *(*viommu_alloc)(struct iommu_domain *domain, + struct device *dev, + struct iommufd_ctx *ictx, + unsigned int viommu_type); bool (*enforce_cache_coherency)(struct iommu_domain *domain); int (*set_pgtable_quirks)(struct iommu_domain *domain, diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 6e4eb60435107..7fa76172fb9fd 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -91,6 +91,8 @@ struct iommufd_vdev_id { /** * struct iommufd_viommu_ops - viommu specific operations + * @free: Free all driver-specific parts of an iommufd_viommu. 
The memory
+ * of the entire viommu will be freed by the iommufd core
 * @set_vdev_id: Set a virtual device id for a device assigned to a viommu.
 *               Driver allocates an iommufd_vdev_id and returns its pointer.
 * @unset_vdev_id: Unset a virtual device id for a device assigned to a viommu.
@@ -106,6 +108,7 @@ struct iommufd_vdev_id {
  *                    include/uapi/linux/iommufd.h
  */
 struct iommufd_viommu_ops {
+	void (*free)(struct iommufd_viommu *viommu);
 	struct iommufd_vdev_id *(*set_vdev_id)(struct iommufd_viommu *viommu,
 					       struct device *dev, u64 id);
 	void (*unset_vdev_id)(struct iommufd_vdev_id *vdev_id);
@@ -137,6 +140,9 @@ struct iommu_domain *
 iommufd_viommu_to_parent_domain(struct iommufd_viommu *viommu);
 void iommufd_viommu_report_irq(struct iommufd_viommu *viommu, unsigned int type,
 			       void *irq_ptr, size_t irq_len);
+struct iommufd_viommu *
+__iommufd_viommu_alloc(struct iommufd_ctx *ictx, size_t size,
+		       const struct iommufd_viommu_ops *ops);
 #else /* !CONFIG_IOMMUFD */
 static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
 {
@@ -208,5 +214,24 @@ iommufd_viommu_report_irq(struct iommufd_viommu *viommu, unsigned int type,
 		       void *irq_ptr, size_t irq_len)
 {
 }
+
+static inline struct iommufd_viommu *
+__iommufd_viommu_alloc(struct iommufd_ctx *ictx, size_t size,
+		       const struct iommufd_viommu_ops *ops)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
 #endif /* CONFIG_IOMMUFD */
+
+/*
+ * Helpers for IOMMU driver to allocate driver structures that will be freed by
+ * the iommufd core. Yet, a driver is responsible for its own struct cleanup.
+ */
+#define iommufd_viommu_alloc(ictx, drv_struct, member, ops) \
+	container_of(__iommufd_viommu_alloc(ictx, \
+					    sizeof(struct drv_struct) + \
+					    BUILD_BUG_ON_ZERO(offsetof( \
+						    struct drv_struct, member)), \
+					    ops), \
+		     struct drv_struct, member)
 #endif

From d292e37d965fe8e27230ca09e254db829bc1d33f Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Sat, 1 Jun 2024 01:56:09 +0000
Subject: [PATCH 317/352] iommufd/viommu: Allow driver-level vdev_id structure

Similar to a driver embedding a core-level VIOMMU object in its
driver-level VIOMMU structure, allow it to do the same with the vdev_id
structure. This lets the driver hold extra information related to the
vdev_id beyond the common core-level information.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs
---
 drivers/iommu/iommufd/viommu_api.c | 13 +++++++++++++
 include/linux/iommufd.h            | 11 +++++++++++
 2 files changed, 24 insertions(+)

diff --git a/drivers/iommu/iommufd/viommu_api.c b/drivers/iommu/iommufd/viommu_api.c
index 74daa3209905e..d0defc219da48 100644
--- a/drivers/iommu/iommufd/viommu_api.c
+++ b/drivers/iommu/iommufd/viommu_api.c
@@ -157,3 +157,16 @@ __iommufd_viommu_alloc(struct iommufd_ctx *ictx, size_t size,
 	return viommu;
 }
 EXPORT_SYMBOL_NS_GPL(__iommufd_viommu_alloc, IOMMUFD);
+
+struct iommufd_vdev_id *__iommufd_vdev_id_alloc(size_t size)
+{
+	struct iommufd_vdev_id *vdev_id;
+
+	if (WARN_ON(size < sizeof(*vdev_id)))
+		return ERR_PTR(-EINVAL);
+	vdev_id = kzalloc(size, GFP_KERNEL);
+	if (!vdev_id)
+		return ERR_PTR(-ENOMEM);
+	return vdev_id;
+}
+EXPORT_SYMBOL_NS_GPL(__iommufd_vdev_id_alloc, IOMMUFD);
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 7fa76172fb9fd..3827a2bd8570d 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -143,6 +143,7 @@ void iommufd_viommu_report_irq(struct iommufd_viommu *viommu, unsigned int type,
 struct iommufd_viommu *
 __iommufd_viommu_alloc(struct iommufd_ctx *ictx, size_t size,
 		       const struct iommufd_viommu_ops *ops);
+struct iommufd_vdev_id *__iommufd_vdev_id_alloc(size_t size);
 #else /* !CONFIG_IOMMUFD */
 static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
 {
@@ -221,6 +222,11 @@ __iommufd_viommu_alloc(struct iommufd_ctx *ictx, size_t size,
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
+
+static inline struct iommufd_vdev_id *__iommufd_vdev_id_alloc(size_t size)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
 #endif /* CONFIG_IOMMUFD */
 
 /*
@@ -234,4 +240,9 @@ __iommufd_viommu_alloc(struct iommufd_ctx *ictx, size_t size,
 						    struct drv_struct, member)), \
 					    ops), \
 		     struct drv_struct, member)
+#define iommufd_vdev_id_alloc(drv_struct, member) \
+	container_of(__iommufd_vdev_id_alloc(sizeof(struct drv_struct) + \
+					     BUILD_BUG_ON_ZERO(offsetof( \
+						     struct drv_struct, member))), \
+		     struct drv_struct, member)
 #endif

From 3bf44c46e0a1eb600931708eeaa0b90b85a70c8b Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Thu, 4 Apr 2024 17:44:57 +0000
Subject: [PATCH 318/352] iommufd: Add struct iommufd_vqueue and its related viommu ops

Introduce a new core structure and its allocator iommufd_vqueue_alloc().
This can be used by a viommu to allocate a HW-accelerated queue, e.g.
NVIDIA's virtual command queue and AMD vIOMMU's command buffer.

Also add a pair of viommu ops for iommufd to forward user space ioctls
to IOMMU drivers.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/iommufd_private.h | 1 + drivers/iommu/iommufd/viommu_api.c | 16 +++++++++++++ include/linux/iommufd.h | 30 +++++++++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 73d3bd240c4a0..755aa80c1b2c4 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -134,6 +134,7 @@ enum iommufd_object_type { IOMMUFD_OBJ_EVENT_IOPF, IOMMUFD_OBJ_EVENT_VIRQ, IOMMUFD_OBJ_VIOMMU, + IOMMUFD_OBJ_VQUEUE, #ifdef CONFIG_IOMMUFD_TEST IOMMUFD_OBJ_SELFTEST, #endif diff --git a/drivers/iommu/iommufd/viommu_api.c b/drivers/iommu/iommufd/viommu_api.c index d0defc219da48..847c31af48588 100644 --- a/drivers/iommu/iommufd/viommu_api.c +++ b/drivers/iommu/iommufd/viommu_api.c @@ -170,3 +170,19 @@ struct iommufd_vdev_id *__iommufd_vdev_id_alloc(size_t size) return vdev_id; } EXPORT_SYMBOL_NS_GPL(__iommufd_vdev_id_alloc, IOMMUFD); + +struct iommufd_vqueue * +__iommufd_vqueue_alloc(struct iommufd_viommu *viommu, size_t size) +{ + struct iommufd_vqueue *vqueue; + struct iommufd_object *obj; + + if (WARN_ON(size < sizeof(*vqueue))) + return ERR_PTR(-EINVAL); + obj = iommufd_object_alloc_elm(viommu->ictx, size, IOMMUFD_OBJ_VQUEUE); + if (IS_ERR(obj)) + return ERR_CAST(obj); + vqueue = container_of(obj, struct iommufd_vqueue, obj); + return vqueue; +} +EXPORT_SYMBOL_NS_GPL(__iommufd_vqueue_alloc, IOMMUFD); diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h index 3827a2bd8570d..12f480b38d69b 100644 --- a/include/linux/iommufd.h +++ b/include/linux/iommufd.h @@ -15,6 +15,7 @@ struct device; struct file; struct iommu_group; +struct iommu_user_data; struct iommu_user_data_array; struct iommufd_access; struct iommufd_ctx; @@ -89,6 +90,12 @@ struct iommufd_vdev_id { u64 id; }; +struct iommufd_vqueue { + struct iommufd_object obj; + struct iommufd_ctx *ictx; + struct iommufd_viommu *viommu; +}; + /** * struct iommufd_viommu_ops - viommu specific operations * @free: Free all driver-specific parts of an iommufd_viommu. The memory @@ -106,6 +113,11 @@ struct iommufd_vdev_id { * array->entry_num to report the number of handled requests. * The data structure of the array entry must be defined in * include/uapi/linux/iommufd.h + * @vqueue_alloc: Allocate an iommufd_vqueue as a user space command queue for a + * @viommu instance. Queue specific @user_data must be defined in + * the include/uapi/linux/iommufd.h header. + * @vqueue_free: Free all driver-specific parts of an iommufd_vqueue. 
The memory
+ * of the iommufd_vqueue will be freed by the iommufd core
 */
 struct iommufd_viommu_ops {
 	void (*free)(struct iommufd_viommu *viommu);
@@ -114,6 +126,10 @@ struct iommufd_viommu_ops {
 	void (*unset_vdev_id)(struct iommufd_vdev_id *vdev_id);
 	int (*cache_invalidate)(struct iommufd_viommu *viommu,
 				struct iommu_user_data_array *array);
+	struct iommufd_vqueue *(*vqueue_alloc)(
+		struct iommufd_viommu *viommu,
+		const struct iommu_user_data *user_data);
+	void (*vqueue_free)(struct iommufd_vqueue *vqueue);
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)
@@ -144,6 +160,8 @@ struct iommufd_viommu *
 __iommufd_viommu_alloc(struct iommufd_ctx *ictx, size_t size,
 		       const struct iommufd_viommu_ops *ops);
 struct iommufd_vdev_id *__iommufd_vdev_id_alloc(size_t size);
+struct iommufd_vqueue *
+__iommufd_vqueue_alloc(struct iommufd_viommu *viommu, size_t size);
 #else /* !CONFIG_IOMMUFD */
 static inline struct iommufd_ctx *iommufd_ctx_from_file(struct file *file)
 {
@@ -227,6 +245,12 @@ static inline struct iommufd_vdev_id *__iommufd_vdev_id_alloc(size_t size)
 {
 	return ERR_PTR(-EOPNOTSUPP);
 }
+
+static inline struct iommufd_vqueue *
+__iommufd_vqueue_alloc(struct iommufd_viommu *viommu, size_t size)
+{
+	return ERR_PTR(-EOPNOTSUPP);
+}
 #endif /* CONFIG_IOMMUFD */
 
 /*
@@ -245,4 +269,10 @@ static inline struct iommufd_vdev_id *__iommufd_vdev_id_alloc(size_t size)
 					     BUILD_BUG_ON_ZERO(offsetof( \
 						     struct drv_struct, member))), \
 		     struct drv_struct, member)
+#define iommufd_vqueue_alloc(viommu, drv_struct, member) \
+	container_of(__iommufd_vqueue_alloc(viommu, \
+					    sizeof(struct drv_struct) + \
+					    BUILD_BUG_ON_ZERO(offsetof( \
+						    struct drv_struct, member))), \
+		     struct drv_struct, member)
 #endif

From c7814824b4a0f8f51e9e77c1cebd5ff2c3240f41 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Thu, 4 Apr 2024 20:50:46 +0000
Subject: [PATCH 319/352] iommufd: Add IOMMUFD_OBJ_VQUEUE and IOMMUFD_CMD_VQUEUE_ALLOC

Introduce a new IOMMUFD_OBJ_VQUEUE to represent a virtual command queue
instance. Add a new ioctl for user space to allocate it.

As an initial version, add IOMMU_VQUEUE_DATA_TEGRA241_CMDQV to the enum
iommu_vqueue_data_type and the corresponding iommu_vqueue_tegra241_cmdqv
data structure.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs --- drivers/iommu/iommufd/iommufd_private.h | 2 + drivers/iommu/iommufd/main.c | 6 +++ drivers/iommu/iommufd/viommu.c | 62 +++++++++++++++++++++++++ include/uapi/linux/iommufd.h | 33 +++++++++++++ 4 files changed, 103 insertions(+) diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h index 755aa80c1b2c4..64f35cb096467 100644 --- a/drivers/iommu/iommufd/iommufd_private.h +++ b/drivers/iommu/iommufd/iommufd_private.h @@ -623,6 +623,8 @@ int iommufd_viommu_alloc_ioctl(struct iommufd_ucmd *ucmd); void iommufd_viommu_destroy(struct iommufd_object *obj); int iommufd_viommu_set_vdev_id(struct iommufd_ucmd *ucmd); int iommufd_viommu_unset_vdev_id(struct iommufd_ucmd *ucmd); +int iommufd_vqueue_alloc_ioctl(struct iommufd_ucmd *ucmd); +void iommufd_vqueue_destroy(struct iommufd_object *obj); #ifdef CONFIG_IOMMUFD_TEST int iommufd_test(struct iommufd_ucmd *ucmd); diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c index 3fcb42506f0fd..d648e685bdc7c 100644 --- a/drivers/iommu/iommufd/main.c +++ b/drivers/iommu/iommufd/main.c @@ -304,6 +304,7 @@ union ucmd_buffer { struct iommu_viommu_alloc viommu; struct iommu_viommu_set_vdev_id set_vdev_id; struct iommu_viommu_unset_vdev_id unset_vdev_id; + struct iommu_vqueue_alloc vqueue; #ifdef CONFIG_IOMMUFD_TEST struct iommu_test_cmd test; #endif @@ -363,6 +364,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = { struct iommu_viommu_set_vdev_id, vdev_id), IOCTL_OP(IOMMU_VIOMMU_UNSET_VDEV_ID, iommufd_viommu_unset_vdev_id, struct iommu_viommu_unset_vdev_id, vdev_id), + IOCTL_OP(IOMMU_VQUEUE_ALLOC, iommufd_vqueue_alloc_ioctl, + struct iommu_vqueue_alloc, data_uptr), #ifdef CONFIG_IOMMUFD_TEST IOCTL_OP(IOMMU_TEST_CMD, iommufd_test, struct iommu_test_cmd, last), #endif @@ -504,6 +507,9 @@ static const struct iommufd_object_ops iommufd_object_ops[] = { [IOMMUFD_OBJ_VIOMMU] = { .destroy = iommufd_viommu_destroy, }, + [IOMMUFD_OBJ_VQUEUE] = { + .destroy = iommufd_vqueue_destroy, + }, #ifdef CONFIG_IOMMUFD_TEST [IOMMUFD_OBJ_SELFTEST] = { .destroy = iommufd_selftest_destroy, diff --git a/drivers/iommu/iommufd/viommu.c b/drivers/iommu/iommufd/viommu.c index 301cf3d1219d5..7818cb033ba41 100644 --- a/drivers/iommu/iommufd/viommu.c +++ b/drivers/iommu/iommufd/viommu.c @@ -219,3 +219,65 @@ int iommufd_viommu_unset_vdev_id(struct iommufd_ucmd *ucmd) iommufd_put_object(ucmd->ictx, &viommu->obj); return rc; } + +void iommufd_vqueue_destroy(struct iommufd_object *obj) +{ + struct iommufd_vqueue *vqueue = + container_of(obj, struct iommufd_vqueue, obj); + struct iommufd_viommu *viommu = vqueue->viommu; + + if (viommu->ops->vqueue_free) + viommu->ops->vqueue_free(vqueue); + refcount_dec(&viommu->obj.users); +} + +int iommufd_vqueue_alloc_ioctl(struct iommufd_ucmd *ucmd) +{ + struct iommu_vqueue_alloc *cmd = ucmd->cmd; + const struct iommu_user_data user_data = { + .type = cmd->data_type, + .uptr = u64_to_user_ptr(cmd->data_uptr), + .len = cmd->data_len, + }; + struct iommufd_vqueue *vqueue; + struct iommufd_viommu *viommu; + int rc; + + if (cmd->flags) + return -EOPNOTSUPP; + if (!cmd->data_len) + return -EINVAL; + + viommu = iommufd_get_viommu(ucmd, cmd->viommu_id); + if (IS_ERR(viommu)) + return PTR_ERR(viommu); + + if (!viommu->ops || !viommu->ops->vqueue_alloc) { + rc = -EOPNOTSUPP; + goto out_put_viommu; + } + + vqueue = viommu->ops->vqueue_alloc( + viommu, user_data.len ? 
&user_data : NULL);
+	if (IS_ERR(vqueue)) {
+		rc = PTR_ERR(vqueue);
+		goto out_put_viommu;
+	}
+
+	vqueue->viommu = viommu;
+	vqueue->ictx = ucmd->ictx;
+	cmd->out_vqueue_id = vqueue->obj.id;
+	rc = iommufd_ucmd_respond(ucmd, sizeof(*cmd));
+	if (rc)
+		goto out_free;
+	iommufd_object_finalize(ucmd->ictx, &vqueue->obj);
+	refcount_inc(&viommu->obj.users);
+	goto out_put_viommu;
+
+out_free:
+	if (viommu->ops->vqueue_free)
+		viommu->ops->vqueue_free(vqueue);
+out_put_viommu:
+	iommufd_put_object(ucmd->ictx, &viommu->obj);
+	return rc;
+}
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 1dc2c0b05af71..653b2b60ea661 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -55,6 +55,7 @@ enum {
 	IOMMUFD_CMD_VIOMMU_SET_VDEV_ID = 0x90,
 	IOMMUFD_CMD_VIOMMU_UNSET_VDEV_ID = 0x91,
 	IOMMUFD_CMD_VIRQ_ALLOC = 0x92,
+	IOMMUFD_CMD_VQUEUE_ALLOC = 0x93,
 };
 
 /**
@@ -997,4 +998,36 @@ struct iommu_virq_alloc {
 	__u32 out_virq_fd;
 };
 #define IOMMU_VIRQ_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VIRQ_ALLOC)
+
+/**
+ * enum iommu_vqueue_data_type - VQUEUE Data Type
+ * @IOMMU_VQUEUE_DATA_NONE: No Data
+ */
+enum iommu_vqueue_data_type {
+	IOMMU_VQUEUE_DATA_NONE = 0,
+};
+
+/**
+ * struct iommu_vqueue_alloc - ioctl(IOMMU_VQUEUE_ALLOC)
+ * @size: sizeof(struct iommu_vqueue_alloc)
+ * @flags: Must be 0
+ * @viommu_id: viommu ID to associate the virtual queue with
+ * @out_vqueue_id: The ID of the new virtual queue
+ * @data_type: One of enum iommu_vqueue_data_type
+ * @data_len: Length of the type specific data
+ * @data_uptr: User pointer to the type specific data
+ *
+ * Allocate a virtual queue object for a driver-specific HW-accelerated queue
+ */
+struct iommu_vqueue_alloc {
+	__u32 size;
+	__u32 flags;
+	__u32 viommu_id;
+	__u32 out_vqueue_id;
+	__u32 data_type;
+	__u32 data_len;
+	__aligned_u64 data_uptr;
+};
+#define IOMMU_VQUEUE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VQUEUE_ALLOC)
 #endif

From 43cf12179ee39b73c517656324c6924918873e76 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Fri, 7 Apr 2023 12:21:11 -0700
Subject: [PATCH 320/352] iommufd: Add mmap infrastructure

Add infrastructure for sharing kernel pages with user space. This allows
passing HW resources (VCMDQ MMIO pages, for example) through to the user
space VMM and guest OS.

Use vma->vm_pgoff as the carrier of a viommu_id.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs
---
 drivers/iommu/iommufd/main.c | 40 ++++++++++++++++++++++++++++++++++++
 include/linux/iommufd.h      |  4 ++++
 2 files changed, 44 insertions(+)

diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index d648e685bdc7c..a3b03233a1666 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -407,11 +408,50 @@ static long iommufd_fops_ioctl(struct file *filp, unsigned int cmd,
 	return ret;
 }
 
+static int iommufd_fops_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct iommufd_ctx *ictx = filp->private_data;
+	size_t size = vma->vm_end - vma->vm_start;
+	u32 viommu_id = (u32)vma->vm_pgoff;
+	struct iommufd_viommu *viommu;
+	unsigned long pfn;
+	int rc;
+
+	if (size > PAGE_SIZE)
+		return -EINVAL;
+
+	viommu = container_of(iommufd_get_object(ictx, viommu_id,
+						 IOMMUFD_OBJ_VIOMMU),
+			      struct iommufd_viommu, obj);
+	if (IS_ERR(viommu))
+		return PTR_ERR(viommu);
+
+	if (!viommu->ops->get_mmap_pfn) {
+		rc = -EOPNOTSUPP;
+		goto out_put_viommu;
+	}
+
+	pfn = viommu->ops->get_mmap_pfn(viommu, size);
+	if (!pfn) {
+		rc = -ENOMEM;
+		goto out_put_viommu;
+	}
+
+	vma->vm_pgoff = 0;
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+	vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
+	rc = remap_pfn_range(vma, vma->vm_start, pfn, size, vma->vm_page_prot);
+out_put_viommu:
+	iommufd_put_object(ictx, &viommu->obj);
+	return rc;
+}
+
 static const struct file_operations iommufd_fops = {
 	.owner = THIS_MODULE,
 	.open = iommufd_fops_open,
 	.release = iommufd_fops_release,
 	.unlocked_ioctl = iommufd_fops_ioctl,
+	.mmap = iommufd_fops_mmap,
 };
 
 /**
diff --git a/include/linux/iommufd.h b/include/linux/iommufd.h
index 12f480b38d69b..24d7532b87c2d 100644
--- a/include/linux/iommufd.h
+++ b/include/linux/iommufd.h
@@ -118,6 +118,8 @@ struct iommufd_vqueue {
 *                the include/uapi/linux/iommufd.h header.
 * @vqueue_free: Free all driver-specific parts of an iommufd_vqueue. The memory
 *               of the iommufd_vqueue will be freed by the iommufd core
+ * @get_mmap_pfn: Return the PFN of a viommu given a finite size, for user space
+ *                to mmap the page(s)
 */
 struct iommufd_viommu_ops {
 	void (*free)(struct iommufd_viommu *viommu);
@@ -130,6 +132,8 @@ struct iommufd_viommu_ops {
 		struct iommufd_viommu *viommu,
 		const struct iommu_user_data *user_data);
 	void (*vqueue_free)(struct iommufd_vqueue *vqueue);
+	unsigned long (*get_mmap_pfn)(struct iommufd_viommu *viommu,
+				      size_t pgsize);
 };
 
 #if IS_ENABLED(CONFIG_IOMMUFD)

From e848de6c5929eaf8da01d5a1252cd454837876a2 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Mon, 30 Aug 2021 19:59:23 -0700
Subject: [PATCH 321/352] iommu/tegra241-cmdqv: Add user-space use support

Add support, via the VIOMMU infrastructure, for the virtualization use
case. This allows a VMM to allocate VINTFs (as viommu objects) and
assign VCMDQs to them. A VINTF's MMIO page0 can be mmap'd to user space
so the VM can access it directly without a VMEXIT and the corresponding
hypercall.

As an initial version, the number of VCMDQs per VINTF is fixed to two.

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs --- drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 17 + drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h | 12 + .../iommu/arm/arm-smmu-v3/tegra241-cmdqv.c | 315 +++++++++++++++++- include/uapi/linux/iommufd.h | 17 + 4 files changed, 357 insertions(+), 4 deletions(-) diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c index 28a4de6892ac8..2af95bf1a7cb2 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c @@ -4001,6 +4001,22 @@ static int arm_smmu_viommu_cache_invalidate(struct iommufd_viommu *viommu, to_smmu_domain(domain), viommu, array); } +static struct iommufd_viommu * +arm_smmu_domain_viommu_alloc(struct iommu_domain *domain, struct device *dev, + struct iommufd_ctx *ictx, unsigned int viommu_type) +{ + struct arm_smmu_domain *smmu_domain = to_smmu_domain_devices(domain); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + + if (!master || !master->smmu) + return ERR_PTR(-ENODEV); + + if (master->smmu->impl_ops && master->smmu->impl_ops->viommu_alloc) + return master->smmu->impl_ops->viommu_alloc( + master->smmu, smmu_domain, ictx); + return ERR_PTR(-EOPNOTSUPP); +} + static struct iommu_ops arm_smmu_ops = { .identity_domain = &arm_smmu_identity_domain, .blocked_domain = &arm_smmu_blocked_domain, @@ -4031,6 +4047,7 @@ static struct iommu_ops arm_smmu_ops = { .iotlb_sync = arm_smmu_iotlb_sync, .iova_to_phys = arm_smmu_iova_to_phys, .free = arm_smmu_domain_free_paging, + .viommu_alloc = arm_smmu_domain_viommu_alloc, .default_viommu_ops = &(const struct iommufd_viommu_ops) { .set_vdev_id = arm_smmu_viommu_set_vdev_id, .unset_vdev_id = arm_smmu_viommu_unset_vdev_id, diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h index 04535e39b78f4..0ec13a55a3b3c 100644 --- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h +++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.h @@ -16,6 +16,7 @@ #include struct arm_smmu_device; +struct arm_smmu_domain; /* MMIO registers */ #define ARM_SMMU_IDR0 0x0 @@ -708,6 +709,9 @@ struct arm_smmu_impl_ops { int (*init_structures)(struct arm_smmu_device *smmu); struct arm_smmu_cmdq *(*get_secondary_cmdq)( struct arm_smmu_device *smmu, struct arm_smmu_cmdq_ent *ent); + struct iommufd_viommu *(*viommu_alloc)( + struct arm_smmu_device *smmu, struct arm_smmu_domain *smmu_domain, + struct iommufd_ctx *ictx); }; /* An SMMUv3 instance */ @@ -923,6 +927,14 @@ int arm_smmu_init_one_queue(struct arm_smmu_device *smmu, int arm_smmu_cmdq_init(struct arm_smmu_device *smmu, struct arm_smmu_cmdq *cmdq); +static inline phys_addr_t +arm_smmu_domain_ipa_to_pa(struct arm_smmu_domain *smmu_domain, u64 ipa) +{ + if (WARN_ON_ONCE(smmu_domain->stage != ARM_SMMU_DOMAIN_S2)) + return 0; + return iommu_iova_to_phys(&smmu_domain->domain, ipa); +} + #ifdef CONFIG_ARM_SMMU_V3_SVA bool arm_smmu_sva_supported(struct arm_smmu_device *smmu); bool arm_smmu_master_sva_supported(struct arm_smmu_master *master); diff --git a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c index fcd13d301fff6..56369e7d42441 100644 --- a/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c +++ b/drivers/iommu/arm/arm-smmu-v3/tegra241-cmdqv.c @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include @@ -26,6 +28,7 @@ #define CMDQV_EN BIT(0) #define TEGRA241_CMDQV_PARAM 0x0004 +#define CMDQV_NUM_SID_PER_VM_LOG2 GENMASK(15, 12) #define CMDQV_NUM_VINTF_LOG2 GENMASK(11, 8) #define CMDQV_NUM_VCMDQ_LOG2 
GENMASK(7, 4) @@ -53,6 +56,9 @@ #define VINTF_STATUS GENMASK(3, 1) #define VINTF_ENABLED BIT(0) +#define TEGRA241_VINTF_SID_MATCH(s) (0x0040 + 0x4*(s)) +#define TEGRA241_VINTF_SID_REPLACE(s) (0x0080 + 0x4*(s)) + #define TEGRA241_VINTF_LVCMDQ_ERR_MAP_64(m) \ (0x00C0 + 0x8*(m)) #define LVCMDQ_ERR_MAP_NUM_64 2 @@ -115,6 +121,7 @@ MODULE_PARM_DESC(bypass_vcmdq, /** * struct tegra241_vcmdq - Virtual Command Queue + * @core: Embedded iommufd_vqueue structure * @idx: Global index in the CMDQV * @lidx: Local index in the VINTF * @enabled: Enable status @@ -125,6 +132,8 @@ MODULE_PARM_DESC(bypass_vcmdq, * @page1: MMIO Page1 base address */ struct tegra241_vcmdq { + struct iommufd_vqueue core; + u16 idx; u16 lidx; @@ -137,18 +146,26 @@ struct tegra241_vcmdq { void __iomem *page0; void __iomem *page1; }; +#define vqueue_to_vcmdq(v) container_of(v, struct tegra241_vcmdq, core) /** * struct tegra241_vintf - Virtual Interface + * @core: Embedded iommufd_viommu structure * @idx: Global index in the CMDQV + * @vmid: VMID for configuration * @enabled: Enable status * @hyp_own: Owned by hypervisor (in-kernel) * @cmdqv: Parent CMDQV pointer * @lvcmdqs: List of logical VCMDQ pointers * @base: MMIO base address + * @s2_domain: Stage-2 SMMU domain + * @sid_slots: Stream ID Slot allocator */ struct tegra241_vintf { + struct iommufd_viommu core; + u16 idx; + u16 vmid; bool enabled; bool hyp_own; @@ -157,6 +174,24 @@ struct tegra241_vintf { struct tegra241_vcmdq **lvcmdqs; void __iomem *base; + struct arm_smmu_domain *s2_domain; + + struct ida sid_slots; +}; +#define viommu_to_vintf(v) container_of(v, struct tegra241_vintf, core) + +/** + * struct tegra241_vintf_sid_slot - Virtual Interface Stream ID Slot + * @core: Embedded iommufd_vdev_id structure + * @vintf: Parent VINTF pointer + * @sid: Physical Stream ID + * @id: Slot index in the VINTF + */ +struct tegra241_vintf_sid_slot { + struct iommufd_vdev_id core; + struct tegra241_vintf *vintf; + u32 sid; + u8 idx; }; /** @@ -164,10 +199,12 @@ struct tegra241_vintf { * @smmu: SMMUv3 device * @dev: CMDQV device * @base: MMIO base address + * @base_phys: Page frame number of @base, for mmap * @irq: IRQ number * @num_vintfs: Total number of VINTFs * @num_vcmdqs: Total number of VCMDQs * @num_lvcmdqs_per_vintf: Number of logical VCMDQs per VINTF + * @num_sids_per_vintf: Total number of SID replacements per VINTF * @vintf_ids: VINTF id allocator * @vintfs: List of VINTFs */ @@ -176,12 +213,14 @@ struct tegra241_cmdqv { struct device *dev; void __iomem *base; + unsigned long base_pfn; int irq; /* CMDQV Hardware Params */ u16 num_vintfs; u16 num_vcmdqs; u16 num_lvcmdqs_per_vintf; + u16 num_sids_per_vintf; struct ida vintf_ids; @@ -380,6 +419,11 @@ static void tegra241_vcmdq_hw_deinit(struct tegra241_vcmdq *vcmdq) dev_dbg(vcmdq->cmdqv->dev, "%sdeinited\n", h); } +static void _tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq) +{ + writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE)); +} + static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq) { char header[64], *h = lvcmdq_error_header(vcmdq, header, 64); @@ -389,7 +433,7 @@ static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq) tegra241_vcmdq_hw_deinit(vcmdq); /* Configure and enable VCMDQ */ - writeq_relaxed(vcmdq->cmdq.q.q_base, REG_VCMDQ_PAGE1(vcmdq, BASE)); + _tegra241_vcmdq_hw_init(vcmdq); ret = vcmdq_write_config(vcmdq, VCMDQ_EN); if (ret) { @@ -408,11 +452,16 @@ static int tegra241_vcmdq_hw_init(struct tegra241_vcmdq *vcmdq) static void tegra241_vintf_hw_deinit(struct 
tegra241_vintf *vintf) { u16 lidx; + int slot; for (lidx = 0; lidx < vintf->cmdqv->num_lvcmdqs_per_vintf; lidx++) if (vintf->lvcmdqs && vintf->lvcmdqs[lidx]) tegra241_vcmdq_hw_deinit(vintf->lvcmdqs[lidx]); vintf_write_config(vintf, 0); + for (slot = 0; slot < vintf->cmdqv->num_sids_per_vintf; slot++) { + writel_relaxed(0, REG_VINTF(vintf, SID_REPLACE(slot))); + writel_relaxed(0, REG_VINTF(vintf, SID_MATCH(slot))); + } } static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) @@ -430,7 +479,8 @@ static int tegra241_vintf_hw_init(struct tegra241_vintf *vintf, bool hyp_own) * whether enabling it here or not, as !HYP_OWN cmdq HWs only support a * restricted set of supported commands. */ - regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own); + regval = FIELD_PREP(VINTF_HYP_OWN, hyp_own) | + FIELD_PREP(VINTF_VMID, vintf->vmid); writel(regval, REG_VINTF(vintf, CONFIG)); ret = vintf_write_config(vintf, regval | VINTF_EN); @@ -562,7 +612,9 @@ static void tegra241_vintf_free_lvcmdq(struct tegra241_vintf *vintf, u16 lidx) dev_dbg(vintf->cmdqv->dev, "%sdeallocated\n", lvcmdq_error_header(vcmdq, header, 64)); - kfree(vcmdq); + /* Guest-owned VCMDQ is free-ed with vqueue by iommufd core */ + if (vcmdq->vintf->hyp_own) + kfree(vcmdq); } static struct tegra241_vcmdq * @@ -656,7 +708,10 @@ static void tegra241_cmdqv_remove_vintf(struct tegra241_cmdqv *cmdqv, u16 idx) dev_dbg(cmdqv->dev, "VINTF%u: deallocated\n", vintf->idx); tegra241_cmdqv_deinit_vintf(cmdqv, idx); - kfree(vintf); + ida_destroy(&vintf->sid_slots); + /* Guest-owned VINTF is free-ed with viommu by iommufd core */ + if (vintf->hyp_own) + kfree(vintf); } static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) @@ -684,10 +739,16 @@ static void tegra241_cmdqv_remove(struct arm_smmu_device *smmu) put_device(cmdqv->dev); /* smmu->impl_dev */ } +static struct iommufd_viommu * +tegra241_cmdqv_viommu_alloc(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain, + struct iommufd_ctx *ictx); + static struct arm_smmu_impl_ops tegra241_cmdqv_impl_ops = { .get_secondary_cmdq = tegra241_cmdqv_get_cmdq, .device_reset = tegra241_cmdqv_hw_reset, .device_remove = tegra241_cmdqv_remove, + .viommu_alloc = tegra241_cmdqv_viommu_alloc, }; /* Probe Functions */ @@ -839,6 +900,7 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, cmdqv->irq = irq; cmdqv->base = base; cmdqv->dev = smmu->impl_dev; + cmdqv->base_pfn = res->start >> PAGE_SHIFT; if (cmdqv->irq > 0) { ret = request_irq(irq, tegra241_cmdqv_isr, 0, "tegra241-cmdqv", @@ -854,6 +916,8 @@ __tegra241_cmdqv_probe(struct arm_smmu_device *smmu, struct resource *res, cmdqv->num_vintfs = 1 << FIELD_GET(CMDQV_NUM_VINTF_LOG2, regval); cmdqv->num_vcmdqs = 1 << FIELD_GET(CMDQV_NUM_VCMDQ_LOG2, regval); cmdqv->num_lvcmdqs_per_vintf = cmdqv->num_vcmdqs / cmdqv->num_vintfs; + cmdqv->num_sids_per_vintf = + 1 << FIELD_GET(CMDQV_NUM_SID_PER_VM_LOG2, regval); cmdqv->vintfs = kcalloc(cmdqv->num_vintfs, sizeof(*cmdqv->vintfs), GFP_KERNEL); @@ -907,3 +971,246 @@ struct arm_smmu_device *tegra241_cmdqv_probe(struct arm_smmu_device *smmu) put_device(smmu->impl_dev); return ERR_PTR(-ENODEV); } + +/* User-space VIOMMU and VQUEUE Functions */ + +static int tegra241_vcmdq_hw_init_user(struct tegra241_vcmdq *vcmdq) +{ + char header[32]; + + /* Configure the vcmdq only; User space does the enabling */ + _tegra241_vcmdq_hw_init(vcmdq); + + dev_dbg(vcmdq->cmdqv->dev, + "%sinited at host PA 0x%llx size 0x%lx\n", + lvcmdq_error_header(vcmdq, header, 32), + vcmdq->cmdq.q.q_base & 
VCMDQ_ADDR, + 1UL << (vcmdq->cmdq.q.q_base & VCMDQ_LOG2SIZE)); + return 0; +} + +static struct iommufd_vqueue * +tegra241_cmdqv_vqueue_alloc(struct iommufd_viommu *viommu, + const struct iommu_user_data *user_data) +{ + struct tegra241_vintf *vintf = viommu_to_vintf(viommu); + struct tegra241_cmdqv *cmdqv = vintf->cmdqv; + struct iommu_vqueue_tegra241_cmdqv arg; + struct tegra241_vcmdq *vcmdq; + phys_addr_t q_base; + char header[32]; + int ret; + + ret = iommu_copy_struct_from_user(&arg, user_data, + IOMMU_VQUEUE_DATA_TEGRA241_CMDQV, + vcmdq_base); + if (ret) + return ERR_PTR(ret); + + if (!arg.vcmdq_base || arg.vcmdq_base & ~VCMDQ_ADDR) + return ERR_PTR(-EINVAL); + if (!arg.vcmdq_log2size || arg.vcmdq_log2size > VCMDQ_LOG2SIZE) + return ERR_PTR(-EINVAL); + if (arg.vcmdq_id >= cmdqv->num_lvcmdqs_per_vintf) + return ERR_PTR(-EINVAL); + q_base = arm_smmu_domain_ipa_to_pa(vintf->s2_domain, arg.vcmdq_base); + if (!q_base) + return ERR_PTR(-EINVAL); + + if (vintf->lvcmdqs[arg.vcmdq_id]) { + vcmdq = vintf->lvcmdqs[arg.vcmdq_id]; + + /* deinit the previous setting as a reset, before re-init */ + tegra241_vcmdq_hw_deinit(vcmdq); + + vcmdq->cmdq.q.q_base = q_base & VCMDQ_ADDR; + vcmdq->cmdq.q.q_base |= arg.vcmdq_log2size; + tegra241_vcmdq_hw_init_user(vcmdq); + + return &vcmdq->core; + } + + vcmdq = iommufd_vqueue_alloc(viommu, tegra241_vcmdq, core); + if (!vcmdq) + return ERR_PTR(-ENOMEM); + + ret = tegra241_vintf_init_lvcmdq(vintf, arg.vcmdq_id, vcmdq); + if (ret) + goto free_vcmdq; + dev_dbg(cmdqv->dev, "%sallocated\n", lvcmdq_error_header(vcmdq, header, 32)); + + vcmdq->cmdq.q.q_base = q_base & VCMDQ_ADDR; + vcmdq->cmdq.q.q_base |= arg.vcmdq_log2size; + + ret = tegra241_vcmdq_hw_init_user(vcmdq); + if (ret) + goto free_vcmdq; + vintf->lvcmdqs[arg.vcmdq_id] = vcmdq; + + return &vcmdq->core; +free_vcmdq: + kfree(vcmdq); + return ERR_PTR(ret); +} + +static void tegra241_cmdqv_vqueue_free(struct iommufd_vqueue *vqueue) +{ + struct tegra241_vcmdq *vcmdq = vqueue_to_vcmdq(vqueue); + + tegra241_vintf_remove_lvcmdq(vcmdq->vintf, vcmdq->lidx); + + /* IOMMUFD core frees the memory of vcmdq and vqueue */ +} + +static void tegra241_cmdqv_viommu_free(struct iommufd_viommu *viommu) +{ + struct tegra241_vintf *vintf = viommu_to_vintf(viommu); + + tegra241_cmdqv_remove_vintf(vintf->cmdqv, vintf->idx); + + /* IOMMUFD core frees the memory of vintf and viommu */ +} + +static struct iommufd_vdev_id * +tegra241_cmdqv_viommu_set_vdev_id(struct iommufd_viommu *viommu, + struct device *dev, u64 dev_id) +{ + struct tegra241_vintf *vintf = + container_of(viommu, struct tegra241_vintf, core); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct arm_smmu_stream *stream = &master->streams[0]; + struct tegra241_vintf_sid_slot *slot; + int idx; + + if (dev_id > UINT_MAX) + return ERR_PTR(-EINVAL); + + slot = iommufd_vdev_id_alloc(tegra241_vintf_sid_slot, core); + if (!slot) + return ERR_PTR(-ENOMEM); + + WARN_ON_ONCE(master->num_streams != 1); + + /* Find an empty slot of SID_MATCH and SID_REPLACE */ + idx = ida_alloc_max(&vintf->sid_slots, + vintf->cmdqv->num_sids_per_vintf - 1, GFP_KERNEL); + if (idx < 0) { + kfree(slot); + return ERR_PTR(idx); + } + + writel_relaxed(stream->id, REG_VINTF(vintf, SID_REPLACE(idx))); + writel_relaxed(dev_id << 1 | 0x1, REG_VINTF(vintf, SID_MATCH(idx))); + dev_dbg(vintf->cmdqv->dev, + "VINTF%u: allocated a slot (%d) for pSID=%x, vSID=%x\n", + vintf->idx, idx, stream->id, (u32)dev_id); + + slot->idx = idx; + slot->vintf = vintf; + slot->sid = stream->id; + + /* FIXME add a 
helper to be used by both drivers? */ + mutex_lock(&master->lock); + master->vdev_id = &slot->core; + mutex_unlock(&master->lock); + + return &slot->core; +} + +static void tegra241_cmdqv_viommu_unset_vdev_id(struct iommufd_vdev_id *vdev_id) +{ + struct tegra241_vintf_sid_slot *slot = + container_of(vdev_id, struct tegra241_vintf_sid_slot, core); + struct device *dev = iommufd_vdev_id_to_dev(vdev_id); + struct arm_smmu_master *master = dev_iommu_priv_get(dev); + struct tegra241_vintf *vintf = slot->vintf; + + /* FIXME add a helper to be used by both drivers? */ + mutex_lock(&master->lock); + master->vdev_id = NULL; + mutex_unlock(&master->lock); + + writel_relaxed(0, REG_VINTF(vintf, SID_REPLACE(slot->idx))); + writel_relaxed(0, REG_VINTF(vintf, SID_MATCH(slot->idx))); + ida_free(&vintf->sid_slots, slot->idx); + dev_dbg(vintf->cmdqv->dev, + "VINTF%u: deallocated a slot (%d) for pSID=%x\n", + vintf->idx, slot->idx, slot->sid); + + /* IOMMUFD core frees the memory of slot and vdev_id */ +} + +static unsigned long tegra241_cmdqv_get_mmap_pfn(struct iommufd_viommu *viommu, + size_t pgsize) +{ + struct tegra241_vintf *vintf = + container_of(viommu, struct tegra241_vintf, core); + struct tegra241_cmdqv *cmdqv = vintf->cmdqv; + + return cmdqv->base_pfn + TEGRA241_VINTFi_PAGE0(vintf->idx) / PAGE_SIZE; +} + +static struct iommufd_viommu_ops tegra241_cmdqv_viommu_ops = { + .free = tegra241_cmdqv_viommu_free, + .set_vdev_id = tegra241_cmdqv_viommu_set_vdev_id, + .unset_vdev_id = tegra241_cmdqv_viommu_unset_vdev_id, + .vqueue_alloc = tegra241_cmdqv_vqueue_alloc, + .vqueue_free = tegra241_cmdqv_vqueue_free, + .get_mmap_pfn = tegra241_cmdqv_get_mmap_pfn, +}; + +static struct iommufd_viommu * +tegra241_cmdqv_viommu_alloc(struct arm_smmu_device *smmu, + struct arm_smmu_domain *smmu_domain, + struct iommufd_ctx *ictx) +{ + struct tegra241_cmdqv *cmdqv = + container_of(smmu, struct tegra241_cmdqv, smmu); + struct tegra241_vintf *vintf; + int ret; + + if (!smmu_domain || smmu_domain->stage != ARM_SMMU_DOMAIN_S2) + return ERR_PTR(-EINVAL); + + tegra241_cmdqv_viommu_ops.cache_invalidate = + smmu_domain->domain.ops->default_viommu_ops->cache_invalidate; + vintf = iommufd_viommu_alloc(ictx, tegra241_vintf, core, + &tegra241_cmdqv_viommu_ops); + if (!vintf) + return ERR_PTR(-ENOMEM); + + ret = tegra241_cmdqv_init_vintf(cmdqv, cmdqv->num_vintfs - 1, vintf); + if (ret < 0) { + dev_err(cmdqv->dev, "no more available vintf\n"); + goto free_vintf; + } + + vintf->s2_domain = smmu_domain; + vintf->vmid = smmu_domain->s2_cfg.vmid; + + ret = tegra241_vintf_hw_init(vintf, false); + if (ret) + goto deinit_vintf; + + vintf->lvcmdqs = kcalloc(cmdqv->num_lvcmdqs_per_vintf, + sizeof(*vintf->lvcmdqs), GFP_KERNEL); + if (!vintf->lvcmdqs) { + ret = -ENOMEM; + goto hw_deinit_vintf; + } + + ida_init(&vintf->sid_slots); + + dev_dbg(cmdqv->dev, "VINTF%u: allocated with vmid (%d)\n", + vintf->idx, vintf->vmid); + + return &vintf->core; + +hw_deinit_vintf: + tegra241_vintf_hw_deinit(vintf); +deinit_vintf: + tegra241_cmdqv_deinit_vintf(cmdqv, vintf->idx); +free_vintf: + kfree(vintf); + return ERR_PTR(ret); +} diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h index 653b2b60ea661..933c66b7aabb6 100644 --- a/include/uapi/linux/iommufd.h +++ b/include/uapi/linux/iommufd.h @@ -893,6 +893,7 @@ struct iommu_fault_alloc { */ enum iommu_viommu_type { IOMMU_VIOMMU_TYPE_DEFAULT = 0, + IOMMU_VIOMMU_TYPE_TEGRA241_CMDQV = 1, }; /** @@ -999,12 +1000,28 @@ struct iommu_virq_alloc { }; #define IOMMU_VIRQ_ALLOC _IO(IOMMUFD_TYPE, 
IOMMUFD_CMD_VIRQ_ALLOC) +/** + * struct iommu_vqueue_tegra241_cmdqv - NVIDIA Tegra241's Virtual Command Queue + * for its CMDQV Extension for ARM SMMUv3 + * (IOMMU_VQUEUE_DATA_TEGRA241_CMDQV) + * @vcmdq_id: logical ID of a virtual command queue in the VIOMMU instance + * @vcmdq_log2size: (1 << @vcmdq_log2size) will be the size of the vcmdq + * @vcmdq_base: guest physical address (IPA) to the vcmdq base address + */ +struct iommu_vqueue_tegra241_cmdqv { + __u32 vcmdq_id; + __u32 vcmdq_log2size; + __aligned_u64 vcmdq_base; +}; + /** * enum iommu_vqueue_data_type - VQUEUE Data Type * @IOMMU_VQUEUE_DATA_NONE: No Data + * @IOMMU_VQUEUE_DATA_TEGRA241_CMDQV: NVIDIA Tegra241 CMDQV Extension for SMMUv3 */ enum iommu_vqueue_data_type { IOMMU_VQUEUE_DATA_NONE = 0, + IOMMU_VQUEUE_DATA_TEGRA241_CMDQV = 1, }; /** From 15e11728e4a25eb088971c4318d6cf013f1dd547 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Thu, 28 Mar 2024 21:46:47 +0000 Subject: [PATCH 322/352] cover-letter: iommufd: Add VIOMMU infrastructure (Part-3 VQUEUE) This is also the part-2 for Tegra241 CMDQV's user-space support. Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs From 752afed42421cca25a2dce87451b95255d944767 Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Mon, 1 May 2023 14:46:37 -0700 Subject: [PATCH 323/352] arm64: defconfig: Enable CONFIG_TEGRA241_CMDQV Enable the new tegra241_cmdqv driver to use the HW feature by default. Booting kernel has an option to bypass the feature by adding to bootargs: "arm_smmu_v3.bypass_vcmdq=y" Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- arch/arm64/configs/defconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index 335d7cd08dc7a..879ecd19725cf 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -1322,6 +1322,7 @@ CONFIG_ROCKCHIP_IOMMU=y CONFIG_TEGRA_IOMMU_SMMU=y CONFIG_ARM_SMMU=y CONFIG_ARM_SMMU_V3=y +CONFIG_TEGRA241_CMDQV=y CONFIG_MTK_IOMMU=y CONFIG_QCOM_IOMMU=y CONFIG_REMOTEPROC=y From c8cbf07b294590d196220ecf15ad9051b8fab94f Mon Sep 17 00:00:00 2001 From: Nicolin Chen Date: Fri, 12 Jul 2024 02:59:19 +0000 Subject: [PATCH 324/352] arm64: defconfig: Enable CONFIG_DMA_MAP_BENCHMARK Enable this upstream tool for iotlb invalidation perf measurement. make -C tools/testing/selftests/dma/ echo dma_map_benchmark | sudo tee /sys/bus/pci/devices/0000\:83\:00.0/driver_override echo 0000:83:00.0 | sudo tee /sys/bus/pci/drivers/nvidia/unbind echo 10de 2342 | sudo tee /sys/bus/pci/drivers/dma_map_benchmark/new_id sudo tools/testing/selftests/dma/dma_map_benchmark -g 16 dma mapping benchmark: threads:1 seconds:20 node:-1 dir:BIDIRECTIONAL granule: 16 average map latency(us):0.2 standard deviation:0.1 average unmap latency(us):22.8 standard deviation:1.4 sudo tools/testing/selftests/dma/dma_map_benchmark -g 16 dma mapping benchmark: threads:1 seconds:20 node:-1 dir:BIDIRECTIONAL granule: 16 average map latency(us):0.0 standard deviation:0.0 average unmap latency(us):4.7 standard deviation:0.4 Signed-off-by: Nicolin Chen Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs
---
 arch/arm64/configs/defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 879ecd19725cf..666dc2a3d8e8f 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1658,3 +1658,4 @@ CONFIG_IOMMUFD_DRIVER=y
 CONFIG_IOMMUFD=y
 CONFIG_IOMMUFD_TEST=y
 CONFIG_IOMMUFD_VFIO_CONTAINER=y
+CONFIG_DMA_MAP_BENCHMARK=y

From 2c287915ba8a69c79a7a5bfffbba85ba26c6fef5 Mon Sep 17 00:00:00 2001
From: Nicolin Chen
Date: Thu, 29 Feb 2024 21:04:25 +0000
Subject: [PATCH 325/352] cover-letter: Add CMDQV support

Signed-off-by: Nicolin Chen
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs

From d12647e03b52fa6a15ff3919d184447bb9b13420 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Wed, 14 Aug 2024 11:36:21 +0000
Subject: [PATCH 326/352] vfio/nvgrace-gpu: Read dvsec register to determine need for uncached resmem

NVIDIA's recently introduced Grace Blackwell (GB) Superchip is a
continuation of the Grace Hopper (GH) superchip and provides cache
coherent access for the CPU and GPU to each other's memory over an
internal proprietary chip-to-chip cache coherent interconnect.

A HW defect on GH systems related to the Multi-Instance GPU (MIG)
feature [1] necessitated the presence of a 1G region with uncached
mapping carved out from the device memory. The 1G region is shown as
a fake BAR (comprising region 2 and 3) to work around the issue. This
is fixed on the GB systems.

The presence of the fix for the HW defect is communicated by the
device firmware through the DVSEC PCI config register with ID 3. The
module reads this to take a different codepath on GB vs GH.

Scan through the DVSEC registers to identify the correct one and use
it to determine the presence of the fix. Save the value in the
device's nvgrace_gpu_pci_core_device structure.

[1] https://www.nvidia.com/en-in/technologies/multi-instance-gpu/

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 30 +++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 0853c66989948..63f8c96c4a081 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -35,6 +35,11 @@ struct h_node {
 /* A hardwired and constant ABI value between the GPU FW and VFIO driver. */
 #define MEMBLK_SIZE SZ_512M

+#define DVSEC_BITMAP_OFFSET 0xA
+#define MIG_SUPPORTED_WITH_CACHED_RESMEM BIT(0)
+
+#define GPU_CAP_DVSEC_REGISTER 3
+
 /*
  * The state of the two device memory region - resmem and usemem - is
  * saved as struct mem_region.
@@ -62,6 +67,7 @@ struct nvgrace_gpu_pci_core_device {
 	struct mem_region resmem;
 	/* Lock to control device memory kernel mapping */
 	struct mutex remap_lock;
+	bool has_mig_hw_bug_fix;
 };

 #ifdef CONFIG_MEMORY_FAILURE
@@ -932,6 +938,26 @@ nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev,
 	return ret;
 }

+static bool nvgrace_gpu_has_mig_hw_bug_fix(struct pci_dev *pdev)
+{
+	int pcie_dvsec;
+	u16 dvsec_ctrl16;
+
+	pcie_dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_NVIDIA,
+					       GPU_CAP_DVSEC_REGISTER);
+
+	if (pcie_dvsec) {
+		pci_read_config_word(pdev,
+				     pcie_dvsec + DVSEC_BITMAP_OFFSET,
+				     &dvsec_ctrl16);
+
+		if (dvsec_ctrl16 & MIG_SUPPORTED_WITH_CACHED_RESMEM)
+			return true;
+	}
+
+	return false;
+}
+
 static int nvgrace_gpu_probe(struct pci_dev *pdev,
 			     const struct pci_device_id *id)
 {
@@ -952,6 +978,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	dev_set_drvdata(&pdev->dev, &nvdev->core_device);

 	if (ops == &nvgrace_gpu_pci_ops) {
+		nvdev->has_mig_hw_bug_fix = nvgrace_gpu_has_mig_hw_bug_fix(pdev);
+
 		/*
 		 * Device memory properties are identified in the host ACPI
 		 * table. Set the nvgrace_gpu_pci_core_device structure.
@@ -1011,6 +1039,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = {
 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) },
 	/* GH200 480GB */
 	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) },
+	/* GB200 SKU */
+	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) },
 	{}
 };

From dd0d8e1f402330fe2114390443235491dcd66477 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Wed, 14 Aug 2024 11:42:13 +0000
Subject: [PATCH 327/352] vfio/nvgrace-gpu: Expose the Blackwell device PF BAR1 to the VM

A HW defect on Grace Hopper (GH) related to the Multi-Instance GPU
(MIG) feature [1] necessitated the presence of a 1G region carved out
from the device memory and mapped as uncached. The 1G region is shown
as a fake BAR (comprising region 2 and 3) to work around the issue.

The Grace Blackwell systems (GB) differ from GH systems in the
following aspects:
1. The aforementioned HW defect is fixed on GB systems.
2. There is a usable BAR1 (region 2 and 3) on GB systems for the
GPUdirect RDMA feature [2].

This patch accommodates those GB changes by showing the 64b physical
device BAR1 (region 2 and 3) to the VM instead of the fake one. This
takes care of both the differences.

Moreover, the entire device memory is exposed on GB as cacheable to
the VM as there is no carveout required.

[1] https://www.nvidia.com/en-in/technologies/multi-instance-gpu/
[2] https://docs.nvidia.com/cuda/gpudirect-rdma/

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 43 +++++++++++++++++++++--------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 63f8c96c4a081..88d65f0548825 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -138,7 +138,8 @@ static vm_fault_t nvgrace_gpu_vfio_pci_fault(struct vm_fault *vmf)
 	/*
 	 * Check if the page is poisoned.
*/ - if (mem_offset < (nvdev->resmem.memlength >> PAGE_SHIFT)) { + if (!nvdev->has_mig_hw_bug_fix && + (mem_offset < (nvdev->resmem.memlength >> PAGE_SHIFT))) { hash_for_each_possible(nvdev->resmem.htbl, cur, node, mem_offset) { if (cur->mem_offset == mem_offset) return VM_FAULT_HWPOISON; @@ -179,7 +180,7 @@ nvgrace_gpu_memregion(int index, if (index == USEMEM_REGION_INDEX) return &nvdev->usemem; - if (index == RESMEM_REGION_INDEX) + if (!nvdev->has_mig_hw_bug_fix && index == RESMEM_REGION_INDEX) return &nvdev->resmem; return NULL; @@ -229,7 +230,8 @@ static void nvgrace_gpu_close_device(struct vfio_device *core_vdev) mutex_destroy(&nvdev->remap_lock); #ifdef CONFIG_MEMORY_FAILURE - unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); + if (!nvdev->has_mig_hw_bug_fix) + unregister_pfn_address_space(&nvdev->resmem.pfn_address_space); unregister_pfn_address_space(&nvdev->usemem.pfn_address_space); #endif vfio_pci_core_close_device(core_vdev); @@ -310,7 +312,7 @@ static int nvgrace_gpu_mmap(struct vfio_device *core_vdev, #ifdef CONFIG_MEMORY_FAILURE vma->vm_ops = &nvgrace_gpu_vfio_pci_mmap_ops; - if (index == VFIO_PCI_BAR2_REGION_INDEX) + if (!nvdev->has_mig_hw_bug_fix && index == VFIO_PCI_BAR2_REGION_INDEX) ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->resmem, vma); else ret = nvgrace_gpu_vfio_pci_register_pfn_range(&nvdev->usemem, vma); @@ -835,6 +837,16 @@ static const struct vfio_device_ops nvgrace_gpu_pci_core_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static void +nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, + struct nvgrace_gpu_pci_core_device *nvdev, + u64 memphys, u64 memlength) +{ + nvdev->usemem.memphys = memphys; + nvdev->usemem.memlength = memlength; + nvdev->usemem.bar_size = roundup_pow_of_two(nvdev->usemem.memlength); +} + static int nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, u64 *pmemphys, u64 *pmemlength) @@ -872,9 +884,9 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, } static int -nvgrace_gpu_init_nvdev_struct(struct pci_dev *pdev, - struct nvgrace_gpu_pci_core_device *nvdev, - u64 memphys, u64 memlength) +nvgrace_gpu_init_nvdev_struct_war(struct pci_dev *pdev, + struct nvgrace_gpu_pci_core_device *nvdev, + u64 memphys, u64 memlength) { int ret = 0; @@ -984,10 +996,16 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, * Device memory properties are identified in the host ACPI * table. Set the nvgrace_gpu_pci_core_device structure. */ - ret = nvgrace_gpu_init_nvdev_struct(pdev, nvdev, - memphys, memlength); - if (ret) - goto out_put_vdev; + if (nvdev->has_mig_hw_bug_fix) { + nvgrace_gpu_init_nvdev_struct(pdev, nvdev, + memphys, memlength); + } else { + ret = nvgrace_gpu_init_nvdev_struct_war(pdev, nvdev, + memphys, + memlength); + if (ret) + goto out_put_vdev; + } } ret = vfio_pci_core_register_device(&nvdev->core_device); @@ -998,7 +1016,8 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, /* * Initialize the hashtable tracking the poisoned pages. */ - hash_init(nvdev->resmem.htbl); + if (!nvdev->has_mig_hw_bug_fix) + hash_init(nvdev->resmem.htbl); hash_init(nvdev->usemem.htbl); #endif return ret; From 093c583a0be3d5d24ba9c2435543c7c9a6783c12 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Wed, 14 Aug 2024 11:42:55 +0000 Subject: [PATCH 328/352] vfio/nvgrace-gpu: Check the HBM training and C2C link status In contrast to Grace Hopper systems, the HBM training has been moved out of the UEFI on the Grace Blackwell systems. This reduces the system bootup time significantly. 
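As a bring-up cross-check, the two BAR0 status registers used below (the C2C link status at 0x1498 and the HBM training status at 0x200BC, each reading 0xFF when ready) can also be inspected from user space via sysfs. A minimal sketch, for illustration only; the BDF and the map length are assumptions and not part of this patch:

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* Placeholder BDF; offsets mirror the driver defines below. */
	int fd = open("/sys/bus/pci/devices/0000:01:00.0/resource0", O_RDONLY);
	volatile uint32_t *bar0;

	if (fd < 0)
		return 1;
	/* Map enough of BAR0 to cover both status registers. */
	bar0 = mmap(NULL, 0x21000, PROT_READ, MAP_SHARED, fd, 0);
	if (bar0 == MAP_FAILED)
		return 1;
	printf("C2C link: %s, HBM training: %s\n",
	       bar0[0x1498 / 4] == 0xFF ? "ready" : "not ready",
	       bar0[0x200BC / 4] == 0xFF ? "ready" : "not ready");
	munmap((void *)bar0, 0x21000);
	close(fd);
	return 0;
}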
The onus of checking whether the HBM training has completed thus falls
on the module. The HBM training status can be determined from a BAR0
register.

Similarly, another BAR0 register exposes the status of the CPU-GPU
chip-to-chip (C2C) cache coherent interconnect.

Based on testing, 30s is determined to be sufficient to ensure
initialization completion on all Grace-based systems. Thus poll these
registers for up to 30s. If the HBM training is not complete or if the
C2C link is not ready, fail the probe.

While the wait is not required on Grace Hopper systems, it is
beneficial to make the check to ensure the device is in an expected
state. Hence the check is kept generalized across both generations.

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/nvgrace-gpu/main.c | 53 +++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index 88d65f0548825..2ef2e37c5cb65 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include

 #ifdef CONFIG_MEMORY_FAILURE
 #include
@@ -40,6 +41,13 @@ struct h_node {

 #define GPU_CAP_DVSEC_REGISTER 3

+#define C2C_LINK_BAR0_OFFSET 0x1498
+#define HBM_TRAINING_BAR0_OFFSET 0x200BC
+#define STATUS_READY 0xFF
+
+#define POLL_QUANTUM_MS 1000
+#define POLL_TIMEOUT_MS (30 * 1000)
+
 /*
  * The state of the two device memory region - resmem and usemem - is
  * saved as struct mem_region.
@@ -970,6 +978,47 @@ static bool nvgrace_gpu_has_mig_hw_bug_fix(struct pci_dev *pdev)
 	return false;
 }

+/*
+ * To reduce the system bootup time, the HBM training has
+ * been moved out of the UEFI on the Grace-Blackwell systems.
+ *
+ * The onus of checking whether the HBM training has completed
+ * thus falls on the module. The HBM training status can be
+ * determined from a BAR0 register.
+ *
+ * Similarly, another BAR0 register exposes the status of the
+ * CPU-GPU chip-to-chip (C2C) cache coherent interconnect.
+ *
+ * Poll these registers for up to 30s. If the HBM training is
+ * not complete or if the C2C link is not ready, fail the probe.
+ *
+ * While the wait is not required on Grace Hopper systems, it
+ * is beneficial to make the check to ensure the device is in an
+ * expected state.
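+ *
+ * (POLL_QUANTUM_MS and POLL_TIMEOUT_MS bound this at roughly
+ * thirty one-second polls before the probe is failed.)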
+ */
+static int nvgrace_gpu_check_device_status(struct pci_dev *pdev)
+{
+	void __iomem *io;
+	int time_elapsed;
+
+	io = pci_iomap(pdev, 0, ~0UL);
+	if (!io)
+		return -ENOMEM;
+
+	for (time_elapsed = 0; time_elapsed < POLL_TIMEOUT_MS;
+	     time_elapsed += POLL_QUANTUM_MS) {
+		if ((ioread32(io + C2C_LINK_BAR0_OFFSET) == STATUS_READY) &&
+		    (ioread32(io + HBM_TRAINING_BAR0_OFFSET) == STATUS_READY)) {
+			pci_iounmap(pdev, io);
+			return 0;
+		}
+		msleep(POLL_QUANTUM_MS);
+	}
+
+	pci_iounmap(pdev, io);
+	return -ENODEV;
+}
+
 static int nvgrace_gpu_probe(struct pci_dev *pdev,
 			     const struct pci_device_id *id)
 {
@@ -978,6 +1027,10 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev,
 	u64 memphys, memlength;
 	int ret;

+	ret = nvgrace_gpu_check_device_status(pdev);
+	if (ret)
+		return ret;
+
 	ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength);
 	if (!ret)
 		ops = &nvgrace_gpu_pci_ops;

From f104174078bf7811ca2ac195eacb03977fb78c0d Mon Sep 17 00:00:00 2001
From: "ankita@nvidia.com"
Date: Wed, 14 Aug 2024 03:43:49 +0000
Subject: [PATCH 329/352] cover-letter: vfio/nvgrace-gpu: Enable grace blackwell boards

NVIDIA's recently introduced Grace Blackwell (GB) Superchip is a
continuation of the Grace Hopper (GH) superchip and provides cache
coherent access for the CPU and GPU to each other's memory over an
internal proprietary chip-to-chip (C2C) cache coherent interconnect.

The in-tree nvgrace-gpu driver manages the GH devices. The intention
is to extend the support to the new Grace Blackwell boards.

A HW defect on GH related to the Multi-Instance GPU (MIG) feature [1]
necessitated the presence of a 1G region carved out from the device
memory and mapped uncached. The 1G region is shown as a fake BAR
(comprising region 2 and 3) to work around the issue.

The GB systems differ from GH systems in the following aspects.
1. The aforementioned HW defect is fixed on GB systems.
2. There is a usable BAR1 (region 2 and 3) on GB systems for the
GPUdirect RDMA feature [2].

This patch series accommodates those GB changes by showing the real
physical device BAR1 (region 2 and 3) to the VM instead of the fake
one. This takes care of both the differences.

The presence of the fix for the HW defect is communicated by the
firmware through a DVSEC PCI config register. The module reads this
to take a different codepath on GB vs GH.

To improve system bootup time, HBM training is moved out of UEFI on
GB systems. Poll for the register indicating the training state. Also
check that the C2C link status is ready. Fail the probe if either
fails.

[1] https://www.nvidia.com/en-in/technologies/multi-instance-gpu/
[2] https://docs.nvidia.com/cuda/gpudirect-rdma/

Applied over v6.10-rc7.

Signed-off-by: Ankit Agrawal

Ankit Agrawal (3):
  vfio/nvgrace-gpu: Read dvsec register to determine need for uncached resmem
  vfio/nvgrace-gpu: Expose the Blackwell device PF BAR1 to the VM
  vfio/nvgrace-gpu: Check the HBM training and C2C link status

 drivers/vfio/pci/nvgrace-gpu/main.c | 115 ++++++++++++++++++++++++++--
 1 file changed, 107 insertions(+), 8 deletions(-)

--
2.34.1

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R.
Ochs

From 12648ff432419e487015281ac453cb8e81e5e36b Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Thu, 29 Aug 2024 08:15:39 +0000
Subject: [PATCH 330/352] KVM: arm64: Allow exec fault on memory mapped cacheable in VMA

When the Grace Hopper/Blackwell system is set up with EGM mode in
virtualization, the system memory is partitioned into two: a Host OS
visible memory and a second EGM region that is not added to the host
OS. The EGM region is assigned to the VM as its system memory with
the QEMU VMA mapped through remap_pfn_range.

Currently KVM sets up the stage-2 mapping with device properties for
memory that is not added to the kernel. It thus does not allow
support for execution fault on such regions. Since the EGM memory is
mapped through remap_pfn_range and not added to the kernel, such
memory is set up without execution fault support.

This patch intends to update the KVM behavior. It is an extension of
the proposal [1] to make KVM determine whether a region should have
NORMAL memory properties based on the VMA pgprot. The KVM behavior is
changed to set up a region with support for executable fault if and
only if its VMA is mapped cacheable.

The EGM memory is NORMAL system memory that is not added to the
kernel. It is safe in terms of execution fault and is expected to
display all properties of NORMAL memory. The patch enables this use
case.

Check the QEMU VMA pgprot to determine whether it is mapped as Normal
cacheable memory and allow exec fault in that case.

Link: https://lore.kernel.org/lkml/20230907181459.18145-2-ankita@nvidia.com [1]

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 arch/arm64/kvm/mmu.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index f727ea1ceb09c..f6e519a6e24e3 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -1392,6 +1392,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 	bool exec_fault, mte_allowed;
 	bool device = false, vfio_allow_any_uc = false;
 	unsigned long mmu_seq;
+	unsigned long mt;
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
 	struct vm_area_struct *vma;
@@ -1483,6 +1484,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,

 	vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;

+	mt = mapping_type(vma->vm_page_prot);
+
 	/*
 	 * Figure out the memory type based on the user va mapping properties
 	 * Only MT_DEVICE_nGnRE and MT_DEVICE_nGnRnE will be set using
@@ -1542,7 +1545,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
 		writable = false;
 	}

-	if (exec_fault && device)
+	if (exec_fault && device && mt != MT_NORMAL)
 		return -ENOEXEC;

 	read_lock(&kvm->mmu_lock);

From d4f9dd80bb46e2308b9c5beb24c3e4104c76f6b9 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Thu, 29 Aug 2024 08:15:40 +0000
Subject: [PATCH 331/352] vfio/nvgrace-egm: Introduce module to manage EGM

The Extended GPU Memory (EGM) feature enables GPU access to the
system memory across sockets and nodes. In this mode, the physical
memory can be allocated for GPU usage from anywhere in a multi-node
system.

The feature is being extended to virtualization. When EGM is enabled
in the virtualization stack, the host memory is partitioned in two:
one partition for the Host OS usage, and a second EGM region. The EGM
region essentially becomes the system memory of the VM.
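For orientation, a user-space consumer of the char device introduced here would look roughly like the sketch below; QEMU itself consumes the device as a memory-backend-file for the guest RAM. The device name /dev/egm0 and the mapping length are illustrative assumptions:

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* /dev/egm0: EGM char device for PXM node 0 (illustrative name). */
	int fd = open("/dev/egm0", O_RDWR);
	size_t len = 1UL << 30;	/* must not exceed the EGM region size */
	void *mem;

	if (fd < 0)
		return 1;
	/* The first open() zeroes the entire EGM region in the kernel. */
	mem = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (mem == MAP_FAILED)
		return 1;
	/* The VMA is now linearly backed by EGM HPAs via remap_pfn_range(). */
	munmap(mem, len);
	close(fd);
	return 0;
}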
The following figure shows the memory map in the virtualization
environment.

 |---- Sysmem ----|          |--- GPU mem ---|     VM Memory Map
 |                |          |               |
 |                |          |               |
 |------ EGM -----|--Host Mem----|   |--- GPU mem ---|   Host Memory Map

The EGM region is not available to the host memory for its usage as
it is not added to the kernel. Its base HPA and length are
communicated through the DSDT entries.

A linear mapping between the VM IPA and system HPA is a requirement
for EGM support. The EGM region is thus assigned to a VM by mapping
the QEMU VMA to a linearly increasing HPA of the EGM region using
remap_pfn_range().

Introduce a new nvgrace-egm helper module to nvgrace-gpu to manage
the EGM/VM region for the VM. The nvgrace-egm module handles the
following:
1. Fetch the EGM memory properties (base HPA, length, proximity domain).
2. Create a char device that can be used as memory-backend-file by
QEMU for the VM and implement file operations. The char device is
/dev/egmX, where X is the PXM node ID of the EGM being mapped,
fetched in 1.
3. Zero the EGM memory on first device open().
4. Map the QEMU VMA to the EGM region using remap_pfn_range.
5. Clean up state and destroy the chardev on device unbind.

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/nvgrace-gpu/Kconfig  |  11 ++
 drivers/vfio/pci/nvgrace-gpu/Makefile |   3 +
 drivers/vfio/pci/nvgrace-gpu/egm.c    | 235 ++++++++++++++++++++++++++
 drivers/vfio/pci/nvgrace-gpu/egm.h    |  12 ++
 drivers/vfio/pci/nvgrace-gpu/main.c   |  35 +++-
 5 files changed, 292 insertions(+), 4 deletions(-)
 create mode 100644 drivers/vfio/pci/nvgrace-gpu/egm.c
 create mode 100644 drivers/vfio/pci/nvgrace-gpu/egm.h

diff --git a/drivers/vfio/pci/nvgrace-gpu/Kconfig b/drivers/vfio/pci/nvgrace-gpu/Kconfig
index a7f624b37e410..d5773bbd22f5e 100644
--- a/drivers/vfio/pci/nvgrace-gpu/Kconfig
+++ b/drivers/vfio/pci/nvgrace-gpu/Kconfig
@@ -1,8 +1,19 @@
 # SPDX-License-Identifier: GPL-2.0-only
+config NVGRACE_EGM
+	tristate "EGM driver for NVIDIA Grace Hopper and Blackwell Superchip"
+	depends on ARM64 || (COMPILE_TEST && 64BIT)
+	help
+	  Extended GPU Memory (EGM) support for the GPU in the NVIDIA Grace
+	  based chips required to avail the CPU memory as additional
+	  cross-node/cross-socket memory for GPU using KVM/qemu.
+
+	  If you don't know what to do here, say N.
+
 config NVGRACE_GPU_VFIO_PCI
 	tristate "VFIO support for the GPU in the NVIDIA Grace Hopper Superchip"
 	depends on ARM64 || (COMPILE_TEST && 64BIT)
 	select VFIO_PCI_CORE
+	select NVGRACE_EGM
 	help
 	  VFIO support for the GPU in the NVIDIA Grace Hopper Superchip is
 	  required to assign the GPU device to userspace using KVM/qemu/etc.

diff --git a/drivers/vfio/pci/nvgrace-gpu/Makefile b/drivers/vfio/pci/nvgrace-gpu/Makefile
index 3ca8c187897a9..c99b04a94e770 100644
--- a/drivers/vfio/pci/nvgrace-gpu/Makefile
+++ b/drivers/vfio/pci/nvgrace-gpu/Makefile
@@ -1,3 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_NVGRACE_GPU_VFIO_PCI) += nvgrace-gpu-vfio-pci.o
 nvgrace-gpu-vfio-pci-y := main.o
+
+obj-$(CONFIG_NVGRACE_EGM) += nvgrace-egm.o
+nvgrace-egm-y := egm.o

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
new file mode 100644
index 0000000000000..f3c22a9dfecb9
--- /dev/null
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -0,0 +1,235 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
All rights reserved + */ + +#include +#include "egm.h" + +#define MAX_EGM_NODES 256 + +struct egm_region { + struct list_head list; + int egmpxm; + atomic_t open_count; + phys_addr_t egmphys; + size_t egmlength; + struct device device; + struct cdev cdev; +}; + +static dev_t dev; +static struct class *class; +static struct list_head egm_list; + +static int nvgrace_egm_open(struct inode *inode, struct file *file) +{ + void *memaddr; + struct egm_region *region = container_of(inode->i_cdev, + struct egm_region, cdev); + + if (!region) + return -EINVAL; + + if (atomic_inc_return(®ion->open_count) > 1) + return 0; + + memaddr = memremap(region->egmphys, region->egmlength, MEMREMAP_WB); + if (!memaddr) { + atomic_dec(®ion->open_count); + return -EINVAL; + } + + memset((u8 *)memaddr, 0, region->egmlength); + memunmap(memaddr); + file->private_data = region; + + return 0; +} + +static int nvgrace_egm_release(struct inode *inode, struct file *file) +{ + struct egm_region *region = container_of(inode->i_cdev, + struct egm_region, cdev); + + if (!region) + return -EINVAL; + + if (atomic_dec_and_test(®ion->open_count)) + file->private_data = NULL; + + return 0; +} + +static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma) +{ + int ret = 0; + struct egm_region *region = file->private_data; + + if (!region) + return -EINVAL; + + ret = remap_pfn_range(vma, vma->vm_start, + PHYS_PFN(region->egmphys), + (vma->vm_end - vma->vm_start), + vma->vm_page_prot); + return ret; +} + +static const struct file_operations file_ops = { + .owner = THIS_MODULE, + .open = nvgrace_egm_open, + .release = nvgrace_egm_release, + .mmap = nvgrace_egm_mmap, +}; + +static int setup_egm_chardev(struct egm_region *region) +{ + int ret = 0; + + device_initialize(®ion->device); + + /* + * Use the proximity domain number as the device minor + * number. So the EGM corresponding to node X would be + * /dev/egmX. + */ + region->device.devt = MKDEV(MAJOR(dev), region->egmpxm); + region->device.class = class; + cdev_init(®ion->cdev, &file_ops); + region->cdev.owner = THIS_MODULE; + + ret = dev_set_name(®ion->device, "egm%d", region->egmpxm); + if (ret) + return ret; + + ret = cdev_device_add(®ion->cdev, ®ion->device); + + return ret; +} + +static int +nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys, + u64 *pegmlength, u64 *pegmpxm) +{ + int ret; + + /* + * The memory information is present in the system ACPI tables as DSD + * properties nvidia,egm-base-pa and nvidia,egmm-size. 
+ */ + ret = device_property_read_u64(&pdev->dev, "nvidia,egm-size", + pegmlength); + if (ret) + return ret; + + if (*pegmlength > type_max(size_t)) + return -EOVERFLOW; + + ret = device_property_read_u64(&pdev->dev, "nvidia,egm-base-pa", + pegmphys); + if (ret) + return ret; + + if (*pegmphys > type_max(phys_addr_t)) + return -EOVERFLOW; + + ret = device_property_read_u64(&pdev->dev, "nvidia,egm-pxm", + pegmpxm); + + if (*pegmpxm > type_max(phys_addr_t)) + return -EOVERFLOW; + + return ret; +} + +int register_egm_node(struct pci_dev *pdev) +{ + struct egm_region *region = NULL; + u64 egmphys, egmlength, egmpxm; + int ret; + + ret = nvgrace_gpu_fetch_egm_property(pdev, &egmphys, &egmlength, &egmpxm); + if (ret) + return ret; + + list_for_each_entry(region, &egm_list, list) { + if (region->egmphys == egmphys) + return 0; + } + + region = kvzalloc(sizeof(*region), GFP_KERNEL); + region->egmphys = egmphys; + region->egmlength = egmlength; + region->egmpxm = egmpxm; + + atomic_set(®ion->open_count, 0); + + list_add_tail(®ion->list, &egm_list); + + setup_egm_chardev(region); + + return 0; +} +EXPORT_SYMBOL_GPL(register_egm_node); + +static void destroy_egm_chardev(struct egm_region *region) +{ + cdev_device_del(®ion->cdev, ®ion->device); +} + +void unregister_egm_node(int egm_node) +{ + struct egm_region *region, *temp_region; + + list_for_each_entry_safe(region, temp_region, &egm_list, list) { + if (egm_node == region->egmpxm) { + destroy_egm_chardev(region); + list_del(®ion->list); + } + } +} +EXPORT_SYMBOL_GPL(unregister_egm_node); + +static char *egm_devnode(const struct device *device, umode_t *mode) +{ + if (mode) + *mode = 0600; + + return NULL; +} + +static int __init nvgrace_egm_init(void) +{ + int ret; + + ret = alloc_chrdev_region(&dev, + 0, MAX_EGM_NODES, "egm"); + if (ret < 0) + return ret; + + class = class_create("egm"); + if (IS_ERR(class)) { + unregister_chrdev_region(dev, MAX_EGM_NODES); + return PTR_ERR(class); + } + + class->devnode = egm_devnode; + + INIT_LIST_HEAD(&egm_list); + + return 0; +} + +static void __exit nvgrace_egm_cleanup(void) +{ + class_destroy(class); + unregister_chrdev_region(dev, MAX_EGM_NODES); +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Ankit Agrawal "); +MODULE_DESCRIPTION("NVGRACE EGM - Helper module of NVGRACE GPU to support Extended GPU Memory"); + +module_init(nvgrace_egm_init); +module_exit(nvgrace_egm_cleanup); diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.h b/drivers/vfio/pci/nvgrace-gpu/egm.h new file mode 100644 index 0000000000000..28cc59e04a0b0 --- /dev/null +++ b/drivers/vfio/pci/nvgrace-gpu/egm.h @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved + */ + +#ifndef NVGRACE_EGM_H +#define NVGRACE_EGM_H + +int register_egm_node(struct pci_dev *pdev); +void unregister_egm_node(int egm_node); + +#endif /* NVGRACE_EGM_H */ diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index 2ef2e37c5cb65..e648318809b5f 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -7,6 +7,7 @@ #include #include #include +#include "egm.h" #ifdef CONFIG_MEMORY_FAILURE #include @@ -76,8 +77,11 @@ struct nvgrace_gpu_pci_core_device { /* Lock to control device memory kernel mapping */ struct mutex remap_lock; bool has_mig_hw_bug_fix; + int egm_node; }; +static bool egm_enabled; + #ifdef CONFIG_MEMORY_FAILURE static void nvgrace_gpu_vfio_pci_pfn_memory_failure(struct pfn_address_space *pfn_space, @@ -891,6 +895,13 @@ nvgrace_gpu_fetch_memory_property(struct pci_dev *pdev, return ret; } +static int +nvgrace_gpu_has_egm_property(struct pci_dev *pdev, u64 *pegmpxm) +{ + return device_property_read_u64(&pdev->dev, "nvidia,egm-pxm", + pegmpxm); +} + static int nvgrace_gpu_init_nvdev_struct_war(struct pci_dev *pdev, struct nvgrace_gpu_pci_core_device *nvdev, @@ -1025,6 +1036,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, const struct vfio_device_ops *ops = &nvgrace_gpu_pci_core_ops; struct nvgrace_gpu_pci_core_device *nvdev; u64 memphys, memlength; + u64 egmpxm; int ret; ret = nvgrace_gpu_check_device_status(pdev); @@ -1032,9 +1044,14 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, return ret; ret = nvgrace_gpu_fetch_memory_property(pdev, &memphys, &memlength); - if (!ret) + if (!ret) { ops = &nvgrace_gpu_pci_ops; + ret = nvgrace_gpu_has_egm_property(pdev, &egmpxm); + if (!ret) + egm_enabled = true; + } + nvdev = vfio_alloc_device(nvgrace_gpu_pci_core_device, core_device.vdev, &pdev->dev, ops); if (IS_ERR(nvdev)) @@ -1059,6 +1076,12 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, if (ret) goto out_put_vdev; } + + if (egm_enabled) { + register_egm_node(pdev); + nvdev->egm_node = egmpxm; + } + } ret = vfio_pci_core_register_device(&nvdev->core_device); @@ -1073,6 +1096,7 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, hash_init(nvdev->resmem.htbl); hash_init(nvdev->usemem.htbl); #endif + return ret; out_put_vdev: @@ -1083,14 +1107,14 @@ static int nvgrace_gpu_probe(struct pci_dev *pdev, static void nvgrace_gpu_remove(struct pci_dev *pdev) { struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev); + struct nvgrace_gpu_pci_core_device *nvdev = + container_of(core_device, struct nvgrace_gpu_pci_core_device, + core_device); #ifdef CONFIG_MEMORY_FAILURE struct h_node *cur; unsigned long bkt; struct hlist_node *tmp_node; - struct nvgrace_gpu_pci_core_device *nvdev = - container_of(core_device, struct nvgrace_gpu_pci_core_device, - core_device); hash_for_each_safe(nvdev->resmem.htbl, bkt, tmp_node, cur, node) { hash_del(&cur->node); vfree(cur); @@ -1102,6 +1126,9 @@ static void nvgrace_gpu_remove(struct pci_dev *pdev) } #endif + if (egm_enabled) + unregister_egm_node(nvdev->egm_node); + vfio_pci_core_unregister_device(core_device); vfio_put_device(&core_device->vdev); } From 630aa885aff2204a3e47e9edafc32ecddcafcf82 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 29 Aug 2024 08:15:41 +0000 Subject: [PATCH 332/352] vfio/nvgrace-egm: Handle pages with ECC errors on the EGM It is possible for some system memory pages on the EGM to have uncorrectable ECC errors. 
A list of pages known to have such errors (referred to as retired
pages) is maintained by the Host UEFI. The Host UEFI populates this
list in a reserved region. It communicates the SPA of this region
through an ACPI DSDT property.

The nvgrace-egm module is responsible for storing the list of retired
page offsets to be made available to usermode processes. The module:
1. Gets the reserved memory region SPA and maps it to fetch the list
of bad pages.
2. Calculates the retired page offsets in the EGM and stores them.
3. Exposes an ioctl to allow querying of the offsets.

The ioctl is called by usermode apps such as QEMU to get the retired
page offsets. The usermode apps are expected to take appropriate
action to communicate the list to the VM.

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 126 +++++++++++++++++++++++++++++
 include/uapi/linux/egm.h           |  26 ++++++
 2 files changed, 152 insertions(+)
 create mode 100644 include/uapi/linux/egm.h

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index f3c22a9dfecb9..8c9ff6313e9f4 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -4,6 +4,8 @@
  */

 #include
+#include
+#include
 #include "egm.h"

 #define MAX_EGM_NODES 256
@@ -16,6 +18,12 @@ struct egm_region {
 	size_t egmlength;
 	struct device device;
 	struct cdev cdev;
+	DECLARE_HASHTABLE(htbl, 0x10);
+};
+
+struct h_node {
+	unsigned long mem_offset;
+	struct hlist_node node;
 };

 static dev_t dev;
@@ -76,11 +84,80 @@ static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma)
 	return ret;
 }

+static long nvgrace_egm_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	unsigned long minsz = offsetofend(struct egm_bad_pages_list, count);
+	struct egm_bad_pages_list info;
+	void __user *uarg = (void __user *)arg;
+	struct egm_region *region = file->private_data;
+
+	if (copy_from_user(&info, uarg, minsz))
+		return -EFAULT;
+
+	if (info.argsz < minsz)
+		return -EINVAL;
+
+	if (!region)
+		return -EINVAL;
+
+	switch (cmd) {
+	case EGM_BAD_PAGES_LIST:
+		int ret;
+		unsigned long bad_page_struct_size = sizeof(struct egm_bad_pages_info);
+		struct egm_bad_pages_info tmp;
+		struct h_node *cur_page;
+		struct hlist_node *tmp_node;
+		unsigned long bkt;
+		int count = 0, index = 0;
+
+		hash_for_each_safe(region->htbl, bkt, tmp_node, cur_page, node)
+			count++;
+
+		if (info.argsz < (minsz + count * bad_page_struct_size)) {
+			info.argsz = minsz + count * bad_page_struct_size;
+			info.count = 0;
+			goto done;
+		} else {
+			hash_for_each_safe(region->htbl, bkt, tmp_node, cur_page, node) {
+				/*
+				 * This check fails if there was an ECC error
+				 * after the usermode app read the count of
+				 * bad pages through this ioctl.
+				 */
+				if (minsz + index * bad_page_struct_size >= info.argsz) {
+					info.argsz = minsz + index * bad_page_struct_size;
+					info.count = index;
+					goto done;
+				}
+
+				tmp.offset = cur_page->mem_offset;
+				tmp.size = PAGE_SIZE;
+
+				ret = copy_to_user(uarg + minsz +
+						   index * bad_page_struct_size,
+						   &tmp, bad_page_struct_size);
+				if (ret)
+					return -EFAULT;
+				index++;
+			}
+
+			info.count = index;
+		}
+		break;
+	default:
+		return -EINVAL;
+	}
+
+done:
+	return copy_to_user(uarg, &info, minsz) ?
-EFAULT : 0; +} + static const struct file_operations file_ops = { .owner = THIS_MODULE, .open = nvgrace_egm_open, .release = nvgrace_egm_release, .mmap = nvgrace_egm_mmap, + .unlocked_ioctl = nvgrace_egm_ioctl, }; static int setup_egm_chardev(struct egm_region *region) @@ -143,6 +220,45 @@ nvgrace_gpu_fetch_egm_property(struct pci_dev *pdev, u64 *pegmphys, return ret; } +static void nvgrace_egm_fetch_bad_pages(struct pci_dev *pdev, + struct egm_region *region) +{ + u64 retiredpagesphys, count; + void *memaddr; + int index; + + if (device_property_read_u64(&pdev->dev, + "nvidia,egm-retired-pages-data-base", + &retiredpagesphys)) + return; + + memaddr = memremap(retiredpagesphys, PAGE_SIZE, MEMREMAP_WB); + if (!memaddr) + return; + + count = *(u64 *)memaddr; + + hash_init(region->htbl); + + for (index = 0; index < count; index++) { + struct h_node *retired_page; + + /* + * Since the EGM is linearly mapped, the offset in the + * carveout is the same offset in the VM system memory. + * + * Calculate the offset to communicate to the usermode + * apps. + */ + retired_page = (struct h_node *)(vzalloc(sizeof(struct h_node))); + retired_page->mem_offset = *((u64 *)memaddr + index + 1) - + region->egmphys; + hash_add(region->htbl, &retired_page->node, retired_page->mem_offset); + } + + memunmap(memaddr); +} + int register_egm_node(struct pci_dev *pdev) { struct egm_region *region = NULL; @@ -165,6 +281,8 @@ int register_egm_node(struct pci_dev *pdev) atomic_set(®ion->open_count, 0); + nvgrace_egm_fetch_bad_pages(pdev, region); + list_add_tail(®ion->list, &egm_list); setup_egm_chardev(region); @@ -181,9 +299,17 @@ static void destroy_egm_chardev(struct egm_region *region) void unregister_egm_node(int egm_node) { struct egm_region *region, *temp_region; + struct h_node *cur_page; + unsigned long bkt; + struct hlist_node *temp_node; list_for_each_entry_safe(region, temp_region, &egm_list, list) { if (egm_node == region->egmpxm) { + hash_for_each_safe(region->htbl, bkt, temp_node, cur_page, node) { + hash_del(&cur_page->node); + vfree(cur_page); + } + destroy_egm_chardev(region); list_del(®ion->list); } diff --git a/include/uapi/linux/egm.h b/include/uapi/linux/egm.h new file mode 100644 index 0000000000000..8a808e45c2052 --- /dev/null +++ b/include/uapi/linux/egm.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _UAPIEGM_H +#define _UAPIEGM_H + +#define EGM_TYPE ('E') + +struct egm_bad_pages_info { + __aligned_u64 offset; + __aligned_u64 size; +}; + +struct egm_bad_pages_list { + __u32 argsz; + /* out */ + __u32 count; + /* out */ + struct egm_bad_pages_info bad_pages[]; +}; + +#define EGM_BAD_PAGES_LIST _IO(EGM_TYPE, 100) + +#endif /* _UAPIEGM_H */ From d25d8710e6686928ade4178efaad78b39e4b7221 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Thu, 29 Aug 2024 08:15:42 +0000 Subject: [PATCH 333/352] vfio/nvgrace-egm: Register EGM for runtime ECC poison errors handling The Extended GPU Memory (EGM) is mapped through remap_pfn_range() and is not backed by struct pages. Currently, memory_failure() on such region is unsupported in kernel MM. There is a proposal to handle such memory region [1]. The implementation exports APIs to register a memory region and a corresponding callback function with the kernel MM. On the occurrence of memory failure on the registered region, kernel MM calls the callback to communicate the faulting PFN. 
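For reference, user space (e.g. QEMU) retrieves the accumulated list through the EGM_BAD_PAGES_LIST ioctl added in the previous patch, typically with a two-call argsz handshake. A hypothetical sketch; the helper name and error handling are illustrative:

#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/egm.h>

/* Query retired-page offsets from an open /dev/egmX fd (illustrative). */
static struct egm_bad_pages_list *query_bad_pages(int fd)
{
	struct egm_bad_pages_list hdr = { .argsz = sizeof(hdr) };
	struct egm_bad_pages_list *list;

	/* First call: the kernel reports the required argsz with count = 0. */
	if (ioctl(fd, EGM_BAD_PAGES_LIST, &hdr))
		return NULL;
	list = calloc(1, hdr.argsz);
	if (!list)
		return NULL;
	list->argsz = hdr.argsz;
	/* Second call: the buffer is large enough, entries get filled in. */
	if (ioctl(fd, EGM_BAD_PAGES_LIST, list)) {
		free(list);
		return NULL;
	}
	return list;	/* list->count entries of {offset, size} follow */
}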
This patch registers the EGM memory and the callback function
nvgrace_egm_pfn_memory_failure with the kernel MM. On memory failure,
nvgrace_egm_pfn_memory_failure is triggered and the nvgrace-egm
module adds the faulting PFN to the hashtable tracking retired ECC
error pages.

It also implements a fault VM op to check whether an access targets a
page known to have ECC errors, returning VM_FAULT_HWPOISON in that
case.

Link: https://lore.kernel.org/all/20231123003513.24292-1-ankita@nvidia.com/ [1]

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/nvgrace-gpu/egm.c | 88 +++++++++++++++++++++++++++++-
 1 file changed, 87 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 8c9ff6313e9f4..844e872e39f91 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -8,6 +8,11 @@
 #include
 #include "egm.h"

+#ifdef CONFIG_MEMORY_FAILURE
+#include
+#include
+#endif
+
 #define MAX_EGM_NODES 256

 struct egm_region {
@@ -19,6 +24,9 @@ struct egm_region {
 	struct device device;
 	struct cdev cdev;
 	DECLARE_HASHTABLE(htbl, 0x10);
+#ifdef CONFIG_MEMORY_FAILURE
+	struct pfn_address_space pfn_address_space;
+#endif
 };

 struct h_node {
@@ -30,6 +38,70 @@ static dev_t dev;
 static struct class *class;
 static struct list_head egm_list;

+#ifdef CONFIG_MEMORY_FAILURE
+static void
+nvgrace_egm_pfn_memory_failure(struct pfn_address_space *pfn_space,
+			       unsigned long pfn)
+{
+	struct egm_region *region =
+		container_of(pfn_space, struct egm_region, pfn_address_space);
+	unsigned long mem_offset = PFN_PHYS(pfn - pfn_space->node.start);
+	struct h_node *ecc;
+
+	if (mem_offset >= region->egmlength)
+		return;
+
+	/*
+	 * MM has called to notify a poisoned page. Track that in the
+	 * hashtable.
+	 */
+	ecc = vzalloc(sizeof(*ecc));
+	if (!ecc)
+		return;
+	ecc->mem_offset = mem_offset;
+	hash_add(region->htbl, &ecc->node, ecc->mem_offset);
+}
+
+struct pfn_address_space_ops nvgrace_egm_pas_ops = {
+	.failure = nvgrace_egm_pfn_memory_failure,
+};
+
+static int
+nvgrace_egm_register_pfn_range(struct egm_region *region,
+			       struct vm_area_struct *vma)
+{
+	unsigned long nr_pages = region->egmlength >> PAGE_SHIFT;
+
+	region->pfn_address_space.node.start = vma->vm_pgoff;
+	region->pfn_address_space.node.last = vma->vm_pgoff + nr_pages - 1;
+	region->pfn_address_space.ops = &nvgrace_egm_pas_ops;
+	region->pfn_address_space.mapping = vma->vm_file->f_mapping;
+
+	return register_pfn_address_space(&region->pfn_address_space);
+}
+
+static vm_fault_t nvgrace_egm_fault(struct vm_fault *vmf)
+{
+	unsigned long mem_offset = PFN_PHYS(vmf->pgoff - vmf->vma->vm_pgoff);
+	struct egm_region *region = vmf->vma->vm_file->private_data;
+	struct h_node *cur;
+
+	/*
+	 * Check if the page is poisoned.
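+	 * A hit in this lookup means this exact offset was recorded
+	 * earlier by the memory-failure callback above.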
+	 */
+	if (mem_offset < region->egmlength) {
+		hash_for_each_possible(region->htbl, cur, node, mem_offset) {
+			if (cur->mem_offset == mem_offset)
+				return VM_FAULT_HWPOISON;
+		}
+	}
+
+	return VM_FAULT_ERROR;
+}
+
+static const struct vm_operations_struct nvgrace_egm_mmap_ops = {
+	.fault = nvgrace_egm_fault,
+};
+
+#endif
+
 static int nvgrace_egm_open(struct inode *inode, struct file *file)
 {
 	void *memaddr;
@@ -63,8 +135,12 @@ static int nvgrace_egm_release(struct inode *inode, struct file *file)
 	if (!region)
 		return -EINVAL;

-	if (atomic_dec_and_test(&region->open_count))
+	if (atomic_dec_and_test(&region->open_count)) {
+#ifdef CONFIG_MEMORY_FAILURE
+		unregister_pfn_address_space(&region->pfn_address_space);
+#endif
 		file->private_data = NULL;
+	}

 	return 0;
 }
@@ -81,6 +157,16 @@ static int nvgrace_egm_mmap(struct file *file, struct vm_area_struct *vma)
 			       PHYS_PFN(region->egmphys),
 			       (vma->vm_end - vma->vm_start),
 			       vma->vm_page_prot);
+	if (ret)
+		return ret;
+
+	vma->vm_pgoff = PHYS_PFN(region->egmphys);
+
+#ifdef CONFIG_MEMORY_FAILURE
+	vma->vm_ops = &nvgrace_egm_mmap_ops;
+
+	ret = nvgrace_egm_register_pfn_range(region, vma);
+#endif
 	return ret;
 }

From f4361aeb6bd5a8be847207e93abe43688eed2c43 Mon Sep 17 00:00:00 2001
From: "Matthew R. Ochs"
Date: Thu, 29 Aug 2024 18:49:03 -0700
Subject: [PATCH 334/352] arm64: configs: Build CONFIG_NVGRACE_EGM as LKM

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 arch/arm64/configs/defconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig
index 666dc2a3d8e8f..4619fe2c4d2c4 100644
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -1651,6 +1651,7 @@ CONFIG_CORESIGHT_CPU_DEBUG=m
 CONFIG_CORESIGHT_CTI=m
 CONFIG_MEMTEST=y
 CONFIG_NVGRACE_GPU_VFIO_PCI=m
+CONFIG_NVGRACE_EGM=m
 CONFIG_VFIO_DEVICE_CDEV=y
 # CONFIG_VFIO_CONTAINER is not set
 CONFIG_FAULT_INJECTION=y

From 31824bdb19e8603b9fc51876a142c5840d21d0fe Mon Sep 17 00:00:00 2001
From: "ankita@nvidia.com"
Date: Thu, 29 Aug 2024 08:15:38 +0000
Subject: [PATCH 335/352] cover-letter: Add virtualization support for EGM

Grace Hopper/Blackwell systems support the Extended GPU Memory (EGM)
feature that enables the GPU to access the system memory within and
across nodes. The GPU can allocate the system memory located on the
same socket, from a different socket, or even on a different node in
a multi-node system [1].

The feature is being extended to virtualization. When EGM is enabled
in the virtualization stack, the host memory is partitioned in two:
one partition for the Host OS usage, and a second EGM region that is
assigned to the VM. The EGM region essentially becomes the system
memory of the VM.

The EGM/VM region is not available to the host memory for its usage
as it is not added to the kernel. Its base HPA and length are
communicated through the DSDT entries.

A linear mapping between the VM IPA and system HPA is a requirement
for EGM support. The EGM region is thus assigned to a VM by mapping
the QEMU VMA to a linearly increasing HPA of the EGM region using
remap_pfn_range().

Patch 1/4 changes the KVM code to allow EGM memory to be S2 mapped
with the executable flag.

Patch 2/4 introduces a new nvgrace-egm helper module to nvgrace-gpu
to manage the carveout for the VM. The module implements a char
device to expose the EGM to usermode apps such as QEMU. The module
does a linear mapping of the QEMU VMA to the EGM HPA using
remap_pfn_range.
Patch 3/4 fetches the list of pages known to have ECC errors on the
EGM memory and exposes them through an ioctl.

Patch 4/4 registers the EGM memory for ECC poison error handling.

Link: https://developer.nvidia.com/blog/nvidia-grace-hopper-superchip-architecture-in-depth/#extended_gpu_memory [1]

Ankit Agrawal (4):
  KVM: arm64: Allow exec fault on memory mapped cacheable in VMA
  vfio/nvgrace-egm: Introduce module to manage EGM
  vfio/nvgrace-egm: Handle pages with ECC errors on the EGM
  vfio/nvgrace-egm: Register EGM for runtime ECC poison errors handling

 arch/arm64/kvm/mmu.c                  |   5 +-
 drivers/vfio/pci/nvgrace-gpu/Kconfig  |  11 +
 drivers/vfio/pci/nvgrace-gpu/Makefile |   3 +
 drivers/vfio/pci/nvgrace-gpu/egm.c    | 449 ++++++++++++++++++++++++++
 drivers/vfio/pci/nvgrace-gpu/egm.h    |  12 +
 drivers/vfio/pci/nvgrace-gpu/main.c   |  35 +-
 include/uapi/linux/egm.h              |  26 ++
 7 files changed, 536 insertions(+), 5 deletions(-)
 create mode 100644 drivers/vfio/pci/nvgrace-gpu/egm.c
 create mode 100644 drivers/vfio/pci/nvgrace-gpu/egm.h
 create mode 100644 include/uapi/linux/egm.h

--
2.34.1

Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs

From 84ca99b2f0e1098150e79597934cb6660a610a31 Mon Sep 17 00:00:00 2001
From: Ankit Agrawal
Date: Sun, 13 Oct 2024 04:53:38 +0000
Subject: [PATCH 336/352] vfio/nvgrace-egm: Move the egm header file to include

nvgrace-egm exposes the APIs register_egm_node and unregister_egm_node
to manage EGM (Extended GPU Memory) present on the system. To allow
out-of-tree drivers such as nvidia-vgpu-vfio to make use of them, move
the declarations to a new nvgrace-egm.h in include.

Signed-off-by: Ankit Agrawal
Signed-off-by: Matthew R. Ochs
Acked-by: Kai-Heng Feng
Acked-by: Koba Ko
Signed-off-by: Matthew R. Ochs
---
 drivers/vfio/pci/nvgrace-gpu/egm.c  | 2 +-
 drivers/vfio/pci/nvgrace-gpu/main.c | 2 +-
 .../pci/nvgrace-gpu/egm.h => include/linux/nvgrace-egm.h | 8 ++++----
 3 files changed, 6 insertions(+), 6 deletions(-)
 rename drivers/vfio/pci/nvgrace-gpu/egm.h => include/linux/nvgrace-egm.h (55%)

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.c b/drivers/vfio/pci/nvgrace-gpu/egm.c
index 844e872e39f91..28283c03c218a 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.c
+++ b/drivers/vfio/pci/nvgrace-gpu/egm.c
@@ -6,7 +6,7 @@
 #include
 #include
 #include
-#include "egm.h"
+#include

 #ifdef CONFIG_MEMORY_FAILURE
 #include

diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c
index e648318809b5f..b2c2cb9c0d35a 100644
--- a/drivers/vfio/pci/nvgrace-gpu/main.c
+++ b/drivers/vfio/pci/nvgrace-gpu/main.c
@@ -7,7 +7,7 @@
 #include
 #include
 #include
-#include "egm.h"
+#include

 #ifdef CONFIG_MEMORY_FAILURE
 #include

diff --git a/drivers/vfio/pci/nvgrace-gpu/egm.h b/include/linux/nvgrace-egm.h
similarity index 55%
rename from drivers/vfio/pci/nvgrace-gpu/egm.h
rename to include/linux/nvgrace-egm.h
index 28cc59e04a0b0..48add892aa5bf 100644
--- a/drivers/vfio/pci/nvgrace-gpu/egm.h
+++ b/include/linux/nvgrace-egm.h
@@ -1,12 +1,12 @@
-// SPDX-License-Identifier: GPL-2.0-only
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
 /*
  * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
All rights reserved */ -#ifndef NVGRACE_EGM_H -#define NVGRACE_EGM_H +#ifndef _NVGRACE_EGM_H +#define _NVGRACE_EGM_H int register_egm_node(struct pci_dev *pdev); void unregister_egm_node(int egm_node); -#endif /* NVGRACE_EGM_H */ +#endif /* _NVGRACE_EGM_H */ From 41de139008bcf9366d991b2916ea06e47350d0f5 Mon Sep 17 00:00:00 2001 From: Ankit Agrawal Date: Sun, 13 Oct 2024 04:53:39 +0000 Subject: [PATCH 337/352] vfio/nvgrace-gpu: Add a new GH200 SKU to the devid table NVIDIA is planning to productize a new Grace Hopper superchip SKU with device ID 0x2348. Add the SKU devid to nvgrace_gpu_vfio_pci_table. Signed-off-by: Ankit Agrawal Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs --- drivers/vfio/pci/nvgrace-gpu/main.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index b2c2cb9c0d35a..5bb64827f0f55 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -1138,6 +1138,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, /* GH200 480GB */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, + /* GH200 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, /* GB200 SKU */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2941) }, {} From 22c8d056ab926526e64ae3f66cfbfb738709fdf9 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Mon, 14 Oct 2024 10:33:04 -0700 Subject: [PATCH 338/352] cover-letter: vfio/nvgrace-gpu: Enable GH SKU and migrate EGM header file Minor updates to the nvgrace-gpu-vfio-pci module to enable an additional SKU and support out-of-tree drivers using EGM services. Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. Ochs From 17cf4be42164f392cc17178f399fe28d31bbe215 Mon Sep 17 00:00:00 2001 From: "Matthew R. Ochs" Date: Tue, 16 Jul 2024 01:47:44 +0000 Subject: [PATCH 339/352] NVIDIA: [Config] nvidia-6.8: Update annotations for Grace I/O virtualization This adds the following config options to annotations: CONFIG_FAULT_INJECTION=y CONFIG_IOMMUFD=y CONFIG_IOMMUFD_TEST=y CONFIG_IOMMUFD_VFIO_CONTAINER=y CONFIG_VFIO_CONTAINER=n CONFIG_TEGRA241_CMDQV=y CONFIG_IOMMU_IOPF=y CONFIG_NVGRACE_GPU_VFIO_PCI=m CONFIG_NVGRACE_EGM=m CONFIG_FAILSLAB=n CONFIG_FAIL_FUTEX=n CONFIG_FAIL_IO_TIMEOUT=n CONFIG_FAIL_MAKE_REQUEST=n CONFIG_FAIL_PAGE_ALLOC=n CONFIG_FAULT_INJECTION_CONFIGFS=n CONFIG_FAULT_INJECTION_DEBUG_FS=n CONFIG_FAULT_INJECTION_USERCOPY=n CONFIG_SCSI_UFS_FAULT_INJECTION=n Signed-off-by: Matthew R. Ochs Acked-by: Kai-Heng Feng Acked-by: Koba Ko Signed-off-by: Matthew R. 
Ochs --- debian.nvidia-adv/config/annotations | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/debian.nvidia-adv/config/annotations b/debian.nvidia-adv/config/annotations index 729639479558d..8637342f01cc5 100644 --- a/debian.nvidia-adv/config/annotations +++ b/debian.nvidia-adv/config/annotations @@ -168,6 +168,26 @@ CONFIG_UBUNTU_ODM_DRIVERS note<'{Disable all Ubuntu ODM dr CONFIG_ULTRASOC_SMB policy<{'arm64': 'n'}> CONFIG_ULTRASOC_SMB note<'{Required for Grace enablement}'> +# ---- Annotations to support vSMMU/vCMDQ/vEGM/GPU Passthrough ---- +CONFIG_FAULT_INJECTION policy<{'arm64': 'y'}> +CONFIG_IOMMUFD policy<{'arm64': 'y'}> +CONFIG_IOMMUFD_TEST policy<{'arm64': 'y'}> +CONFIG_IOMMUFD_VFIO_CONTAINER policy<{'arm64': 'y'}> +CONFIG_TEGRA241_CMDQV policy<{'arm64': 'y'}> +CONFIG_VFIO_CONTAINER policy<{'arm64': 'n'}> +CONFIG_VFIO_IOMMU_TYPE1 policy<{'arm64': '-'}> +CONFIG_NVGRACE_GPU_VFIO_PCI policy<{'arm64': 'm'}> +CONFIG_NVGRACE_EGM policy<{'arm64': 'm'}> +CONFIG_IOMMU_IOPF policy<{'amd64': 'y', 'arm64': 'y'}> +CONFIG_SCSI_UFS_FAULT_INJECTION policy<{'arm64': 'n'}> +CONFIG_FAILSLAB policy<{'arm64': 'n'}> +CONFIG_FAIL_PAGE_ALLOC policy<{'arm64': 'n'}> +CONFIG_FAULT_INJECTION_USERCOPY policy<{'arm64': 'n'}> +CONFIG_FAIL_MAKE_REQUEST policy<{'arm64': 'n'}> +CONFIG_FAIL_IO_TIMEOUT policy<{'arm64': 'n'}> +CONFIG_FAIL_FUTEX policy<{'arm64': 'n'}> +CONFIG_FAULT_INJECTION_DEBUG_FS policy<{'arm64': 'n'}> +CONFIG_FAULT_INJECTION_CONFIGFS policy<{'arm64': 'n'}> # ---- Annotations without notes ---- From 515f6bcc284656ce7e3e5af3eac0712cd1d24276 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:10 +0300 Subject: [PATCH 340/352] net/mlx5: Add IFC related stuff for data direct Add IFC related stuff for data direct. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/82da7f578a567909bb5858a64ba844fe4cc298fa.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky (cherry-picked from commit c772a2c690182410642ead740f7a84b3a7544b2b linux) Signed-off-by: Tushar Dave Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Matthew R. 
Ochs --- include/linux/mlx5/mlx5_ifc.h | 51 +++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 5 deletions(-) diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index d2c27a7227bb4..8da91aae2e8af 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -313,6 +313,7 @@ enum { MLX5_CMD_OP_MODIFY_VHCA_STATE = 0xb0e, MLX5_CMD_OP_SYNC_CRYPTO = 0xb12, MLX5_CMD_OP_ALLOW_OTHER_VHCA_ACCESS = 0xb16, + MLX5_CMD_OPCODE_QUERY_VUID = 0xb22, MLX5_CMD_OP_MAX }; @@ -1864,7 +1865,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_at_5a0[0x10]; u8 enhanced_cqe_compression[0x1]; - u8 reserved_at_5b1[0x2]; + u8 reserved_at_5b1[0x1]; + u8 crossing_vhca_mkey[0x1]; u8 log_max_dek[0x5]; u8 reserved_at_5b8[0x4]; u8 mini_cqe_resp_stride_index[0x1]; @@ -1933,7 +1935,9 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 dynamic_msix_table_size[0xc]; u8 reserved_at_740[0xc]; u8 min_dynamic_vf_msix_table_size[0x4]; - u8 reserved_at_750[0x4]; + u8 reserved_at_750[0x2]; + u8 data_direct[0x1]; + u8 reserved_at_753[0x1]; u8 max_dynamic_vf_msix_table_size[0xc]; u8 reserved_at_760[0x3]; @@ -1961,7 +1965,9 @@ struct mlx5_ifc_cmd_hca_cap_2_bits { u8 reserved_at_0[0x80]; u8 migratable[0x1]; - u8 reserved_at_81[0x1f]; + u8 reserved_at_81[0x11]; + u8 query_vuid[0x1]; + u8 reserved_at_93[0xd]; u8 max_reformat_insert_size[0x8]; u8 max_reformat_insert_offset[0x8]; @@ -4075,6 +4081,7 @@ enum { MLX5_MKC_ACCESS_MODE_KSM = 0x3, MLX5_MKC_ACCESS_MODE_SW_ICM = 0x4, MLX5_MKC_ACCESS_MODE_MEMIC = 0x5, + MLX5_MKC_ACCESS_MODE_CROSSING = 0x6, }; struct mlx5_ifc_mkc_bits { @@ -4117,7 +4124,10 @@ struct mlx5_ifc_mkc_bits { u8 bsf_octword_size[0x20]; - u8 reserved_at_120[0x80]; + u8 reserved_at_120[0x60]; + + u8 crossing_target_vhca_id[0x10]; + u8 reserved_at_190[0x10]; u8 translations_octword_size[0x20]; @@ -5043,6 +5053,36 @@ struct mlx5_ifc_query_vport_state_out_bits { u8 state[0x4]; }; +struct mlx5_ifc_array1024_auto_bits { + u8 array1024_auto[32][0x20]; +}; + +struct mlx5_ifc_query_vuid_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x40]; + + u8 query_vfs_vuid[0x1]; + u8 data_direct[0x1]; + u8 reserved_at_62[0xe]; + u8 vhca_id[0x10]; +}; + +struct mlx5_ifc_query_vuid_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x1a0]; + + u8 reserved_at_1e0[0x10]; + u8 num_of_entries[0x10]; + + struct mlx5_ifc_array1024_auto_bits vuid[]; +}; + enum { MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT = 0x0, MLX5_VPORT_STATE_OP_MOD_ESW_VPORT = 0x1, @@ -8904,7 +8944,8 @@ struct mlx5_ifc_create_mkey_in_bits { u8 pg_access[0x1]; u8 mkey_umem_valid[0x1]; - u8 reserved_at_62[0x1e]; + u8 data_direct[0x1]; + u8 reserved_at_63[0x1d]; struct mlx5_ifc_mkc_bits memory_key_mkey_entry; From 648e3a78b5c1b86ba5c7b2a433a971d541ee66fa Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:11 +0300 Subject: [PATCH 341/352] RDMA/mlx5: Introduce the 'data direct' driver Introduce the 'data direct' driver for a ConnectX-8 Data Direct device. The 'data direct' driver functions as the affiliated DMA device for one or more capable mlx5_ib devices. This DMA device, as the name suggests, is used exclusively for DMA operations. It can be considered a DMA engine managed by a PF/VF, lacking network capabilities and having minimal overall capabilities. Consequently, the DMA NIC PF will not be exposed to or directly used by software applications. 
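For orientation before the driver details: the affiliation between an
mlx5_ib device and its data direct device is keyed by a VUID string,
which the mlx5_ib side obtains from firmware via the QUERY_VUID command
added in the previous patch. A sketch of that query, mirroring
mlx5_cmd_query_vuid() from a later patch in this series ('mdev' stands
in for the mlx5_core_dev; error handling trimmed):

	u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) +
	       MLX5_ST_SZ_BYTES(array1024_auto)] = {};
	u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {};
	int err;

	/* QUERY_VUID with data_direct set returns the affiliated VUID */
	MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID);
	MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(mdev, vhca_id));
	MLX5_SET(query_vuid_in, in, data_direct, 1);
	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
	/* on success the VUID is at MLX5_ADDR_OF(query_vuid_out, out, vuid) */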
The driver will not have any direct interface or interaction with the firmware (no command interface, no capabilities, etc.). It will operate solely over PCI to enable its DMA functionality. Registration and un-registration of the driver are handled as part of the mlx5_ib initialization and exit processes, as the mlx5_ib devices will effectively be its clients. The driver will serve as the DMA device for accessing another PCI device to achieve optimal performance (both on the same NUMA node, P2P access, etc.). Upon probing, it will read its VUID over PCI to handle mlx5_ib device registrations with the same VUID. Upon removal, it will notify its clients to allow them to clean up the resources that were mmaped with its DMA device. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/b77edecfd476c3f445da96ab6aef499ae47b2829.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky (cherry-picked from commit 6910e3660d86c1a5654f742a40181d2c9154f26f linux) Signed-off-by: Tushar Dave Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Matthew R. Ochs --- drivers/infiniband/hw/mlx5/Makefile | 1 + drivers/infiniband/hw/mlx5/data_direct.c | 227 +++++++++++++++++++++++ drivers/infiniband/hw/mlx5/data_direct.h | 23 +++ drivers/infiniband/hw/mlx5/main.c | 24 +++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 + 5 files changed, 281 insertions(+) create mode 100644 drivers/infiniband/hw/mlx5/data_direct.c create mode 100644 drivers/infiniband/hw/mlx5/data_direct.h diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile index 72a526236c2e0..b38961f5058ef 100644 --- a/drivers/infiniband/hw/mlx5/Makefile +++ b/drivers/infiniband/hw/mlx5/Makefile @@ -6,6 +6,7 @@ mlx5_ib-y := ah.o \ cong.o \ counters.o \ cq.o \ + data_direct.o \ dm.o \ doorbell.o \ gsi.o \ diff --git a/drivers/infiniband/hw/mlx5/data_direct.c b/drivers/infiniband/hw/mlx5/data_direct.c new file mode 100644 index 0000000000000..b9ba84afaae22 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include "mlx5_ib.h" +#include "data_direct.h" + +static LIST_HEAD(mlx5_data_direct_dev_list); +static LIST_HEAD(mlx5_data_direct_reg_list); + +/* + * This mutex should be held when accessing either of the above lists + */ +static DEFINE_MUTEX(mlx5_data_direct_mutex); + +struct mlx5_data_direct_registration { + struct mlx5_ib_dev *ibdev; + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1]; + struct list_head list; +}; + +static const struct pci_device_id mlx5_data_direct_pci_table[] = { + { PCI_VDEVICE(MELLANOX, 0x2100) }, /* ConnectX-8 Data Direct */ + { 0, } +}; + +static int mlx5_data_direct_vpd_get_vuid(struct mlx5_data_direct_dev *dev) +{ + struct pci_dev *pdev = dev->pdev; + unsigned int vpd_size, kw_len; + u8 *vpd_data; + int start; + int ret; + + vpd_data = pci_vpd_alloc(pdev, &vpd_size); + if (IS_ERR(vpd_data)) { + pci_err(pdev, "Unable to read VPD, err=%ld\n", PTR_ERR(vpd_data)); + return PTR_ERR(vpd_data); + } + + start = pci_vpd_find_ro_info_keyword(vpd_data, vpd_size, "VU", &kw_len); + if (start < 0) { + ret = start; + pci_err(pdev, "VU keyword not found, err=%d\n", ret); + goto end; + } + + dev->vuid = kmemdup_nul(vpd_data + start, kw_len, GFP_KERNEL); + ret = dev->vuid ? 
0 : -ENOMEM;
+
+end:
+	kfree(vpd_data);
+	return ret;
+}
+
+static void mlx5_data_direct_shutdown(struct pci_dev *pdev)
+{
+	pci_disable_device(pdev);
+}
+
+static int mlx5_data_direct_set_dma_caps(struct pci_dev *pdev)
+{
+	int err;
+
+	err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+	if (err) {
+		dev_warn(&pdev->dev,
+			 "Warning: couldn't set 64-bit PCI DMA mask, err=%d\n", err);
+		err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
+		if (err) {
+			dev_err(&pdev->dev, "Can't set PCI DMA mask, err=%d\n", err);
+			return err;
+		}
+	}
+
+	dma_set_max_seg_size(&pdev->dev, SZ_2G);
+	return 0;
+}
+
+int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid)
+{
+	struct mlx5_data_direct_registration *reg;
+	struct mlx5_data_direct_dev *dev;
+
+	reg = kzalloc(sizeof(*reg), GFP_KERNEL);
+	if (!reg)
+		return -ENOMEM;
+
+	reg->ibdev = ibdev;
+	strcpy(reg->vuid, vuid);
+
+	mutex_lock(&mlx5_data_direct_mutex);
+	list_for_each_entry(dev, &mlx5_data_direct_dev_list, list) {
+		if (strcmp(dev->vuid, vuid) == 0) {
+			mlx5_ib_data_direct_bind(ibdev, dev);
+			break;
+		}
+	}
+
+	/* Add the registration to its global list, to be used upon bind/unbind
+	 * of its affiliated data direct device
+	 */
+	list_add_tail(&reg->list, &mlx5_data_direct_reg_list);
+	mutex_unlock(&mlx5_data_direct_mutex);
+	return 0;
+}
+
+void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev)
+{
+	struct mlx5_data_direct_registration *reg;
+
+	mutex_lock(&mlx5_data_direct_mutex);
+	list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) {
+		if (reg->ibdev == ibdev) {
+			list_del(&reg->list);
+			kfree(reg);
+			goto end;
+		}
+	}
+
+	WARN_ON(true);
+end:
+	mutex_unlock(&mlx5_data_direct_mutex);
+}
+
+static void mlx5_data_direct_dev_reg(struct mlx5_data_direct_dev *dev)
+{
+	struct mlx5_data_direct_registration *reg;
+
+	mutex_lock(&mlx5_data_direct_mutex);
+	list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) {
+		if (strcmp(dev->vuid, reg->vuid) == 0)
+			mlx5_ib_data_direct_bind(reg->ibdev, dev);
+	}
+
+	/* Add the data direct device to the global list, further IB devices may
+	 * use it later as well
+	 */
+	list_add_tail(&dev->list, &mlx5_data_direct_dev_list);
+	mutex_unlock(&mlx5_data_direct_mutex);
+}
+
+static void mlx5_data_direct_dev_unreg(struct mlx5_data_direct_dev *dev)
+{
+	struct mlx5_data_direct_registration *reg;
+
+	mutex_lock(&mlx5_data_direct_mutex);
+	/* Prevent any further affiliations */
+	list_del(&dev->list);
+	list_for_each_entry(reg, &mlx5_data_direct_reg_list, list) {
+		if (strcmp(dev->vuid, reg->vuid) == 0)
+			mlx5_ib_data_direct_unbind(reg->ibdev);
+	}
+	mutex_unlock(&mlx5_data_direct_mutex);
+}
+
+static int mlx5_data_direct_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct mlx5_data_direct_dev *dev;
+	int err;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return -ENOMEM;
+
+	dev->device = &pdev->dev;
+	dev->pdev = pdev;
+
+	pci_set_drvdata(dev->pdev, dev);
+	err = pci_enable_device(pdev);
+	if (err) {
+		dev_err(dev->device, "Cannot enable PCI device, err=%d\n", err);
+		goto err;
+	}
+
+	pci_set_master(pdev);
+	err = mlx5_data_direct_set_dma_caps(pdev);
+	if (err)
+		goto err_disable;
+
+	if (pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP32) &&
+	    pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP64) &&
+	    pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128))
+		dev_dbg(dev->device, "Enabling pci atomics failed\n");
+
+	err = mlx5_data_direct_vpd_get_vuid(dev);
+	if (err)
+		goto err_disable;
+
+	
mlx5_data_direct_dev_reg(dev); + return 0; + +err_disable: + pci_disable_device(pdev); +err: + kfree(dev); + return err; +} + +static void mlx5_data_direct_remove(struct pci_dev *pdev) +{ + struct mlx5_data_direct_dev *dev = pci_get_drvdata(pdev); + + mlx5_data_direct_dev_unreg(dev); + pci_disable_device(pdev); + kfree(dev->vuid); + kfree(dev); +} + +static struct pci_driver mlx5_data_direct_driver = { + .name = KBUILD_MODNAME, + .id_table = mlx5_data_direct_pci_table, + .probe = mlx5_data_direct_probe, + .remove = mlx5_data_direct_remove, + .shutdown = mlx5_data_direct_shutdown, +}; + +int mlx5_data_direct_driver_register(void) +{ + return pci_register_driver(&mlx5_data_direct_driver); +} + +void mlx5_data_direct_driver_unregister(void) +{ + pci_unregister_driver(&mlx5_data_direct_driver); +} diff --git a/drivers/infiniband/hw/mlx5/data_direct.h b/drivers/infiniband/hw/mlx5/data_direct.h new file mode 100644 index 0000000000000..2fd2bdbe8f692 --- /dev/null +++ b/drivers/infiniband/hw/mlx5/data_direct.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#ifndef _MLX5_IB_DATA_DIRECT_H +#define _MLX5_IB_DATA_DIRECT_H + +struct mlx5_ib_dev; + +struct mlx5_data_direct_dev { + struct device *device; + struct pci_dev *pdev; + char *vuid; + struct list_head list; +}; + +int mlx5_data_direct_ib_reg(struct mlx5_ib_dev *ibdev, char *vuid); +void mlx5_data_direct_ib_unreg(struct mlx5_ib_dev *ibdev); +int mlx5_data_direct_driver_register(void); +void mlx5_data_direct_driver_unregister(void); + +#endif diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 9fb8a544236d7..0f8466cd9a53d 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -48,6 +48,7 @@ #include #include #include "macsec.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include @@ -3751,6 +3752,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) dev->ib_dev.num_comp_vectors = mlx5_comp_vectors_max(mdev); mutex_init(&dev->cap_mask_mutex); + mutex_init(&dev->data_direct_lock); INIT_LIST_HEAD(&dev->qp_list); spin_lock_init(&dev->reset_flow_resource_lock); xa_init(&dev->odp_mkeys); @@ -4180,6 +4182,21 @@ static void mlx5_ib_stage_dev_notifier_cleanup(struct mlx5_ib_dev *dev) mlx5_notifier_unregister(dev->mdev, &dev->mdev_events); } +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev) +{ + mutex_lock(&ibdev->data_direct_lock); + ibdev->data_direct_dev = dev; + mutex_unlock(&ibdev->data_direct_lock); +} + +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev) +{ + mutex_lock(&ibdev->data_direct_lock); + ibdev->data_direct_dev = NULL; + mutex_unlock(&ibdev->data_direct_lock); +} + void __mlx5_ib_remove(struct mlx5_ib_dev *dev, const struct mlx5_ib_profile *profile, int stage) @@ -4509,17 +4526,23 @@ static int __init mlx5_ib_init(void) ret = mlx5r_rep_init(); if (ret) goto rep_err; + ret = mlx5_data_direct_driver_register(); + if (ret) + goto dd_err; ret = auxiliary_driver_register(&mlx5r_mp_driver); if (ret) goto mp_err; ret = auxiliary_driver_register(&mlx5r_driver); if (ret) goto drv_err; + return 0; drv_err: auxiliary_driver_unregister(&mlx5r_mp_driver); mp_err: + mlx5_data_direct_driver_unregister(); +dd_err: mlx5r_rep_cleanup(); rep_err: mlx5_ib_qp_event_cleanup(); @@ -4531,6 +4554,7 @@ static int __init mlx5_ib_init(void) static void __exit mlx5_ib_cleanup(void) { + 
mlx5_data_direct_driver_unregister(); auxiliary_driver_unregister(&mlx5r_driver); auxiliary_driver_unregister(&mlx5r_mp_driver); mlx5r_rep_cleanup(); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 79ebafecca22a..ddcced1c19a2a 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1115,6 +1115,9 @@ struct mlx5_macsec { struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_data_direct_dev *data_direct_dev; + /* protect accessing data_direct_dev */ + struct mutex data_direct_lock; struct notifier_block mdev_events; int num_ports; /* serialize update of capability mask @@ -1406,6 +1409,9 @@ int mlx5_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *wq_ind_table); struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, struct ib_dm_mr_attr *attr, struct uverbs_attr_bundle *attrs); +void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, + struct mlx5_data_direct_dev *dev); +void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); From 99ac7981e423cd5ccbc6a5f3cc735d58205cfe60 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:12 +0300 Subject: [PATCH 342/352] RDMA/mlx5: Add the initialization flow to utilize the 'data direct' device Add the NET device initialization flow to utilize the 'data direct' device. When a NET mlx5_ib device is capable of 'data direct', the following sequence of actions will occur: - Find its affiliated 'data direct' VUID via a firmware command. - Create its own private PD and 'data direct' mkey. - Register to be notified when its 'data direct' driver is probed or removed. The DMA device of the affiliated 'data direct' device, including the private PD and the 'data direct' mkey, will be used later during MR registrations that request the data direct functionality. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/b11fa87b2a65bce4db8d40341bb6cee490fa4d06.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky (backported from commit 2e8e631d7a41e3a4edc94f3c9dd5cb32c2aa539e linux) [tdave: Resolve conflict in main.c and mlx5_ib.h] Signed-off-by: Tushar Dave Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Matthew R. 
Ochs --- drivers/infiniband/hw/mlx5/cmd.c | 21 +++++++ drivers/infiniband/hw/mlx5/cmd.h | 2 + drivers/infiniband/hw/mlx5/main.c | 90 ++++++++++++++++++++++++++++ drivers/infiniband/hw/mlx5/mlx5_ib.h | 6 ++ 4 files changed, 119 insertions(+) diff --git a/drivers/infiniband/hw/mlx5/cmd.c b/drivers/infiniband/hw/mlx5/cmd.c index 1d0c8d5e745bf..83046506e4d2a 100644 --- a/drivers/infiniband/hw/mlx5/cmd.c +++ b/drivers/infiniband/hw/mlx5/cmd.c @@ -239,3 +239,24 @@ int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid) MLX5_SET(dealloc_uar_in, in, uid, uid); return mlx5_cmd_exec_in(dev, dealloc_uar, in); } + +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid) +{ + u8 out[MLX5_ST_SZ_BYTES(query_vuid_out) + + MLX5_ST_SZ_BYTES(array1024_auto)] = {}; + u8 in[MLX5_ST_SZ_BYTES(query_vuid_in)] = {}; + char *vuid; + int err; + + MLX5_SET(query_vuid_in, in, opcode, MLX5_CMD_OPCODE_QUERY_VUID); + MLX5_SET(query_vuid_in, in, vhca_id, MLX5_CAP_GEN(dev, vhca_id)); + MLX5_SET(query_vuid_in, in, data_direct, data_direct); + err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out)); + if (err) + return err; + + vuid = MLX5_ADDR_OF(query_vuid_out, out, vuid); + memcpy(out_vuid, vuid, MLX5_ST_SZ_BYTES(array1024_auto)); + return 0; +} diff --git a/drivers/infiniband/hw/mlx5/cmd.h b/drivers/infiniband/hw/mlx5/cmd.h index 93a971a40d119..384e64cebc95e 100644 --- a/drivers/infiniband/hw/mlx5/cmd.h +++ b/drivers/infiniband/hw/mlx5/cmd.h @@ -58,4 +58,6 @@ int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb, u16 opmod, u8 port); int mlx5_cmd_uar_alloc(struct mlx5_core_dev *dev, u32 *uarn, u16 uid); int mlx5_cmd_uar_dealloc(struct mlx5_core_dev *dev, u32 uarn, u16 uid); +int mlx5_cmd_query_vuid(struct mlx5_core_dev *dev, bool data_direct, + char *out_vuid); #endif /* MLX5_IB_CMD_H */ diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 0f8466cd9a53d..24ac456ad72fd 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -2921,6 +2921,59 @@ static void mlx5_ib_dev_res_cleanup(struct mlx5_ib_dev *dev) ib_dealloc_pd(devr->p0); } +static int +mlx5_ib_create_data_direct_resources(struct mlx5_ib_dev *dev) +{ + int inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + struct mlx5_core_dev *mdev = dev->mdev; + void *mkc; + u32 mkey; + u32 pdn; + u32 *in; + int err; + + err = mlx5_core_alloc_pd(mdev, &pdn); + if (err) + return err; + + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err; + } + + MLX5_SET(create_mkey_in, in, data_direct, 1); + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_PA); + MLX5_SET(mkc, mkc, lw, 1); + MLX5_SET(mkc, mkc, lr, 1); + MLX5_SET(mkc, mkc, rw, 1); + MLX5_SET(mkc, mkc, rr, 1); + MLX5_SET(mkc, mkc, a, 1); + MLX5_SET(mkc, mkc, pd, pdn); + MLX5_SET(mkc, mkc, length64, 1); + MLX5_SET(mkc, mkc, qpn, 0xffffff); + err = mlx5_core_create_mkey(mdev, &mkey, in, inlen); + kvfree(in); + if (err) + goto err; + + dev->ddr.mkey = mkey; + dev->ddr.pdn = pdn; + return 0; + +err: + mlx5_core_dealloc_pd(mdev, pdn); + return err; +} + +static void +mlx5_ib_free_data_direct_resources(struct mlx5_ib_dev *dev) +{ + mlx5_core_destroy_mkey(dev->mdev, dev->ddr.mkey); + mlx5_core_dealloc_pd(dev->mdev, dev->ddr.pdn); +} + static u32 get_core_cap_flags(struct ib_device *ibdev, struct mlx5_hca_vport_context *rep) { @@ -3307,6 +3360,38 @@ static bool mlx5_ib_bind_slave_port(struct mlx5_ib_dev *ibdev, 
return false; } +static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) +{ + char vuid[MLX5_ST_SZ_BYTES(array1024_auto) + 1] = {}; + int ret; + + if (!MLX5_CAP_GEN(dev->mdev, data_direct)) + return 0; + + ret = mlx5_cmd_query_vuid(dev->mdev, true, vuid); + if (ret) + return ret; + + ret = mlx5_ib_create_data_direct_resources(dev); + if (ret) + return ret; + + ret = mlx5_data_direct_ib_reg(dev, vuid); + if (ret) + mlx5_ib_free_data_direct_resources(dev); + + return ret; +} + +static void mlx5_ib_data_direct_cleanup(struct mlx5_ib_dev *dev) +{ + if (!MLX5_CAP_GEN(dev->mdev, data_direct)) + return; + + mlx5_data_direct_ib_unreg(dev); + mlx5_ib_free_data_direct_resources(dev); +} + static int mlx5_ib_init_multiport_master(struct mlx5_ib_dev *dev) { u32 port_num = mlx5_core_native_port_num(dev->mdev) - 1; @@ -3699,6 +3784,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev) { + mlx5_ib_data_direct_cleanup(dev); mlx5_ib_cleanup_multiport_master(dev); WARN_ON(!xa_empty(&dev->odp_mkeys)); mutex_destroy(&dev->cap_mask_mutex); @@ -3761,6 +3847,10 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) spin_lock_init(&dev->dm.lock); dev->dm.dev = mdev; + err = mlx5_ib_data_direct_init(dev); + if (err) + goto err_mp; + return 0; err_mp: mlx5_ib_cleanup_multiport_master(dev); diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index ddcced1c19a2a..26e307e72dbca 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -821,6 +821,11 @@ struct mlx5_ib_port_resources { struct work_struct pkey_change_work; }; +struct mlx5_data_direct_resources { + u32 pdn; + u32 mkey; +}; + struct mlx5_ib_resources { struct ib_cq *c0; u32 xrcdn0; @@ -1174,6 +1179,7 @@ struct mlx5_ib_dev { u16 pkey_table_len; u8 lag_ports; struct mlx5_special_mkeys mkeys; + struct mlx5_data_direct_resources ddr; #ifdef CONFIG_MLX5_MACSEC struct mlx5_macsec macsec; From 4524efd6125764def90b337473d7aad6a773ae71 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:13 +0300 Subject: [PATCH 343/352] RDMA/umem: Add support for creating pinned DMABUF umem with a given dma device Add support for creating pinned DMABUF umem with a specified DMA device instead of the DMA device of the given IB device. This API will be utilized in the upcoming patches of the series when multiple path DMAs are implemented. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/038aad36a43797e5591b20ba81051fc5758124f9.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky (cherry-picked from commit 682358fd35dece838e6ae2d9d6a69fc0b9a9d411 linux) Signed-off-by: Tushar Dave Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Matthew R. 
Ochs --- drivers/infiniband/core/umem_dmabuf.c | 45 ++++++++++++++++++++------- include/rdma/ib_umem.h | 15 +++++++++ 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 39357dc2d229f..726a097865470 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -110,10 +110,12 @@ void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) } EXPORT_SYMBOL(ib_umem_dmabuf_unmap_pages); -struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, - unsigned long offset, size_t size, - int fd, int access, - const struct dma_buf_attach_ops *ops) +static struct ib_umem_dmabuf * +ib_umem_dmabuf_get_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) { struct dma_buf *dmabuf; struct ib_umem_dmabuf *umem_dmabuf; @@ -152,7 +154,7 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, umem_dmabuf->attach = dma_buf_dynamic_attach( dmabuf, - device->dma_device, + dma_device, ops, umem_dmabuf); if (IS_ERR(umem_dmabuf->attach)) { @@ -168,6 +170,15 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, dma_buf_put(dmabuf); return ret; } + +struct ib_umem_dmabuf *ib_umem_dmabuf_get(struct ib_device *device, + unsigned long offset, size_t size, + int fd, int access, + const struct dma_buf_attach_ops *ops) +{ + return ib_umem_dmabuf_get_with_dma_device(device, device->dma_device, + offset, size, fd, access, ops); +} EXPORT_SYMBOL(ib_umem_dmabuf_get); static void @@ -184,16 +195,18 @@ static struct dma_buf_attach_ops ib_umem_dmabuf_attach_pinned_ops = { .move_notify = ib_umem_dmabuf_unsupported_move_notify, }; -struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, - unsigned long offset, - size_t size, int fd, - int access) +struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access) { struct ib_umem_dmabuf *umem_dmabuf; int err; - umem_dmabuf = ib_umem_dmabuf_get(device, offset, size, fd, access, - &ib_umem_dmabuf_attach_pinned_ops); + umem_dmabuf = ib_umem_dmabuf_get_with_dma_device(device, dma_device, offset, + size, fd, access, + &ib_umem_dmabuf_attach_pinned_ops); if (IS_ERR(umem_dmabuf)) return umem_dmabuf; @@ -217,6 +230,16 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, ib_umem_release(&umem_dmabuf->umem); return ERR_PTR(err); } +EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned_with_dma_device); + +struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, + unsigned long offset, + size_t size, int fd, + int access) +{ + return ib_umem_dmabuf_get_pinned_with_dma_device(device, device->dma_device, + offset, size, fd, access); +} EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 565a850445414..de05268ed6320 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -150,6 +150,11 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, unsigned long offset, size_t size, int fd, int access); +struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access); int 
ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf); @@ -196,6 +201,16 @@ ib_umem_dmabuf_get_pinned(struct ib_device *device, unsigned long offset, { return ERR_PTR(-EOPNOTSUPP); } + +static inline struct ib_umem_dmabuf * +ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, + struct device *dma_device, + unsigned long offset, size_t size, + int fd, int access) +{ + return ERR_PTR(-EOPNOTSUPP); +} + static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) { return -EOPNOTSUPP; From 53b8e2ca7c03cdb3d246ae420f8da70edc36a2e0 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:14 +0300 Subject: [PATCH 344/352] RDMA/umem: Introduce an option to revoke DMABUF umem Introduce an option to revoke DMABUF umem. This option will retain the umem allocation while revoking its DMA mapping. Furthermore, any subsequent attempts to map the pages should fail once the umem has been revoked. This functionality will be utilized in the upcoming patches in the series, where we aim to delay umem deallocation until the mkey deregistration. However, we must unmap its pages immediately. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/a38270f2fe4a194868ca2312f4c1c760e51bcbff.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky (cherry-picked from commit 253c61dc256b3e6be65657f78b4a8452163ce00f linux) Signed-off-by: Tushar Dave Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Matthew R. Ochs --- drivers/infiniband/core/umem_dmabuf.c | 21 +++++++++++++++++++-- include/rdma/ib_umem.h | 3 +++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/core/umem_dmabuf.c b/drivers/infiniband/core/umem_dmabuf.c index 726a097865470..9fcd37761264a 100644 --- a/drivers/infiniband/core/umem_dmabuf.c +++ b/drivers/infiniband/core/umem_dmabuf.c @@ -23,6 +23,9 @@ int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) dma_resv_assert_held(umem_dmabuf->attach->dmabuf->resv); + if (umem_dmabuf->revoked) + return -EINVAL; + if (umem_dmabuf->sgt) goto wait_fence; @@ -242,15 +245,29 @@ struct ib_umem_dmabuf *ib_umem_dmabuf_get_pinned(struct ib_device *device, } EXPORT_SYMBOL(ib_umem_dmabuf_get_pinned); -void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) { struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; dma_resv_lock(dmabuf->resv, NULL); + if (umem_dmabuf->revoked) + goto end; ib_umem_dmabuf_unmap_pages(umem_dmabuf); - if (umem_dmabuf->pinned) + if (umem_dmabuf->pinned) { dma_buf_unpin(umem_dmabuf->attach); + umem_dmabuf->pinned = 0; + } + umem_dmabuf->revoked = 1; +end: dma_resv_unlock(dmabuf->resv); +} +EXPORT_SYMBOL(ib_umem_dmabuf_revoke); + +void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) +{ + struct dma_buf *dmabuf = umem_dmabuf->attach->dmabuf; + + ib_umem_dmabuf_revoke(umem_dmabuf); dma_buf_detach(dmabuf, umem_dmabuf->attach); dma_buf_put(dmabuf); diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index de05268ed6320..7dc7b1cc71b5a 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -38,6 +38,7 @@ struct ib_umem_dmabuf { unsigned long last_sg_trim; void *private; u8 pinned : 1; + u8 revoked : 1; }; static inline struct ib_umem_dmabuf *to_ib_umem_dmabuf(struct ib_umem *umem) @@ -158,6 +159,7 @@ 
ib_umem_dmabuf_get_pinned_with_dma_device(struct ib_device *device, int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf); void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf); +void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf); #else /* CONFIG_INFINIBAND_USER_MEM */ @@ -217,6 +219,7 @@ static inline int ib_umem_dmabuf_map_pages(struct ib_umem_dmabuf *umem_dmabuf) } static inline void ib_umem_dmabuf_unmap_pages(struct ib_umem_dmabuf *umem_dmabuf) { } static inline void ib_umem_dmabuf_release(struct ib_umem_dmabuf *umem_dmabuf) { } +static inline void ib_umem_dmabuf_revoke(struct ib_umem_dmabuf *umem_dmabuf) {} #endif /* CONFIG_INFINIBAND_USER_MEM */ #endif /* IB_UMEM_H */ From 893bf22571a8ddab51e6bf375d0e032d77a8ab1d Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:15 +0300 Subject: [PATCH 345/352] RDMA: Pass uverbs_attr_bundle as part of '.reg_user_mr_dmabuf' API Pass uverbs_attr_bundle as part of '.reg_user_mr_dmabuf' API instead of udata. This enables passing some new ioctl attributes to the drivers, as will be introduced in the next patches for mlx5 driver. Change the involved drivers accordingly. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/9a25b2fc02443f7c36c2d93499ae25252b6afd40.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky (cherry-picked from commit 3aa73c6b795b9aaaf933f3c95495d85fc0de39e3 linux) Signed-off-by: Tushar Dave Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Matthew R. Ochs --- drivers/infiniband/core/uverbs_std_types_mr.c | 2 +- drivers/infiniband/hw/bnxt_re/ib_verbs.c | 3 ++- drivers/infiniband/hw/bnxt_re/ib_verbs.h | 2 +- drivers/infiniband/hw/efa/efa.h | 2 +- drivers/infiniband/hw/efa/efa_verbs.c | 4 ++-- drivers/infiniband/hw/irdma/verbs.c | 2 +- drivers/infiniband/hw/mlx5/mlx5_ib.h | 2 +- drivers/infiniband/hw/mlx5/mr.c | 2 +- include/rdma/ib_verbs.h | 2 +- 9 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/infiniband/core/uverbs_std_types_mr.c b/drivers/infiniband/core/uverbs_std_types_mr.c index 03e1db5d1e8c3..7ebc7bd3caaea 100644 --- a/drivers/infiniband/core/uverbs_std_types_mr.c +++ b/drivers/infiniband/core/uverbs_std_types_mr.c @@ -239,7 +239,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_REG_DMABUF_MR)( mr = pd->device->ops.reg_user_mr_dmabuf(pd, offset, length, iova, fd, access_flags, - &attrs->driver_udata); + attrs); if (IS_ERR(mr)) return PTR_ERR(mr); diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.c b/drivers/infiniband/hw/bnxt_re/ib_verbs.c index ce9c5bae83bf1..b3cf9c8867fba 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.c +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.c @@ -4121,7 +4121,8 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length, struct ib_mr *bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, - int mr_access_flags, struct ib_udata *udata) + int mr_access_flags, + struct uverbs_attr_bundle *attrs) { struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd); struct bnxt_re_dev *rdev = pd->rdev; diff --git a/drivers/infiniband/hw/bnxt_re/ib_verbs.h b/drivers/infiniband/hw/bnxt_re/ib_verbs.h index b267d6d5975f7..879c82321fd5d 100644 --- a/drivers/infiniband/hw/bnxt_re/ib_verbs.h +++ b/drivers/infiniband/hw/bnxt_re/ib_verbs.h @@ -242,7 +242,7 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr 
*bnxt_re_reg_user_mr_dmabuf(struct ib_pd *ib_pd, u64 start, u64 length, u64 virt_addr, int fd, int mr_access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata); void bnxt_re_dealloc_ucontext(struct ib_ucontext *context); int bnxt_re_mmap(struct ib_ucontext *context, struct vm_area_struct *vma); diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h index e2bdec32ae805..733ef1b0219e5 100644 --- a/drivers/infiniband/hw/efa/efa.h +++ b/drivers/infiniband/hw/efa/efa.h @@ -167,7 +167,7 @@ struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length, struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num, struct ib_port_immutable *immutable); diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c index 2f412db2edcd3..d875e4a8ec359 100644 --- a/drivers/infiniband/hw/efa/efa_verbs.c +++ b/drivers/infiniband/hw/efa/efa_verbs.c @@ -1670,14 +1670,14 @@ static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start, struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct efa_dev *dev = to_edev(ibpd->device); struct ib_umem_dmabuf *umem_dmabuf; struct efa_mr *mr; int err; - mr = efa_alloc_mr(ibpd, access_flags, udata); + mr = efa_alloc_mr(ibpd, access_flags, &attrs->driver_udata); if (IS_ERR(mr)) { err = PTR_ERR(mr); goto err_out; diff --git a/drivers/infiniband/hw/irdma/verbs.c b/drivers/infiniband/hw/irdma/verbs.c index 12704efb7b19a..0dbdaf12cf706 100644 --- a/drivers/infiniband/hw/irdma/verbs.c +++ b/drivers/infiniband/hw/irdma/verbs.c @@ -3084,7 +3084,7 @@ static struct ib_mr *irdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, static struct ib_mr *irdma_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 len, u64 virt, int fd, int access, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct irdma_device *iwdev = to_iwdev(pd->device); struct ib_umem_dmabuf *umem_dmabuf; diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 26e307e72dbca..b151171b8108e 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -1334,7 +1334,7 @@ struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); int mlx5_ib_advise_mr(struct ib_pd *pd, enum ib_uverbs_advise_mr_advice advice, u32 flags, diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index d3c1f63791a2b..c571db572cc21 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ -1508,7 +1508,7 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int access_flags, - struct ib_udata *udata) + struct uverbs_attr_bundle *attrs) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr = NULL; diff --git 
a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index b7b6b58dd3486..93fc903c97a5f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -2491,7 +2491,7 @@ struct ib_device_ops { struct ib_mr *(*reg_user_mr_dmabuf)(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int mr_access_flags, - struct ib_udata *udata); + struct uverbs_attr_bundle *attrs); struct ib_mr *(*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_pd *pd, From a90a19e51b1cb10eae18f99260abe83f662f7818 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:16 +0300 Subject: [PATCH 346/352] RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct Add support for DMABUF MR registrations with Data-direct device. Upon userspace calling to register a DMABUF MR with the data direct bit set, the below algorithm will be followed. 1) Obtain a pinned DMABUF umem from the IB core using the user input parameters (FD, offset, length) and the DMA PF device. The DMA PF device is needed to allow the IOMMU to enable the DMA PF to access the user buffer over PCI. 2) Create a KSM MKEY by setting its entries according to the user buffer VA to IOVA mapping, with the MKEY being the data direct device-crossed MKEY. This KSM MKEY is umrable and will be used as part of the MR cache. The PD for creating it is the internal device 'data direct' kernel one. 3) Create a crossing MKEY that points to the KSM MKEY using the crossing access mode. 4) Manage the KSM MKEY by adding it to a list of 'data direct' MKEYs managed on the mlx5_ib device. 5) Return the crossing MKEY to the user, created with its supplied PD. Upon DMA PF unbind flow, the driver will revoke the KSM entries. The final deregistration will occur under the hood once the application deregisters its MKEY. Notes: - This version supports only the PINNED UMEM mode, so there is no dependency on ODP. - The IOVA supplied by the application must be system page aligned due to HW translations of KSM. - The crossing MKEY will not be umrable or part of the MR cache, as we cannot change its crossed (i.e. KSM) MKEY over UMR. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/1f99d8020ed540d9702b9e2252a145a439609ba6.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky (backported from commit de8f847a5114ff7cfcdfc114af8485c431dec703 linux) [tdave: resolve conflict in main.c, mr.c and mlx5_user_ioctl_cmds.h , also rework reg_user_mr_dmabuf() to sync with recent upstream rdma changes] Signed-off-by: Tushar Dave Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Matthew R. 
Ochs --- drivers/infiniband/hw/mlx5/main.c | 11 + drivers/infiniband/hw/mlx5/mlx5_ib.h | 8 + drivers/infiniband/hw/mlx5/mr.c | 332 ++++++++++++++++++---- drivers/infiniband/hw/mlx5/odp.c | 5 +- drivers/infiniband/hw/mlx5/umr.c | 93 ++++-- drivers/infiniband/hw/mlx5/umr.h | 1 + include/uapi/rdma/mlx5_user_ioctl_cmds.h | 4 + include/uapi/rdma/mlx5_user_ioctl_verbs.h | 4 + 8 files changed, 374 insertions(+), 84 deletions(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 24ac456ad72fd..09dabb9f74acd 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3376,6 +3376,7 @@ static int mlx5_ib_data_direct_init(struct mlx5_ib_dev *dev) if (ret) return ret; + INIT_LIST_HEAD(&dev->data_direct_mr_list); ret = mlx5_data_direct_ib_reg(dev, vuid); if (ret) mlx5_ib_free_data_direct_resources(dev); @@ -3768,6 +3769,14 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE( dump_fill_mkey), UA_MANDATORY)); +ADD_UVERBS_ATTRIBUTES_SIMPLE( + mlx5_ib_reg_dmabuf_mr, + UVERBS_OBJECT_MR, + UVERBS_METHOD_REG_DMABUF_MR, + UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + enum mlx5_ib_uapi_reg_dmabuf_flags, + UA_OPTIONAL)); + static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_devx_defs), UAPI_DEF_CHAIN(mlx5_ib_flow_defs), @@ -3776,6 +3785,7 @@ static const struct uapi_definition mlx5_ib_defs[] = { UAPI_DEF_CHAIN(mlx5_ib_dm_defs), UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_DEVICE, &mlx5_ib_query_context), + UAPI_DEF_CHAIN_OBJ_TREE(UVERBS_OBJECT_MR, &mlx5_ib_reg_dmabuf_mr), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_VAR, UAPI_DEF_IS_OBJ_SUPPORTED(var_is_supported)), UAPI_DEF_CHAIN_OBJ_TREE_NAMED(MLX5_IB_OBJECT_UAR), @@ -4283,6 +4293,7 @@ void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev) { mutex_lock(&ibdev->data_direct_lock); + mlx5_ib_revoke_data_direct_mrs(ibdev); ibdev->data_direct_dev = NULL; mutex_unlock(&ibdev->data_direct_lock); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index b151171b8108e..43d377bc4b565 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -670,6 +670,8 @@ struct mlx5_ib_mr { struct mlx5_ib_mkey mmkey; struct ib_umem *umem; + /* The mr is data direct related */ + u8 data_direct :1; union { /* Used only by kernel MRs (umem == NULL) */ @@ -707,6 +709,10 @@ struct mlx5_ib_mr { } odp_destroy; struct ib_odp_counters odp_stats; bool is_odp_implicit; + /* The affilated data direct crossed mr */ + struct mlx5_ib_mr *dd_crossed_mr; + struct list_head dd_node; + u8 revoked :1; }; }; }; @@ -1154,6 +1160,7 @@ struct mlx5_ib_dev { /* protect resources needed as part of reset flow */ spinlock_t reset_flow_resource_lock; struct list_head qp_list; + struct list_head data_direct_mr_list; /* Array with num_ports elements */ struct mlx5_ib_port *port; struct mlx5_sq_bfreg bfreg; @@ -1418,6 +1425,7 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm, void mlx5_ib_data_direct_bind(struct mlx5_ib_dev *ibdev, struct mlx5_data_direct_dev *dev); void mlx5_ib_data_direct_unbind(struct mlx5_ib_dev *ibdev); +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev); #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev); diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c index c571db572cc21..a4197c2a8012a 100644 --- a/drivers/infiniband/hw/mlx5/mr.c +++ b/drivers/infiniband/hw/mlx5/mr.c @@ 
-43,6 +43,7 @@ #include "dm.h" #include "mlx5_ib.h" #include "umr.h" +#include "data_direct.h" enum { MAX_PENDING_REG_MR = 8, @@ -54,7 +55,9 @@ static void create_mkey_callback(int status, struct mlx5_async_work *context); static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate); + unsigned int page_size, bool populate, + int access_mode); +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr); static void set_mkc_access_pd_addr_fields(void *mkc, int acc, u64 start_addr, struct ib_pd *pd) @@ -1126,12 +1129,10 @@ static unsigned int mlx5_umem_dmabuf_default_pgsz(struct ib_umem *umem, static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, struct ib_umem *umem, u64 iova, - int access_flags) + int access_flags, int access_mode) { - struct mlx5r_cache_rb_key rb_key = { - .access_mode = MLX5_MKC_ACCESS_MODE_MTT, - }; struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5r_cache_rb_key rb_key = {}; struct mlx5_cache_ent *ent; struct mlx5_ib_mr *mr; unsigned int page_size; @@ -1144,6 +1145,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, if (WARN_ON(!page_size)) return ERR_PTR(-EINVAL); + rb_key.access_mode = access_mode; rb_key.ndescs = ib_umem_num_dma_blocks(umem, page_size); rb_key.ats = mlx5_umem_needs_ats(dev, umem, access_flags); rb_key.access_flags = get_unchangeable_access_flags(dev, access_flags); @@ -1154,7 +1156,7 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, */ if (!ent) { mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, false); + mr = reg_create(pd, umem, iova, access_flags, page_size, false, access_mode); mutex_unlock(&dev->slow_path_mutex); if (IS_ERR(mr)) return mr; @@ -1175,13 +1177,71 @@ static struct mlx5_ib_mr *alloc_cacheable_mr(struct ib_pd *pd, return mr; } +static struct ib_mr * +reg_create_crossing_vhca_mr(struct ib_pd *pd, u64 iova, u64 length, int access_flags, + u32 crossed_lkey) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + int access_mode = MLX5_MKC_ACCESS_MODE_CROSSING; + struct mlx5_ib_mr *mr; + void *mkc; + int inlen; + u32 *in; + int err; + + if (!MLX5_CAP_GEN(dev->mdev, crossing_vhca_mkey)) + return ERR_PTR(-EOPNOTSUPP); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + inlen = MLX5_ST_SZ_BYTES(create_mkey_in); + in = kvzalloc(inlen, GFP_KERNEL); + if (!in) { + err = -ENOMEM; + goto err_1; + } + + mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); + MLX5_SET(mkc, mkc, crossing_target_vhca_id, + MLX5_CAP_GEN(dev->mdev, vhca_id)); + MLX5_SET(mkc, mkc, translations_octword_size, crossed_lkey); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode & 0x3); + MLX5_SET(mkc, mkc, access_mode_4_2, (access_mode >> 2) & 0x7); + + /* for this crossing mkey IOVA should be 0 and len should be IOVA + len */ + set_mkc_access_pd_addr_fields(mkc, access_flags, 0, pd); + MLX5_SET64(mkc, mkc, len, iova + length); + + MLX5_SET(mkc, mkc, free, 0); + MLX5_SET(mkc, mkc, umr_en, 0); + err = mlx5_ib_create_mkey(dev, &mr->mmkey, in, inlen); + if (err) + goto err_2; + + mr->mmkey.type = MLX5_MKEY_MR; + set_mr_fields(dev, mr, length, access_flags, iova); + mr->ibmr.pd = pd; + kvfree(in); + mlx5_ib_dbg(dev, "crossing mkey = 0x%x\n", mr->mmkey.key); + + return &mr->ibmr; +err_2: + kvfree(in); +err_1: + kfree(mr); + return ERR_PTR(err); +} + /* * If ibmr is NULL it will be allocated by reg_create. * Else, the given ibmr will be used. 
*/ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, u64 iova, int access_flags, - unsigned int page_size, bool populate) + unsigned int page_size, bool populate, + int access_mode) { struct mlx5_ib_dev *dev = to_mdev(pd->device); struct mlx5_ib_mr *mr; @@ -1190,7 +1250,9 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, int inlen; u32 *in; int err; - bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)); + bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg)) && + (access_mode == MLX5_MKC_ACCESS_MODE_MTT); + bool ksm_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); if (!page_size) return ERR_PTR(-EINVAL); @@ -1213,7 +1275,7 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, } pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt); if (populate) { - if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND)) { + if (WARN_ON(access_flags & IB_ACCESS_ON_DEMAND || ksm_mode)) { err = -EINVAL; goto err_2; } @@ -1229,14 +1291,22 @@ static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, struct ib_umem *umem, mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry); set_mkc_access_pd_addr_fields(mkc, access_flags, iova, populate ? pd : dev->umrc.pd); + /* In case a data direct flow, overwrite the pdn field by its internal kernel PD */ + if (umem->is_dmabuf && ksm_mode) + MLX5_SET(mkc, mkc, pd, dev->ddr.pdn); + MLX5_SET(mkc, mkc, free, !populate); - MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT); + MLX5_SET(mkc, mkc, access_mode_1_0, access_mode); MLX5_SET(mkc, mkc, umr_en, 1); MLX5_SET64(mkc, mkc, len, umem->length); MLX5_SET(mkc, mkc, bsf_octword_size, 0); - MLX5_SET(mkc, mkc, translations_octword_size, - get_octo_len(iova, umem->length, mr->page_shift)); + if (ksm_mode) + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift) * 2); + else + MLX5_SET(mkc, mkc, translations_octword_size, + get_octo_len(iova, umem->length, mr->page_shift)); MLX5_SET(mkc, mkc, log_page_size, mr->page_shift); if (mlx5_umem_needs_ats(dev, umem, access_flags)) MLX5_SET(mkc, mkc, ma_translation_mode, 1); @@ -1373,13 +1443,15 @@ static struct ib_mr *create_real_mr(struct ib_pd *pd, struct ib_umem *umem, xlt_with_umr = mlx5r_umr_can_load_pas(dev, umem->length); if (xlt_with_umr) { - mr = alloc_cacheable_mr(pd, umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); } else { unsigned int page_size = mlx5_umem_find_best_pgsz( umem, mkc, log_page_size, 0, iova); mutex_lock(&dev->slow_path_mutex); - mr = reg_create(pd, umem, iova, access_flags, page_size, true); + mr = reg_create(pd, umem, iova, access_flags, page_size, + true, MLX5_MKC_ACCESS_MODE_MTT); mutex_unlock(&dev->slow_path_mutex); } if (IS_ERR(mr)) { @@ -1442,7 +1514,8 @@ static struct ib_mr *create_user_odp_mr(struct ib_pd *pd, u64 start, u64 length, if (IS_ERR(odp)) return ERR_CAST(odp); - mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags); + mr = alloc_cacheable_mr(pd, &odp->umem, iova, access_flags, + MLX5_MKC_ACCESS_MODE_MTT); if (IS_ERR(mr)) { ib_umem_release(&odp->umem); return ERR_CAST(mr); @@ -1505,60 +1578,155 @@ static struct dma_buf_attach_ops mlx5_ib_dmabuf_attach_ops = { .move_notify = mlx5_ib_dmabuf_invalidate_cb, }; +static struct ib_mr * +reg_user_mr_dmabuf(struct ib_pd *pd, struct device *dma_device, + u64 offset, u64 length, u64 virt_addr, + int fd, int access_flags, int access_mode) +{ + bool pinned_mode = (access_mode == MLX5_MKC_ACCESS_MODE_KSM); + 
struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_ib_mr *mr = NULL; + struct ib_umem_dmabuf *umem_dmabuf; + int err; + + err = mlx5r_umr_resource_init(dev); + if (err) + return ERR_PTR(err); + + if (!pinned_mode) + umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, + offset, length, fd, + access_flags, + &mlx5_ib_dmabuf_attach_ops); + else + umem_dmabuf = ib_umem_dmabuf_get_pinned_with_dma_device(&dev->ib_dev, + dma_device, offset, length, + fd, access_flags); + + if (IS_ERR(umem_dmabuf)) { + mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", + PTR_ERR(umem_dmabuf)); + return ERR_CAST(umem_dmabuf); + } + + mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, + access_flags, access_mode); + if (IS_ERR(mr)) { + ib_umem_release(&umem_dmabuf->umem); + return ERR_CAST(mr); + } + + mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); + + atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); + umem_dmabuf->private = mr; + if (!pinned_mode) { + err = mlx5r_store_odp_mkey(dev, &mr->mmkey); + if (err) + goto err_dereg_mr; + } else { + mr->data_direct = true; + } + + err = mlx5_ib_init_dmabuf_mr(mr); + if (err) + goto err_dereg_mr; + return &mr->ibmr; + +err_dereg_mr: + __mlx5_ib_dereg_mr(&mr->ibmr); + return ERR_PTR(err); +} + +static struct ib_mr * +reg_user_mr_dmabuf_by_data_direct(struct ib_pd *pd, u64 offset, + u64 length, u64 virt_addr, + int fd, int access_flags) +{ + struct mlx5_ib_dev *dev = to_mdev(pd->device); + struct mlx5_data_direct_dev *data_direct_dev; + struct ib_mr *crossing_mr; + struct ib_mr *crossed_mr; + int ret = 0; + + /* As of HW behaviour the IOVA must be page aligned in KSM mode */ + if (!PAGE_ALIGNED(virt_addr) || (access_flags & IB_ACCESS_ON_DEMAND)) + return ERR_PTR(-EOPNOTSUPP); + + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -EINVAL; + goto end; + } + + /* The device's 'data direct mkey' was created without RO flags to + * simplify things and allow for a single mkey per device. + * Since RO is not a must, mask it out accordingly. + */ + access_flags &= ~IB_ACCESS_RELAXED_ORDERING; + crossed_mr = reg_user_mr_dmabuf(pd, &data_direct_dev->pdev->dev, + offset, length, virt_addr, fd, + access_flags, MLX5_MKC_ACCESS_MODE_KSM); + if (IS_ERR(crossed_mr)) { + ret = PTR_ERR(crossed_mr); + goto end; + } + + mutex_lock(&dev->slow_path_mutex); + crossing_mr = reg_create_crossing_vhca_mr(pd, virt_addr, length, access_flags, + crossed_mr->lkey); + mutex_unlock(&dev->slow_path_mutex); + if (IS_ERR(crossing_mr)) { + __mlx5_ib_dereg_mr(crossed_mr); + ret = PTR_ERR(crossing_mr); + goto end; + } + + list_add_tail(&to_mmr(crossed_mr)->dd_node, &dev->data_direct_mr_list); + to_mmr(crossing_mr)->dd_crossed_mr = to_mmr(crossed_mr); + to_mmr(crossing_mr)->data_direct = true; +end: + mutex_unlock(&dev->data_direct_lock); + return ret ? 
ERR_PTR(ret) : crossing_mr; +} + struct ib_mr *mlx5_ib_reg_user_mr_dmabuf(struct ib_pd *pd, u64 offset, u64 length, u64 virt_addr, int fd, int access_flags, struct uverbs_attr_bundle *attrs) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_ib_mr *mr = NULL; - struct ib_umem_dmabuf *umem_dmabuf; + int mlx5_access_flags = 0; int err; if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || !IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) return ERR_PTR(-EOPNOTSUPP); + if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS)) { + err = uverbs_get_flags32(&mlx5_access_flags, attrs, + MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS, + MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT); + if (err) + return ERR_PTR(err); + } + mlx5_ib_dbg(dev, - "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x\n", - offset, virt_addr, length, fd, access_flags); + "offset 0x%llx, virt_addr 0x%llx, length 0x%llx, fd %d, access_flags 0x%x, mlx5_access_flags 0x%x\n", + offset, virt_addr, length, fd, access_flags, mlx5_access_flags); /* dmabuf requires xlt update via umr to work. */ if (!mlx5r_umr_can_load_pas(dev, length)) return ERR_PTR(-EINVAL); - umem_dmabuf = ib_umem_dmabuf_get(&dev->ib_dev, offset, length, fd, - access_flags, - &mlx5_ib_dmabuf_attach_ops); - if (IS_ERR(umem_dmabuf)) { - mlx5_ib_dbg(dev, "umem_dmabuf get failed (%ld)\n", - PTR_ERR(umem_dmabuf)); - return ERR_CAST(umem_dmabuf); - } - - mr = alloc_cacheable_mr(pd, &umem_dmabuf->umem, virt_addr, - access_flags); - if (IS_ERR(mr)) { - ib_umem_release(&umem_dmabuf->umem); - return ERR_CAST(mr); - } + if (mlx5_access_flags & MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT) + return reg_user_mr_dmabuf_by_data_direct(pd, offset, length, virt_addr, + fd, access_flags); - mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key); - - atomic_add(ib_umem_num_pages(mr->umem), &dev->mdev->priv.reg_pages); - umem_dmabuf->private = mr; - err = mlx5r_store_odp_mkey(dev, &mr->mmkey); - if (err) - goto err_dereg_mr; - - err = mlx5_ib_init_dmabuf_mr(mr); - if (err) - goto err_dereg_mr; - return &mr->ibmr; - -err_dereg_mr: - mlx5_ib_dereg_mr(&mr->ibmr, NULL); - return ERR_PTR(err); + return reg_user_mr_dmabuf(pd, pd->device->dma_device, + offset, length, virt_addr, + fd, access_flags, MLX5_MKC_ACCESS_MODE_MTT); } /* @@ -1656,7 +1824,7 @@ struct ib_mr *mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start, struct mlx5_ib_mr *mr = to_mmr(ib_mr); int err; - if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM)) + if (!IS_ENABLED(CONFIG_INFINIBAND_USER_MEM) || mr->data_direct) return ERR_PTR(-EOPNOTSUPP); mlx5_ib_dbg( @@ -1784,7 +1952,7 @@ mlx5_alloc_priv_descs(struct ib_device *device, static void mlx5_free_priv_descs(struct mlx5_ib_mr *mr) { - if (!mr->umem && mr->descs) { + if (!mr->umem && !mr->data_direct && mr->descs) { struct ib_device *device = mr->ibmr.device; int size = mr->max_descs * mr->desc_size; struct mlx5_ib_dev *dev = to_mdev(device); @@ -1838,6 +2006,34 @@ static int cache_ent_find_and_store(struct mlx5_ib_dev *dev, return ret; } +static int mlx5_ib_revoke_data_direct_mr(struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); + struct ib_umem_dmabuf *umem_dmabuf = to_ib_umem_dmabuf(mr->umem); + int err; + + lockdep_assert_held(&dev->data_direct_lock); + mr->revoked = true; + err = mlx5r_umr_revoke_mr(mr); + if (WARN_ON(err)) + return err; + + ib_umem_dmabuf_revoke(umem_dmabuf); + return 0; +} + +void mlx5_ib_revoke_data_direct_mrs(struct mlx5_ib_dev *dev) +{ + struct mlx5_ib_mr *mr, *next; + + 
lockdep_assert_held(&dev->data_direct_lock); + + list_for_each_entry_safe(mr, next, &dev->data_direct_mr_list, dd_node) { + list_del(&mr->dd_node); + mlx5_ib_revoke_data_direct_mr(mr); + } +} + static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) { struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device); @@ -1855,7 +2051,7 @@ static int mlx5_revoke_mr(struct mlx5_ib_mr *mr) return destroy_mkey(dev, mr); } -int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +static int __mlx5_ib_dereg_mr(struct ib_mr *ibmr) { struct mlx5_ib_mr *mr = to_mmr(ibmr); struct mlx5_ib_dev *dev = to_mdev(ibmr->device); @@ -1922,6 +2118,36 @@ int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) return 0; } +static int dereg_crossing_data_direct_mr(struct mlx5_ib_dev *dev, + struct mlx5_ib_mr *mr) +{ + struct mlx5_ib_mr *dd_crossed_mr = mr->dd_crossed_mr; + int ret; + + ret = __mlx5_ib_dereg_mr(&mr->ibmr); + if (ret) + return ret; + + mutex_lock(&dev->data_direct_lock); + if (!dd_crossed_mr->revoked) + list_del(&dd_crossed_mr->dd_node); + + ret = __mlx5_ib_dereg_mr(&dd_crossed_mr->ibmr); + mutex_unlock(&dev->data_direct_lock); + return ret; +} + +int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mlx5_ib_mr *mr = to_mmr(ibmr); + struct mlx5_ib_dev *dev = to_mdev(ibmr->device); + + if (mr->data_direct) + return dereg_crossing_data_direct_mr(dev, mr); + + return __mlx5_ib_dereg_mr(ibmr); +} + static void mlx5_set_umr_free_mkey(struct ib_pd *pd, u32 *in, int ndescs, int access_mode, int page_shift) { diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 4a04cbc5b78a4..0f76d681cdc3f 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -712,7 +712,10 @@ static int pagefault_dmabuf_mr(struct mlx5_ib_mr *mr, size_t bcnt, ib_umem_dmabuf_unmap_pages(umem_dmabuf); err = -EINVAL; } else { - err = mlx5r_umr_update_mr_pas(mr, xlt_flags); + if (mr->data_direct) + err = mlx5r_umr_update_data_direct_ksm_pas(mr, xlt_flags); + else + err = mlx5r_umr_update_mr_pas(mr, xlt_flags); } dma_resv_unlock(umem_dmabuf->attach->dmabuf->resv); diff --git a/drivers/infiniband/hw/mlx5/umr.c b/drivers/infiniband/hw/mlx5/umr.c index e76142f6fa888..824aa39219b14 100644 --- a/drivers/infiniband/hw/mlx5/umr.c +++ b/drivers/infiniband/hw/mlx5/umr.c @@ -603,44 +603,47 @@ static void mlx5r_umr_final_update_xlt(struct mlx5_ib_dev *dev, wqe->data_seg.byte_count = cpu_to_be32(sg->length); } -/* - * Send the DMA list to the HW for a normal MR using UMR. - * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP - * flag may be used. - */ -int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +static int +_mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags, bool dd) { + size_t ent_size = dd ? 
sizeof(struct mlx5_ksm) : sizeof(struct mlx5_mtt); struct mlx5_ib_dev *dev = mr_to_mdev(mr); struct device *ddev = &dev->mdev->pdev->dev; struct mlx5r_umr_wqe wqe = {}; struct ib_block_iter biter; + struct mlx5_ksm *cur_ksm; struct mlx5_mtt *cur_mtt; size_t orig_sg_length; - struct mlx5_mtt *mtt; size_t final_size; + void *curr_entry; struct ib_sge sg; + void *entry; u64 offset = 0; int err = 0; - if (WARN_ON(mr->umem->is_odp)) - return -EINVAL; - - mtt = mlx5r_umr_create_xlt( - dev, &sg, ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), - sizeof(*mtt), flags); - if (!mtt) + entry = mlx5r_umr_create_xlt(dev, &sg, + ib_umem_num_dma_blocks(mr->umem, 1 << mr->page_shift), + ent_size, flags); + if (!entry) return -ENOMEM; orig_sg_length = sg.length; - mlx5r_umr_set_update_xlt_ctrl_seg(&wqe.ctrl_seg, flags, &sg); mlx5r_umr_set_update_xlt_mkey_seg(dev, &wqe.mkey_seg, mr, mr->page_shift); + if (dd) { + /* Use the data direct internal kernel PD */ + MLX5_SET(mkc, &wqe.mkey_seg, pd, dev->ddr.pdn); + cur_ksm = entry; + } else { + cur_mtt = entry; + } + mlx5r_umr_set_update_xlt_data_seg(&wqe.data_seg, &sg); - cur_mtt = mtt; + curr_entry = entry; rdma_umem_for_each_dma_block(mr->umem, &biter, BIT(mr->page_shift)) { - if (cur_mtt == (void *)mtt + sg.length) { + if (curr_entry == entry + sg.length) { dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -652,23 +655,31 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) DMA_TO_DEVICE); offset += sg.length; mlx5r_umr_update_offset(&wqe.ctrl_seg, offset); - - cur_mtt = mtt; + if (dd) + cur_ksm = entry; + else + cur_mtt = entry; } - cur_mtt->ptag = - cpu_to_be64(rdma_block_iter_dma_address(&biter) | - MLX5_IB_MTT_PRESENT); - - if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) - cur_mtt->ptag = 0; - - cur_mtt++; + if (dd) { + cur_ksm->va = cpu_to_be64(rdma_block_iter_dma_address(&biter)); + cur_ksm->key = cpu_to_be32(dev->ddr.mkey); + cur_ksm++; + curr_entry = cur_ksm; + } else { + cur_mtt->ptag = + cpu_to_be64(rdma_block_iter_dma_address(&biter) | + MLX5_IB_MTT_PRESENT); + if (mr->umem->is_dmabuf && (flags & MLX5_IB_UPD_XLT_ZAP)) + cur_mtt->ptag = 0; + cur_mtt++; + curr_entry = cur_mtt; + } } - final_size = (void *)cur_mtt - (void *)mtt; + final_size = curr_entry - entry; sg.length = ALIGN(final_size, MLX5_UMR_FLEX_ALIGNMENT); - memset(cur_mtt, 0, sg.length - final_size); + memset(curr_entry, 0, sg.length - final_size); mlx5r_umr_final_update_xlt(dev, &wqe, mr, &sg, flags); dma_sync_single_for_device(ddev, sg.addr, sg.length, DMA_TO_DEVICE); @@ -676,10 +687,32 @@ int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) err: sg.length = orig_sg_length; - mlx5r_umr_unmap_free_xlt(dev, mtt, &sg); + mlx5r_umr_unmap_free_xlt(dev, entry, &sg); return err; } +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + /* No invalidation flow is expected */ + if (WARN_ON(!mr->umem->is_dmabuf) || (flags & MLX5_IB_UPD_XLT_ZAP)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, true); +} + +/* + * Send the DMA list to the HW for a normal MR using UMR. + * Dmabuf MR is handled in a similar way, except that the MLX5_IB_UPD_XLT_ZAP + * flag may be used. 
+ */ +int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags) +{ + if (WARN_ON(mr->umem->is_odp)) + return -EINVAL; + + return _mlx5r_umr_update_mr_pas(mr, flags, false); +} + static bool umr_can_use_indirect_mkey(struct mlx5_ib_dev *dev) { return !MLX5_CAP_GEN(dev->mdev, umr_indirect_mkey_disabled); diff --git a/drivers/infiniband/hw/mlx5/umr.h b/drivers/infiniband/hw/mlx5/umr.h index 3799bb758e490..e3f80ba7c7814 100644 --- a/drivers/infiniband/hw/mlx5/umr.h +++ b/drivers/infiniband/hw/mlx5/umr.h @@ -92,6 +92,7 @@ int mlx5r_umr_revoke_mr(struct mlx5_ib_mr *mr); int mlx5r_umr_rereg_pd_access(struct mlx5_ib_mr *mr, struct ib_pd *pd, int access_flags); int mlx5r_umr_update_mr_pas(struct mlx5_ib_mr *mr, unsigned int flags); +int mlx5r_umr_update_data_direct_ksm_pas(struct mlx5_ib_mr *mr, unsigned int flags); int mlx5r_umr_update_xlt(struct mlx5_ib_mr *mr, u64 idx, int npages, int page_shift, int flags); diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 595edad03dfe5..278372042469d 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -270,6 +270,10 @@ enum mlx5_ib_device_query_context_attrs { MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT), }; +enum mlx5_ib_reg_dmabuf_mr_attrs { + MLX5_IB_ATTR_REG_DMABUF_MR_ACCESS_FLAGS = (1U << UVERBS_ID_NS_SHIFT), +}; + #define MLX5_IB_DW_MATCH_PARAM 0xA0 struct mlx5_ib_match_params { diff --git a/include/uapi/rdma/mlx5_user_ioctl_verbs.h b/include/uapi/rdma/mlx5_user_ioctl_verbs.h index 3189c7f08d178..7c233df475e71 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_verbs.h +++ b/include/uapi/rdma/mlx5_user_ioctl_verbs.h @@ -54,6 +54,10 @@ enum mlx5_ib_uapi_flow_action_packet_reformat_type { MLX5_IB_UAPI_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL = 0x3, }; +enum mlx5_ib_uapi_reg_dmabuf_flags { + MLX5_IB_UAPI_REG_DMABUF_ACCESS_DATA_DIRECT = 1 << 0, +}; + struct mlx5_ib_uapi_devx_async_cmd_hdr { __aligned_u64 wr_id; __u8 out_data[]; From 39be027491be79eea40212b48c8aa8c4364568a6 Mon Sep 17 00:00:00 2001 From: Yishai Hadas Date: Thu, 1 Aug 2024 15:05:17 +0300 Subject: [PATCH 347/352] RDMA/mlx5: Introduce GET_DATA_DIRECT_SYSFS_PATH ioctl Introduce the 'GET_DATA_DIRECT_SYSFS_PATH' ioctl to return the sysfs path of the affiliated 'data direct' device for a given device. Signed-off-by: Yishai Hadas Link: https://patch.msgid.link/403745463e0ef52adbef681ff09aa6a29a756352.1722512548.git.leon@kernel.org Signed-off-by: Leon Romanovsky (cherry-picked from commit ec7ad6530909983c8736c80af46e3529ce7bab55 linux) Signed-off-by: Tushar Dave Acked-by: Matthew R. Ochs Acked-by: Jamie Nguyen Acked-by: Carol L Soto Signed-off-by: Matthew R. 
Ochs --- drivers/infiniband/hw/mlx5/std_types.c | 55 +++++++++++++++++++++++- include/uapi/rdma/mlx5_user_ioctl_cmds.h | 5 +++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/std_types.c b/drivers/infiniband/hw/mlx5/std_types.c index bbfcce3bdc84e..ffeb1e1a15389 100644 --- a/drivers/infiniband/hw/mlx5/std_types.c +++ b/drivers/infiniband/hw/mlx5/std_types.c @@ -10,6 +10,7 @@ #include #include #include "mlx5_ib.h" +#include "data_direct.h" #define UVERBS_MODULE_NAME mlx5_ib #include @@ -183,6 +184,50 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_QUERY_PORT)( sizeof(info)); } +static int UVERBS_HANDLER(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)( + struct uverbs_attr_bundle *attrs) +{ + struct mlx5_data_direct_dev *data_direct_dev; + struct mlx5_ib_ucontext *c; + struct mlx5_ib_dev *dev; + int out_len = uverbs_attr_get_len(attrs, + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH); + u32 dev_path_len; + char *dev_path; + int ret; + + c = to_mucontext(ib_uverbs_get_ucontext(attrs)); + if (IS_ERR(c)) + return PTR_ERR(c); + dev = to_mdev(c->ibucontext.device); + mutex_lock(&dev->data_direct_lock); + data_direct_dev = dev->data_direct_dev; + if (!data_direct_dev) { + ret = -ENODEV; + goto end; + } + + dev_path = kobject_get_path(&data_direct_dev->device->kobj, GFP_KERNEL); + if (!dev_path) { + ret = -ENOMEM; + goto end; + } + + dev_path_len = strlen(dev_path) + 1; + if (dev_path_len > out_len) { + ret = -ENOSPC; + goto end; + } + + ret = uverbs_copy_to(attrs, MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, dev_path, + dev_path_len); + kfree(dev_path); + +end: + mutex_unlock(&dev->data_direct_lock); + return ret; +} + DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_QUERY_PORT, UVERBS_ATTR_PTR_IN(MLX5_IB_ATTR_QUERY_PORT_PORT_NUM, @@ -193,9 +238,17 @@ DECLARE_UVERBS_NAMED_METHOD( reg_c0), UA_MANDATORY)); +DECLARE_UVERBS_NAMED_METHOD( + MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_PTR_OUT( + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH, + UVERBS_ATTR_MIN_SIZE(0), + UA_MANDATORY)); + ADD_UVERBS_METHODS(mlx5_ib_device, UVERBS_OBJECT_DEVICE, - &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT)); + &UVERBS_METHOD(MLX5_IB_METHOD_QUERY_PORT), + &UVERBS_METHOD(MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH)); DECLARE_UVERBS_NAMED_METHOD( MLX5_IB_METHOD_PD_QUERY, diff --git a/include/uapi/rdma/mlx5_user_ioctl_cmds.h b/include/uapi/rdma/mlx5_user_ioctl_cmds.h index 278372042469d..3d5cc714f2ed3 100644 --- a/include/uapi/rdma/mlx5_user_ioctl_cmds.h +++ b/include/uapi/rdma/mlx5_user_ioctl_cmds.h @@ -344,6 +344,7 @@ enum mlx5_ib_pd_methods { enum mlx5_ib_device_methods { MLX5_IB_METHOD_QUERY_PORT = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH, }; enum mlx5_ib_query_port_attrs { @@ -351,4 +352,8 @@ enum mlx5_ib_query_port_attrs { MLX5_IB_ATTR_QUERY_PORT, }; +enum mlx5_ib_get_data_direct_sysfs_path_attrs { + MLX5_IB_ATTR_GET_DATA_DIRECT_SYSFS_PATH = (1U << UVERBS_ID_NS_SHIFT), +}; + #endif From 4186f260e57d35a73d4905f932e553c7fdf731cc Mon Sep 17 00:00:00 2001 From: Brad Figg Date: Fri, 18 Oct 2024 18:56:01 -0700 Subject: [PATCH 348/352] UBUNTU: Start new release Ignore: yes Signed-off-by: Brad Figg --- debian.nvidia-adv/changelog | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/debian.nvidia-adv/changelog b/debian.nvidia-adv/changelog index 672012c7d9fde..b3b400869f8f3 100644 --- a/debian.nvidia-adv/changelog +++ b/debian.nvidia-adv/changelog @@ -1,3 +1,11 @@ +linux-nvidia-adv (6.8.0-1003.3) UNRELEASED; urgency=medium + + CHANGELOG: Do not edit directly. 
Autogenerated at release. + CHANGELOG: Use the printchanges target to see the curent changes. + CHANGELOG: Use the insertchanges target to create the final log. + + -- Brad Figg Fri, 18 Oct 2024 18:56:01 -0700 + linux-nvidia-adv (6.8.0-1002.2) noble; urgency=medium * NVIDIA: SAUCE: acpi/prmt: find block with specific type (LP: #2081874) From cff35f3c8c632e3159953b13b071d315fea6567a Mon Sep 17 00:00:00 2001 From: Brad Figg Date: Fri, 18 Oct 2024 18:57:52 -0700 Subject: [PATCH 349/352] NVIDIA: [Config] Annotations update Signed-off-by: Brad Figg --- debian.nvidia-adv/config/annotations | 41 ++++++++++++++-------------- 1 file changed, 21 insertions(+), 20 deletions(-) diff --git a/debian.nvidia-adv/config/annotations b/debian.nvidia-adv/config/annotations index 8637342f01cc5..8d2baabee5d01 100644 --- a/debian.nvidia-adv/config/annotations +++ b/debian.nvidia-adv/config/annotations @@ -168,38 +168,39 @@ CONFIG_UBUNTU_ODM_DRIVERS note<'{Disable all Ubuntu ODM dr CONFIG_ULTRASOC_SMB policy<{'arm64': 'n'}> CONFIG_ULTRASOC_SMB note<'{Required for Grace enablement}'> -# ---- Annotations to support vSMMU/vCMDQ/vEGM/GPU Passthrough ---- -CONFIG_FAULT_INJECTION policy<{'arm64': 'y'}> -CONFIG_IOMMUFD policy<{'arm64': 'y'}> -CONFIG_IOMMUFD_TEST policy<{'arm64': 'y'}> -CONFIG_IOMMUFD_VFIO_CONTAINER policy<{'arm64': 'y'}> -CONFIG_TEGRA241_CMDQV policy<{'arm64': 'y'}> -CONFIG_VFIO_CONTAINER policy<{'arm64': 'n'}> -CONFIG_VFIO_IOMMU_TYPE1 policy<{'arm64': '-'}> -CONFIG_NVGRACE_GPU_VFIO_PCI policy<{'arm64': 'm'}> -CONFIG_NVGRACE_EGM policy<{'arm64': 'm'}> -CONFIG_IOMMU_IOPF policy<{'amd64': 'y', 'arm64': 'y'}> -CONFIG_SCSI_UFS_FAULT_INJECTION policy<{'arm64': 'n'}> -CONFIG_FAILSLAB policy<{'arm64': 'n'}> -CONFIG_FAIL_PAGE_ALLOC policy<{'arm64': 'n'}> -CONFIG_FAULT_INJECTION_USERCOPY policy<{'arm64': 'n'}> -CONFIG_FAIL_MAKE_REQUEST policy<{'arm64': 'n'}> -CONFIG_FAIL_IO_TIMEOUT policy<{'arm64': 'n'}> -CONFIG_FAIL_FUTEX policy<{'arm64': 'n'}> -CONFIG_FAULT_INJECTION_DEBUG_FS policy<{'arm64': 'n'}> -CONFIG_FAULT_INJECTION_CONFIGFS policy<{'arm64': 'n'}> # ---- Annotations without notes ---- CONFIG_AX88796B_RUST_PHY policy<{'amd64': '-'}> CONFIG_BCH policy<{'amd64': 'm', 'arm64': 'y'}> CONFIG_BINDGEN_VERSION_TEXT policy<{'amd64': '-'}> +CONFIG_CC_VERSION_TEXT policy<{'amd64': '"x86_64-linux-gnu-gcc-13 (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"', 'arm64': '"aarch64-linux-gnu-gcc-13 (Ubuntu 13.3.0-6ubuntu2~24.04) 13.3.0"'}> CONFIG_CONSTRUCTORS policy<{'amd64': '-'}> CONFIG_EFI_CAPSULE_LOADER policy<{'amd64': 'm', 'arm64': 'y'}> +CONFIG_FAILSLAB policy<{'arm64': 'n'}> +CONFIG_FAIL_FUTEX policy<{'arm64': 'n'}> +CONFIG_FAIL_IO_TIMEOUT policy<{'arm64': 'n'}> +CONFIG_FAIL_MAKE_REQUEST policy<{'arm64': 'n'}> +CONFIG_FAIL_PAGE_ALLOC policy<{'arm64': 'n'}> +CONFIG_FAULT_INJECTION policy<{'amd64': 'n', 'arm64': 'y'}> +CONFIG_FAULT_INJECTION_CONFIGFS policy<{'arm64': 'n'}> +CONFIG_FAULT_INJECTION_DEBUG_FS policy<{'arm64': 'n'}> +CONFIG_FAULT_INJECTION_USERCOPY policy<{'arm64': 'n'}> +CONFIG_GCC_VERSION policy<{'amd64': '130300', 'arm64': '130300'}> +CONFIG_IOMMUFD policy<{'amd64': 'm', 'arm64': 'y'}> +CONFIG_IOMMUFD_TEST policy<{'arm64': 'y'}> +CONFIG_IOMMUFD_VFIO_CONTAINER policy<{'arm64': 'y'}> +CONFIG_IOMMU_IOPF policy<{'amd64': 'y', 'arm64': 'y'}> CONFIG_MTD_NAND_CORE policy<{'amd64': 'm', 'arm64': 'y'}> +CONFIG_NVGRACE_EGM policy<{'arm64': 'm'}> +CONFIG_NVGRACE_GPU_VFIO_PCI policy<{'arm64': 'm'}> CONFIG_RUSTC_VERSION_TEXT policy<{'amd64': '-'}> CONFIG_RUST_BUILD_ASSERT_ALLOW policy<{'amd64': '-'}> 
CONFIG_RUST_DEBUG_ASSERTIONS policy<{'amd64': '-'}> CONFIG_RUST_OVERFLOW_CHECKS policy<{'amd64': '-'}> CONFIG_RUST_PHYLIB_ABSTRACTIONS policy<{'amd64': '-'}> CONFIG_SAMPLES_RUST policy<{'amd64': '-'}> +CONFIG_SCSI_UFS_FAULT_INJECTION policy<{'arm64': 'n'}> +CONFIG_TEGRA241_CMDQV policy<{'arm64': 'y'}> +CONFIG_VFIO_CONTAINER policy<{'amd64': 'y', 'arm64': 'n'}> +CONFIG_VFIO_IOMMU_TYPE1 policy<{'amd64': 'm', 'arm64': '-'}> From add7f35c15820e69ef3112f9fb8b439473e5e31b Mon Sep 17 00:00:00 2001 From: Brad Figg Date: Fri, 18 Oct 2024 19:00:18 -0700 Subject: [PATCH 350/352] UBUNTU: linux-nvidia-adv-6.8.0-1003.3 Signed-off-by: Brad Figg --- debian.nvidia-adv/changelog | 388 +++++++++++++++++++++++++++++++++- debian.nvidia-adv/reconstruct | 3 + 2 files changed, 386 insertions(+), 5 deletions(-) diff --git a/debian.nvidia-adv/changelog b/debian.nvidia-adv/changelog index b3b400869f8f3..04ce286c517c7 100644 --- a/debian.nvidia-adv/changelog +++ b/debian.nvidia-adv/changelog @@ -1,10 +1,388 @@ -linux-nvidia-adv (6.8.0-1003.3) UNRELEASED; urgency=medium +linux-nvidia-adv (6.8.0-1003.3) noble; urgency=medium - CHANGELOG: Do not edit directly. Autogenerated at release. - CHANGELOG: Use the printchanges target to see the curent changes. - CHANGELOG: Use the insertchanges target to create the final log. + * NVIDIA: SAUCE: acpi/prmt: find block with specific type (LP: #2081874) + - NVIDIA: SAUCE: acpi/prmt: find block with specific type + + * Pull-request to address ARM SMMU issue (LP: #2031320) + - NVIDIA: SAUCE: iommu/arm-smmu-v3: Allow default substream bypass with a + pasid support + + * Pull request: mm: fix old/young bit handling in the faulting path of + set_pte_range() (LP: #2075396) + - mm: fix old/young bit handling in the faulting path + + * Pull-request:Add a kernel command-line option 'config_acs' to directly + control all the ACS bits for specific devices (LP: #2073811) + - PCI: Extend ACS configurability + + * PR for: "IB/mlx5: Use __iowrite64_copy() for write combining stores" + (LP: #2071655) + - x86: Stop using weak symbols for __iowrite32_copy() + - s390: Implement __iowrite32_copy() + - s390: Stop using weak symbols for __iowrite64_copy() + - arm64/io: Provide a WC friendly __iowriteXX_copy() + - net: hns3: Remove io_stop_wc() calls after __iowrite64_copy() + + * PR for: "PCI: Clear Secondary Status errors after enumeration" + (LP: #2071654) + - PCI: Clear Secondary Status errors after enumeration + + * mlxbf_pmc: bring in latest 6.8 upstream commits (LP: #2069777) + - platform/mellanox: mlxbf-pmc: Replace uintN_t with kernel-style types + - platform/mellanox: mlxbf-pmc: Cleanup signed/unsigned mix-up + - platform/mellanox: mlxbf-pmc: mlxbf_pmc_event_list(): make size ptr optional + - platform/mellanox: mlxbf-pmc: Ignore unsupported performance blocks + - platform/mellanox: mlxbf-pmc: fix signedness bugs + + * mlxbf_gige: bring in latest 6.x upstream commits (LP: #2068067) + - mlxbf_gige: add support to display pause frame counters + + * Export kernel symbols required for NVIDIA GDS (LP: #2068544) + - NVIDIA: SAUCE: NFS: Export nvfs register and unregister functions as GPL + - NVIDIA: SAUCE: NVMe/NVMeoF: Export nvfs register and unregister functions as + GPL + + * linux-nvidia-6.5_6.5.0-1014.14 breaks with earlier BIOS release, and + modeset/resolutions are wrong (LP: #2061930) // Blacklist coresight_etm4x + (LP: #2067106) + - [Packaging] blacklist coresight_etm4x + + * backport arm64 THP improvements from 6.9 (LP: #2059316) + - arm64/mm: make set_ptes() robust when OAs cross 48-bit 
boundary + - arm/pgtable: define PFN_PTE_SHIFT + - nios2/pgtable: define PFN_PTE_SHIFT + - powerpc/pgtable: define PFN_PTE_SHIFT + - riscv/pgtable: define PFN_PTE_SHIFT + - s390/pgtable: define PFN_PTE_SHIFT + - sparc/pgtable: define PFN_PTE_SHIFT + - mm/pgtable: make pte_next_pfn() independent of set_ptes() + - arm/mm: use pte_next_pfn() in set_ptes() + - powerpc/mm: use pte_next_pfn() in set_ptes() + - mm/memory: factor out copying the actual PTE in copy_present_pte() + - mm/memory: pass PTE to copy_present_pte() + - mm/memory: optimize fork() with PTE-mapped THP + - mm/memory: ignore dirty/accessed/soft-dirty bits in folio_pte_batch() + - mm/memory: ignore writable bit in folio_pte_batch() + - mm: clarify the spec for set_ptes() + - mm: thp: batch-collapse PMD with set_ptes() + - mm: introduce pte_advance_pfn() and use for pte_next_pfn() + - arm64/mm: convert pte_next_pfn() to pte_advance_pfn() + - x86/mm: convert pte_next_pfn() to pte_advance_pfn() + - mm: tidy up pte_next_pfn() definition + - arm64/mm: convert READ_ONCE(*ptep) to ptep_get(ptep) + - arm64/mm: convert set_pte_at() to set_ptes(..., 1) + - arm64/mm: convert ptep_clear() to ptep_get_and_clear() + - arm64/mm: new ptep layer to manage contig bit + - arm64/mm: dplit __flush_tlb_range() to elide trailing DSB + - NVIDIA: [Config] arm64: ARM64_CONTPTE=y + - arm64/mm: wire up PTE_CONT for user mappings + - arm64/mm: implement new wrprotect_ptes() batch API + - arm64/mm: implement new [get_and_]clear_full_ptes() batch APIs + - mm: add pte_batch_hint() to reduce scanning in folio_pte_batch() + - arm64/mm: implement pte_batch_hint() + - arm64/mm: __always_inline to improve fork() perf + - arm64/mm: automatically fold contpte mappings + - arm64/mm: export contpte symbols only to GPL users + - arm64/mm: improve comment in contpte_ptep_get_lockless() + + * Enable GDS in the 6.8 based linux-nvidia kernel (LP: #2059814) + - NVIDIA: SAUCE: Patch NFS driver to support GDS with 6.8 Kernel + - NVIDIA: SAUCE: NVMe/MVMEeOF: Patch NVMe/NVMeOF driver to support GDS on + Linux 6.8 Kernel + + * Miscellaneous upstream changes + - NVIDIA: linux-nvidia-adv-6.8.0-1002.2 + - Revert "NVIDIA: SAUCE: iommu/arm-smmu-v3: Allow default substream bypass + with a pasid support" + - vfio: replace CONFIG_HAVE_KVM with IS_ENABLED(CONFIG_KVM) + - iommu/iova: Tidy up iova_cache_get() failure + - iommu/iova: Reorganise some code + - iommu/iova: use named kmem_cache for iova magazines + - iommu/ipmmu-vmsa: Minor cleanups + - iommu: Introduce iommu_group_mutex_assert() + - iommu: Move iommu fault data to linux/iommu.h + - iommu/arm-smmu-v3: Remove unrecoverable faults reporting + - iommu: Remove unrecoverable fault data + - iommu: Cleanup iopf data structure definitions + - iommu: Merge iopf_device_param into iommu_fault_param + - iommu: Remove iommu_[un]register_device_fault_handler() + - iommu: Merge iommu_fault_event and iopf_fault + - iommu: Prepare for separating SVA and IOPF + - iommu: Make iommu_queue_iopf() more generic + - iommu: Separate SVA and IOPF + - iommu: Refine locking for per-device fault data management + - iommu: Use refcount for fault data access + - iommu: Improve iopf_queue_remove_device() + - iommu: Track iopf group instead of last fault + - iommu: Make iopf_group_response() return void + - iommu: Make iommu_report_device_fault() return void + - treewide: replace or remove redundant def_bool in Kconfig files + - iommu/arm-smmu-qcom: Add X1E80100 MDSS compatible + - vfio/pci: WARN_ON driver_override kasprintf failure + - vfio: mdev: make 
mdev_bus_type const + - vfio/pci: rename and export do_io_rw() + - vfio/pci: rename and export range_intersect_range + - vfio/nvgrace-gpu: Add vfio pci variant module for grace hopper + - KVM: arm64: Introduce new flag for non-cacheable IO memory + - mm: Introduce new flag to indicate wc safe + - KVM: arm64: Set io memory s2 pte as normalnc for vfio pci device + - vfio: Convey kvm that the vfio-pci device is wc safe + - iommu/arm-smmu-v3: Make STE programming independent of the callers + - iommu/arm-smmu-v3: Consolidate the STE generation for abort/bypass + - iommu/arm-smmu-v3: Move the STE generation for S1 and S2 domains into + functions + - iommu/arm-smmu-v3: Build the whole STE in arm_smmu_make_s2_domain_ste() + - iommu/arm-smmu-v3: Compute the STE only once for each master + - iommu/arm-smmu-v3: Do not change the STE twice during arm_smmu_attach_dev() + - iommu/arm-smmu-v3: Put writing the context descriptor in the right order + - iommu/arm-smmu-v3: Pass smmu_domain to arm_enable/disable_ats() + - iommu/arm-smmu-v3: Remove arm_smmu_master->domain + - iommu/arm-smmu-v3: Check that the RID domain is S1 in SVA + - iommu/arm-smmu-v3: Add a global static IDENTITY domain + - iommu/arm-smmu-v3: Add a global static BLOCKED domain + - iommu/arm-smmu-v3: Use the identity/blocked domain during release + - iommu/arm-smmu-v3: Pass arm_smmu_domain and arm_smmu_device to finalize + - iommu/arm-smmu-v3: Convert to domain_alloc_paging() + - iommu: constify pointer to bus_type + - iommu: constify of_phandle_args in xlate + - iommu: constify fwnode in iommu_ops_from_fwnode() + - iommu: re-use local fwnode variable in iommu_ops_from_fwnode() + - vfio/nvgrace-gpu: Convey kvm to map device memory region as noncached + - Revert "vfio/type1: Unpin zero pages" + - iommu/dma: Document min_align_mask assumption + - iommu/arm-smmu-v3: Add cpu_to_le64() around STRTAB_STE_0_V + - iommu/arm-smmu-v3: Fix access for STE.SHCFG + - swiotlb: fix swiotlb_bounce() to do partial sync's correctly + - iommu/arm-smmu-v3: Retire disable_bypass parameter + - iommu/arm-smmu-v3: Do not allow a SVA domain to be set on the wrong PASID + - iommu/arm-smmu-v3: Do not ATC invalidate the entire domain + - iommu/arm-smmu-v3: Add a type for the CD entry + - iommu: Pass domain to remove_dev_pasid() op + - iommu/dma: use iommu_put_pages_list() to releae freelist + - iommu/vt-d: add wrapper functions for page allocations + - iommu/io-pgtable-arm: use page allocation function provided by iommu-pages.h + - iommu: observability of the IOMMU allocations + - iommu: account IOMMU allocated memory + - iommu/arm-smmu: Convert to domain_alloc_paging() + - iommu/arm-smmu-qcom-debug: Add support for TBUs + - iommu/arm-smmu: Allow using a threaded handler for context interrupts + - iommu/arm-smmu-qcom: Use a custom context fault handler for sdm845 + - iommu/arm-smmu-qcom: Use the custom fault handler on more platforms + - iommu: Add ops->domain_alloc_sva() + - iommu/arm-smmu-qcom: Don't build debug features as a kernel module + - iommu/arm-smmu-v3: Add an ops indirection to the STE code + - iommu/arm-smmu-v3: Make CD programming use arm_smmu_write_entry() + - iommu/arm-smmu-v3: Move the CD generation for S1 domains into a function + - iommu/arm-smmu-v3: Consolidate clearing a CD table entry + - iommu/arm-smmu-v3: Make arm_smmu_alloc_cd_ptr() + - iommu/arm-smmu-v3: Allocate the CD table entry in advance + - iommu/arm-smmu-v3: Move the CD generation for SVA into a function + - iommu/arm-smmu-v3: Build the whole CD in arm_smmu_make_s1_cd() + - 
iommu/arm-smmu-v3: Add unit tests for arm_smmu_write_entry + - mm/memory-failure: convert shake_page() to shake_folio() + - mm: convert hugetlb_page_mapping_lock_write to folio + - mm/memory-failure: convert memory_failure() to use a folio + - mm/memory-failure: convert hwpoison_user_mappings to take a folio + - mm/memory-failure: add some folio conversions to unpoison_memory + - mm/memory-failure: use folio functions throughout collect_procs() + - mm/memory-failure: pass the folio to collect_procs_ksm() + - memory-failure: remove calls to page_mapping() + - swiotlb: remove alloc_size argument to swiotlb_tbl_map_single() + - iommu/dma: fix zeroing of bounce buffer padding used by untrusted devices + - iommu/arm-smmu-v3: Make the kunit into a module + - vfio/pci: Restore zero affected bus reset devices warning + - iommu/arm-smmu-v3: Avoid uninitialized asid in case of error + - iommu/arm-smmu-v3: Use *-y instead of *-objs in Makefile + - iommu: Make iommu_sva_domain_alloc() static + - iommu/dma: Prune redundant pgprot arguments + - iommu/iova: Add missing MODULE_DESCRIPTION() macro + - iommufd: Use atomic_long_try_cmpxchg() in incr_user_locked_vm() + - iommufd/selftest: Fix dirty bitmap tests with u8 bitmaps + - iommufd/selftest: Fix iommufd_test_dirty() to handle devices into an allocated list + - iommu/arm-smmu-v3: Make changing domains be hitless for ATS + - iommu/arm-smmu-v3: Add ssid to struct arm_smmu_master_domain + - iommu/arm-smmu-v3: Do not use master->sva_enable to restrict attaches + - iommu/arm-smmu-v3: Thread SSID through the arm_smmu_attach_*() interface + - iommu/arm-smmu-v3: Make SVA allocate a normal arm_smmu_domain + - iommu/arm-smmu-v3: Keep track of arm_smmu_master_domain for SVA + - iommu/arm-smmu-v3: Put the SVA mmu notifier in the smmu_domain + - iommu/arm-smmu-v3: Allow IDENTITY/BLOCKED to be set while PASID is used + - iommu/arm-smmu-v3: Test the STE S1DSS functionality + - iommu/arm-smmu-v3: Allow a PASID to be set when RID is IDENTITY/BLOCKED + - iommu/arm-smmu-v3: Allow setting a S1 domain to a PASID + - iommu/arm-smmu-v3: Do not zero the strtab twice + - iommu/arm-smmu-v3: Shrink the strtab l1_desc array + - iommu/arm-smmu-v3: add missing MODULE_DESCRIPTION() macro + - iommu/arm-smmu: Add CB prefix to register bitfields + - iommu/arm-smmu-qcom-debug: Do not print for handled faults + - iommu/arm-smmu: Pretty-print context fault related regs + - iommu/arm-smmu-qcom: record reason for deferring probe + - iommu/arm-smmu-v3: Add support for domain_alloc_user fn + - iommu/arm-smmu-v3: Add feature detection for HTTU + - iommu/io-pgtable-arm: Add read_and_clear_dirty() support + - iommu/arm-smmu-v3: Add support for dirty tracking in domain alloc + - iommu/arm-smmu-v3: Enable HTTU for stage1 with io-pgtable mapping + - iommu: Introduce domain attachment handle + - iommu: Remove sva handle list + - iommu: Add attach handle to struct iopf_group + - iommu: Extend domain attach group with handle support + - iommu: Add iommu_paging_domain_alloc() interface + - iommufd: Use iommu_paging_domain_alloc() + - vfio/type1: Use iommu_paging_domain_alloc() + - iommu/of: Support ats-supported device-tree property + - iommufd: Add fault and response message definitions + - iommufd: Add iommufd fault object + - iommufd: Fault-capable hwpt attach/detach/replace + - iommufd: Associate fault object with iommufd_hw_pgtable + - iommufd/selftest: Add IOPF support for mock device + - iommufd/selftest: Add coverage for IOPF test + - iommufd: Require drivers to supply the cache_invalidate_user 
ops + - vfio/pci: Init the count variable in collecting hot-reset devices + - iommufd: Remove IOMMUFD_PAGE_RESP_FAILURE + - iommufd: Add check on user response code + - iommufd: Fix error pointer checking + - iommu: Move IOMMU_DIRTY_NO_CLEAR define + - iommufd: Put constants for all the uAPI enums + - iommufd/device: Fix hwpt at err_unresv in iommufd_device_do_replace() + - iommufd: Reorder include files + - iommu/arm-smmu-v3: Issue a batch of commands to the same cmdq + - iommu/arm-smmu-v3: Pass in cmdq pointer to arm_smmu_cmdq_build_sync_cmd + - iommu/arm-smmu-v3: Pass in cmdq pointer to arm_smmu_cmdq_init + - iommu/arm-smmu-v3: Make symbols public for CONFIG_TEGRA241_CMDQV + - iommu/arm-smmu-v3: Add ARM_SMMU_OPT_TEGRA241_CMDQV + - iommu/arm-smmu-v3: Add acpi_smmu_iort_probe_model for impl + - iommu/arm-smmu-v3: Add struct arm_smmu_impl_ops + - iommu/arm-smmu-v3: Add in-kernel support for NVIDIA Tegra241 (Grace) CMDQV + - iommu/arm-smmu-v3: Start a new batch if new command is not supported + - iommu/tegra241-cmdqv: Limit CMDs for VCMDQs of a guest owned VINTF + - iommu/tegra241-cmdqv: Fix -Wformat-truncation warnings in + lvcmdq_error_header + - iommu/tegra241-cmdqv: Fix ioremap() error handling in probe() + - iommu/tegra241-cmdqv: Drop static at local variable + - iommu/tegra241-cmdqv: Do not allocate vcmdq until dma_set_mask_and_coherent + - iommu/arm-smmu-v3: Use the new rb tree helpers + - iommu/arm-smmu-v3: Add arm_smmu_strtab_l1/2_idx() + - iommu/arm-smmu-v3: Add types for each level of the 2 level stream table + - iommu/arm-smmu-v3: Reorganize struct arm_smmu_strtab_cfg + - iommu/arm-smmu-v3: Remove strtab_base/cfg + - iommu/arm-smmu-v3: Do not use devm for the cd table allocations + - iommu/arm-smmu-v3: Shrink the cdtab l1_desc array + - iommu/arm-smmu-v3: Add types for each level of the CD table + - iommu/arm-smmu-v3: Reorganize struct arm_smmu_ctx_desc_cfg + - iommu/arm-smmu: Un-demote unhandled-fault msg + - iommu/arm-smmu-qcom: Register the TBU driver in qcom_smmu_impl_init + - iommu/arm-smmu-v3: Fix a NULL vs IS_ERR() check + - iommufd/selftest: Fix buffer read overrrun in the dirty test + - iommu: Handle iommu faults for a bad iopf setup + - cover-letter: Apply upstream patches for dependencies + - vfio: Remove VFIO_TYPE1_NESTING_IOMMU + - iommu/arm-smmu-v3: Use S2FWB when available + - ACPICA: IORT: Update for revision E.f + - ACPI/IORT: Support CANWBS memory access flag + - iommu/arm-smmu-v3: Report IOMMU_CAP_ENFORCE_CACHE_COHERENCY for CANWBS + - iommu/arm-smmu-v3: Support IOMMU_GET_HW_INFO via struct arm_smmu_hw_info + - iommu/arm-smmu-v3: Implement IOMMU_HWPT_ALLOC_NEST_PARENT + - iommu/arm-smmu-v3: Support IOMMU_DOMAIN_NESTED + - cover-letter: Initial support for SMMUv3 nested translation + - WAR: ACPI/IORT: Set CANWBS for Grace CPU + - cover-letter: WAR for nesting patches + - iommufd: Reorder struct forward declarations + - iommufd/viommu: Add IOMMUFD_OBJ_VIOMMU and IOMMU_VIOMMU_ALLOC ioctl + - iommu: Pass in a viommu pointer to domain_alloc_user op + - iommufd: Allow pt_id to carry viommu_id for IOMMU_HWPT_ALLOC + - iommufd/selftest: Add IOMMU_VIOMMU_ALLOC test coverage + - iommufd/viommu: Add IOMMU_VIOMMU_SET/UNSET_VDEV_ID ioctl + - iommufd/selftest: Add IOMMU_VIOMMU_SET/UNSET_VDEV_ID test coverage + - iommufd/viommu: Add cache_invalidate for IOMMU_VIOMMU_TYPE_DEFAULT + - iommufd: Allow hwpt_id to carry viommu_id for IOMMU_HWPT_INVALIDATE + - iommufd/viommu: Add vdev_id helpers for IOMMU drivers + - iommu: Add iommu_copy_struct_from_full_user_array helper + - 
iommufd/selftest: Add mock_viommu_invalidate_user op + - iommufd/selftest: Add IOMMU_TEST_OP_DEV_CHECK_CACHE test command + - iommufd/selftest: Add VIOMMU coverage for IOMMU_HWPT_INVALIDATE ioctl + - iommufd/viommu: Add iommufd_viommu_to_parent_domain helper + - iommu/arm-smmu-v3: Add arm_smmu_cache_invalidate_user + - iommu/arm-smmu-v3: Add arm_smmu_viommu_cache_invalidate + - iommu/arm-smmu-v3: Allow ATS for IOMMU_DOMAIN_NESTED + - iommu/arm-smmu-v3: Update comments about ATS and bypass + - cover-letter: iommufd: Add VIOMMU infrastructure (Part-1) + - iommufd: Rename IOMMUFD_OBJ_FAULT to IOMMUFD_OBJ_EVENT_IOPF + - iommufd: Rename fault.c to event.c + - iommufd: Add IOMMUFD_OBJ_EVENT_VIRQ and IOMMUFD_CMD_VIRQ_ALLOC + - iommufd/viommu: Allow drivers to control vdev_id lifecycle + - iommufd/viommu: Add iommufd_vdev_id_to_dev helper + - iommufd/viommu: Add iommufd_viommu_report_irq helper + - iommufd/selftest: Implement mock_viommu_set/unset_vdev_id + - iommufd/selftest: Add IOMMU_TEST_OP_TRIGGER_VIRQ for VIRQ coverage + - iommufd/selftest: Add EVENT_VIRQ test coverage + - iommu/arm-smmu-v3: Report virtual IRQ for device in user space + - cover-letter: iommufd: Add VIOMMU infrastructure (Part-2 VIRQ) + - iommufd/device: Enforce reserved IOVA also when attached to hwpt_nested + - iommu/dma: Support MSIs through nested domains + - iommu/arm-smmu-v3: Implement arm_smmu_get_msi_mapping_domain + - cover-letter: Apply RMR solution for MSI mappings + - WAR: iommufd/pages: Bypass PFNMAP + - WAR: vfio/pci: Report PASID capability + - mm: handle poisoning of pfn without struct pages + - mm: Add poison error check in fixup_user_fault() for mapped pfn + - mm: Change ghes code to allow poison of non-struct pfn + - vfio/nvgrace-gpu: register device memory for poison handling + - KVM: arm64: determine memory type from VMA + - arm64: configs: Build NVGRACE_GPU_VFIO_PCI as LKM + - arm64: configs: Enable IOMMUFD and VFIO_DEVICE_CDEV + - arm64: configs: Replace VFIO_CONTAINER with IOMMUFD_VFIO_CONTAINER + - cover-letter: Add GPU passthrough support + - iommufd: Move iommufd_viommu structs to public iommufd header + - iommufd: Rename _iommufd_object_alloc to iommufd_object_alloc_elm + - iommufd/viommu: Support driver-managed viommu allocation + - iommufd/viommu: Allow driver-level vdev_id structure + - iommufd: Add struct iommufd_vqueue and its related viommu ops + - iommufd: Add IOMMUFD_OBJ_VQUEUE and IOMMUFD_CMD_VQUEUE_ALLOC + - iommufd: Add mmap infrastructure + - iommu/tegra241-cmdqv: Add user-space use support + - cover-letter: iommufd: Add VIOMMU infrastructure (Part-3 VQUEUE) + - arm64: defconfig: Enable CONFIG_TEGRA241_CMDQV + - arm64: defconfig: Enable CONFIG_DMA_MAP_BENCHMARK + - cover-letter: Add CMDQV support + - vfio/nvgrace-gpu: Read dvsec register to determine need for uncached resmem + - vfio/nvgrace-gpu: Expose the blackwell device PF BAR1 to the VM + - vfio/nvgrace-gpu: Check the HBM training and C2C link status + - cover-letter: vfio/nvgrace-gpu: Enable grace blackwell boards + - KVM: arm64: Allow exec fault on memory mapped cacheable in VMA + - vfio/nvgrace-egm: Introduce module to manage EGM + - vfio/nvgrace-egm: Handle pages with ECC errors on the EGM + - vfio/nvgrace-egm: Register EGM for runtime ECC poison errors handling + - arm64: configs: Build CONFIG_NVGRACE_EGM as LKM + - cover-letter: Add virtualization support for EGM + - vfio/nvgrace-egm: Move the egm header file to include + - vfio/nvgrace-gpu: Add a new GH200 SKU to the devid table + - cover-letter: vfio/nvgrace-gpu: Enable GH 
SKU and migrate EGM header
+    - NVIDIA: [Config] nvidia-6.8: Update annotations for Grace I/O virtualization
+    - net/mlx5: Add IFC related stuff for data direct
+    - RDMA/mlx5: Introduce the 'data direct' driver
+    - RDMA/mlx5: Add the initialization flow to utilize the 'data direct' device
+    - RDMA/umem: Add support for creating pinned DMABUF umem with a given dma
+      device
+    - RDMA/umem: Introduce an option to revoke DMABUF umem
+    - RDMA: Pass uverbs_attr_bundle as part of '.reg_user_mr_dmabuf' API
+    - RDMA/mlx5: Add support for DMABUF MR registrations with Data-direct
+    - RDMA/mlx5: Introduce GET_DATA_DIRECT_SYSFS_PATH ioctl
+    - NVIDIA: [Config] Annotations update
+    - fixup
+
- -- Brad Figg  Fri, 18 Oct 2024 18:56:01 -0700
+ -- Brad Figg  Fri, 18 Oct 2024 19:00:17 -0700
 
 linux-nvidia-adv (6.8.0-1002.2) noble; urgency=medium

diff --git a/debian.nvidia-adv/reconstruct b/debian.nvidia-adv/reconstruct
index 3e552dabffb20..867760b60253d 100644
--- a/debian.nvidia-adv/reconstruct
+++ b/debian.nvidia-adv/reconstruct
@@ -39,9 +39,12 @@ rm -f 'arch/arm64/boot/dts/qcom/pm2250.dtsi'
 rm -f 'arch/loongarch/include/asm/qspinlock.h'
 rm -f 'arch/sparc/lib/cmpdi2.c'
 rm -f 'arch/sparc/lib/ucmpdi2.c'
+rm -f 'arch/x86/lib/iomap_copy_64.S'
 rm -f 'drivers/gpu/drm/gma500/psb_lid.c'
+rm -f 'drivers/iommu/iommu-sva.h'
 rm -f 'include/linux/amd-pstate.h'
 rm -f 'include/linux/iio/adc/adi-axi-adc.h'
+rm -f 'include/uapi/linux/iommu.h'
 rm -f 'net/bluetooth/a2mp.c'
 rm -f 'net/bluetooth/a2mp.h'
 rm -f 'net/bluetooth/amp.c'

From 38def3c16afc41f23731c74d28c11b183b6f9516 Mon Sep 17 00:00:00 2001
From: Jie Zhan
Date: Sun, 29 Sep 2024 11:32:13 +0800
Subject: [PATCH 351/352] cppc_cpufreq: Use desired perf if feedback ctrs are 0 or unchanged

The CPPC performance feedback counters can be 0 or unchanged when the
target CPU is in a low-power idle state, e.g. power-gated or
clock-gated.

When the counters are 0, cppc_cpufreq_get_rate() returns 0 KHz, which
makes cpufreq_online() see a spurious error and fail to generate a
cpufreq policy. When the counters are unchanged, the existing
cppc_perf_from_fbctrs() returns a cached desired perf, but some
platforms may update the real frequency back into the desired perf
register.

For both cases in cppc_cpufreq_get_rate(), get the latest desired perf
from the CPPC register to reflect the frequency, because some platforms
may update the actual frequency back there; if that read fails, use the
cached desired perf.
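In short (an editor's sketch, simplified from the diff below; a
non-EFAULT sampling error still returns 0 early), the resulting
decision flow in cppc_cpufreq_get_rate() is:

	ret = cppc_get_perf_ctrs_sample(cpu, &fb_ctrs_t0, &fb_ctrs_t1);
	if (ret && ret != -EFAULT)
		return 0;	/* genuine register read failure */

	delivered_perf = ret ? 0 : cppc_perf_from_fbctrs(cpu_data, &fb_ctrs_t0,
							 &fb_ctrs_t1);
	if (!delivered_perf) {
		/* Counters 0 or unchanged: prefer the live desired perf reg */
		if (cppc_get_desired_perf(cpu, &delivered_perf))
			delivered_perf = cpu_data->perf_ctrls.desired_perf;
	}

	return cppc_perf_to_khz(&cpu_data->perf_caps, delivered_perf);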
Fixes: 6a4fec4f6d30 ("cpufreq: cppc: cppc_cpufreq_get_rate() returns zero in all error cases.")
Signed-off-by: Jie Zhan
Reviewed-by: Zeng Heng
Reviewed-by: Ionela Voinescu
Reviewed-by: Huisong Li
Signed-off-by: Viresh Kumar
(cherry picked from commit c47195631960b626058c335aec31f186fa854f97 linux-next)
Signed-off-by: Jamie Nguyen
Tested-by: Carol Soto
---
 drivers/cpufreq/cppc_cpufreq.c | 57 +++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 11 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 15f1d41920a33..9d476264075d8 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -118,6 +118,9 @@ static void cppc_scale_freq_workfn(struct kthread_work *work)
 	perf = cppc_perf_from_fbctrs(cpu_data, &cppc_fi->prev_perf_fb_ctrs,
 				     &fb_ctrs);
+	if (!perf)
+		return;
+
 	cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
 
 	perf <<= SCHED_CAPACITY_SHIFT;
@@ -730,13 +733,31 @@ static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
 	delta_delivered = get_delta(fb_ctrs_t1->delivered,
 				    fb_ctrs_t0->delivered);
 
-	/* Check to avoid divide-by zero and invalid delivered_perf */
+	/*
+	 * Avoid divide-by zero and unchanged feedback counters.
+	 * Leave it for callers to handle.
+	 */
 	if (!delta_reference || !delta_delivered)
-		return cpu_data->perf_ctrls.desired_perf;
+		return 0;
 
 	return (reference_perf * delta_delivered) / delta_reference;
 }
 
+static int cppc_get_perf_ctrs_sample(int cpu,
+				     struct cppc_perf_fb_ctrs *fb_ctrs_t0,
+				     struct cppc_perf_fb_ctrs *fb_ctrs_t1)
+{
+	int ret;
+
+	ret = cppc_get_perf_ctrs(cpu, fb_ctrs_t0);
+	if (ret)
+		return ret;
+
+	udelay(2); /* 2usec delay between sampling */
+
+	return cppc_get_perf_ctrs(cpu, fb_ctrs_t1);
+}
+
 static unsigned int cppc_cpufreq_get_rate(unsigned int cpu)
 {
 	struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0};
@@ -752,18 +773,32 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu)
 
 	cpufreq_cpu_put(policy);
 
-	ret = cppc_get_perf_ctrs(cpu, &fb_ctrs_t0);
-	if (ret)
-		return 0;
-
-	udelay(2); /* 2usec delay between sampling */
-
-	ret = cppc_get_perf_ctrs(cpu, &fb_ctrs_t1);
-	if (ret)
-		return 0;
+	ret = cppc_get_perf_ctrs_sample(cpu, &fb_ctrs_t0, &fb_ctrs_t1);
+	if (ret) {
+		if (ret == -EFAULT)
+			/* Any of the associated CPPC regs is 0. */
+			goto out_invalid_counters;
+		else
+			return 0;
+	}
 
 	delivered_perf = cppc_perf_from_fbctrs(cpu_data, &fb_ctrs_t0,
 					       &fb_ctrs_t1);
+	if (!delivered_perf)
+		goto out_invalid_counters;
+
+	return cppc_perf_to_khz(&cpu_data->perf_caps, delivered_perf);
+
+out_invalid_counters:
+	/*
+	 * Feedback counters could be unchanged or 0 when a cpu enters a
+	 * low-power idle state, e.g. clock-gated or power-gated.
+	 * Use desired perf for reflecting frequency. Get the latest register
+	 * value first as some platforms may update the actual delivered perf
+	 * there; if failed, resort to the cached desired perf.
+	 */
+	if (cppc_get_desired_perf(cpu, &delivered_perf))
+		delivered_perf = cpu_data->perf_ctrls.desired_perf;
 
 	return cppc_perf_to_khz(&cpu_data->perf_caps, delivered_perf);
 }

From 0c61cf2fc01b94684d8bed916bc84cb908303f7c Mon Sep 17 00:00:00 2001
From: Jie Zhan
Date: Sun, 29 Sep 2024 11:32:14 +0800
Subject: [PATCH 352/352] cppc_cpufreq: Remove HiSilicon CPPC workaround

Since commit 6c8d750f9784 ("cpufreq / cppc: Work around for Hisilicon
CPPC cpufreq"), we have carried a workaround for HiSilicon platforms
that do not support performance feedback counters, whereas they can get
the actual frequency from the desired perf register.
Later on, FIE is disabled in that workaround as well.

Now the workaround can be handled by the common code: desired perf is
read and converted to frequency when the feedback counters don't
change, and FIE is disabled when the CPPC regs are in a PCC region.
Hence, the workaround is no longer needed and can be safely removed, in
an effort to consolidate the driver procedure.

Signed-off-by: Jie Zhan
Reviewed-by: Xiongfeng Wang
Reviewed-by: Huisong Li
[ Viresh: Move fie_disabled within CONFIG option to fix warning ]
Signed-off-by: Viresh Kumar
(cherry picked from commit ea1829d4d413bc38774703acfc266472f7bc0bb5 linux-next)
Signed-off-by: Jamie Nguyen
Tested-by: Carol Soto
---
 drivers/cpufreq/cppc_cpufreq.c | 73 +---------------------------------
 1 file changed, 1 insertion(+), 72 deletions(-)

diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c
index 9d476264075d8..951614d84defe 100644
--- a/drivers/cpufreq/cppc_cpufreq.c
+++ b/drivers/cpufreq/cppc_cpufreq.c
@@ -36,33 +36,15 @@ static LIST_HEAD(cpu_data_list);
 
 static bool boost_supported;
 
-struct cppc_workaround_oem_info {
-	char oem_id[ACPI_OEM_ID_SIZE + 1];
-	char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
-	u32 oem_revision;
-};
-
-static struct cppc_workaround_oem_info wa_info[] = {
-	{
-		.oem_id = "HISI  ",
-		.oem_table_id = "HIP07   ",
-		.oem_revision = 0,
-	}, {
-		.oem_id = "HISI  ",
-		.oem_table_id = "HIP08   ",
-		.oem_revision = 0,
-	}
-};
-
 static struct cpufreq_driver cppc_cpufreq_driver;
 
+#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
 static enum {
 	FIE_UNSET = -1,
 	FIE_ENABLED,
 	FIE_DISABLED
 } fie_disabled = FIE_UNSET;
 
-#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
 module_param(fie_disabled, int, 0444);
 MODULE_PARM_DESC(fie_disabled, "Disable Frequency Invariance Engine (FIE)");
 
@@ -78,7 +60,6 @@ struct cppc_freq_invariance {
 static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv);
 static struct kthread_worker *kworker_fie;
 
-static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu);
 static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data,
 				 struct cppc_perf_fb_ctrs *fb_ctrs_t0,
 				 struct cppc_perf_fb_ctrs *fb_ctrs_t1);
@@ -853,57 +834,6 @@ static struct cpufreq_driver cppc_cpufreq_driver = {
 	.name = "cppc_cpufreq",
 };
 
-/*
- * HISI platform does not support delivered performance counter and
- * reference performance counter. It can calculate the performance using the
- * platform specific mechanism. We reuse the desired performance register to
- * store the real performance calculated by the platform.
- */ -static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu) -{ - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - struct cppc_cpudata *cpu_data; - u64 desired_perf; - int ret; - - if (!policy) - return -ENODEV; - - cpu_data = policy->driver_data; - - cpufreq_cpu_put(policy); - - ret = cppc_get_desired_perf(cpu, &desired_perf); - if (ret < 0) - return -EIO; - - return cppc_perf_to_khz(&cpu_data->perf_caps, desired_perf); -} - -static void cppc_check_hisi_workaround(void) -{ - struct acpi_table_header *tbl; - acpi_status status = AE_OK; - int i; - - status = acpi_get_table(ACPI_SIG_PCCT, 0, &tbl); - if (ACPI_FAILURE(status) || !tbl) - return; - - for (i = 0; i < ARRAY_SIZE(wa_info); i++) { - if (!memcmp(wa_info[i].oem_id, tbl->oem_id, ACPI_OEM_ID_SIZE) && - !memcmp(wa_info[i].oem_table_id, tbl->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) && - wa_info[i].oem_revision == tbl->oem_revision) { - /* Overwrite the get() callback */ - cppc_cpufreq_driver.get = hisi_cppc_cpufreq_get_rate; - fie_disabled = FIE_DISABLED; - break; - } - } - - acpi_put_table(tbl); -} - static int __init cppc_cpufreq_init(void) { int ret; @@ -911,7 +841,6 @@ static int __init cppc_cpufreq_init(void) if (!acpi_cpc_valid()) return -ENODEV; - cppc_check_hisi_workaround(); cppc_freq_invariance_init(); populate_efficiency_class();
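For context on the data direct patches earlier in this series, the
kernel side only defines the GET_DATA_DIRECT_SYSFS_PATH method and
attribute IDs; resolving the affiliated device from userspace needs
matching rdma-core support. A hedged sketch follows (the wrapper name
and signature are assumptions about that userspace piece, not part of
this series; kobject_get_path() returns a path relative to /sys):

	/* Hypothetical consumer of MLX5_IB_METHOD_GET_DATA_DIRECT_SYSFS_PATH */
	#include <stdio.h>
	#include <infiniband/mlx5dv.h>

	static int print_data_direct_dev(struct ibv_context *ctx)
	{
		char path[512];
		int ret;

		/* Assumed rdma-core wrapper; -ENODEV would mean no data
		 * direct device is affiliated with this RDMA device.
		 */
		ret = mlx5dv_get_data_direct_sysfs_path(ctx, path, sizeof(path));
		if (ret)
			return ret;

		printf("data direct device: /sys%s\n", path);
		return 0;
	}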