Add Samsung-specific changes

2025-08-11 14:29:00 +02:00
parent c66122e619
commit 4d134a1294
2688 changed files with 1127995 additions and 11475 deletions


@@ -45,6 +45,18 @@ config IOMMU_IO_PGTABLE_LPAE_SELFTEST
If unsure, say N here.
config IOMMU_IO_PGTABLE_FAST
bool "Fast ARMv7/v8 Long Descriptor Format"
depends on (ARM || ARM64) && IOMMU_DMA
help
Enable support for a subset of the ARM long descriptor pagetable
format. This allocator achieves fast performance by
pre-allocating and pre-populating page table memory up front.
It only supports a 32-bit virtual address space.
This implementation is mainly optimized for use cases where the
buffers are small (<= 64K), since it only supports 4K page sizes.
config IOMMU_IO_PGTABLE_ARMV7S
bool "ARMv7/v8 Short Descriptor Format"
select IOMMU_IO_PGTABLE
@@ -332,6 +344,15 @@ config ARM_SMMU
Say Y here if your SoC includes an IOMMU device implementing
the ARM SMMU architecture.
config ARM_SMMU_CONTEXT_FAULT_RETRY
bool "Context fault retry sequence"
depends on ARM_SMMU && (ARCH_WAIPIO || ARCH_PARROT)
help
In some cases, issuing a TLB invalidate operation after a
context fault may cause a subsequent retry of the failing
address to succeed. This only applies to clients which have
stall-on-fault enabled, such as display.
config ARM_SMMU_LEGACY_DT_BINDINGS
bool "Support the legacy \"mmu-masters\" devicetree bindings"
depends on ARM_SMMU=y && OF
@@ -387,6 +408,44 @@ config ARM_SMMU_QCOM_DEBUG
Say Y here to enable debug for issues such as TLB sync timeouts,
which require implementation-defined register dumps.
config QTI_IOMMU_SUPPORT
tristate "Support for QTI iommu drivers"
help
The QTI GPU device may switch between multiple iommu domains,
depending on the use case. This introduces a need to track all such
domains in a non-driver-specific manner.
If in doubt say N.
config QCOM_IOMMU_UTIL
tristate "Support for qcom additions to the iommu framework"
help
QCOM iommu drivers support a general set of functionality in
addition to the functions provided by the iommu framework.
This includes devicetree properties for configuring iommu
groups and iova ranges.
Say N here if unsure.
config QCOM_IOMMU_DEBUG
tristate "IOMMU debugging and testing"
depends on QCOM_IOMMU_UTIL
depends on DEBUG_FS
help
This option is used to enable profiling and debugging in
the IOMMU framework code. IOMMU profiling and debugging
can be done through the debugfs nodes which this option
makes available.
config ARM_SMMU_SELFTEST
bool "ARM SMMU self test support"
depends on ARM_SMMU
help
Enables self-tests for the ARM SMMU. Tests basic hardware
configuration, such as interrupts. Note that enabling this
option can marginally increase the boot time.
If unsure, say N.
config ARM_SMMU_V3
tristate "ARM Ltd. System MMU Version 3 (SMMUv3) Support"
depends on ARM64
@@ -503,4 +562,28 @@ config SPRD_IOMMU
Say Y here if you want to use the multimedia devices listed above.
config QCOM_LAZY_MAPPING
tristate "Reference counted iommu-mapping support"
depends on QCOM_DMABUF_HEAPS
depends on IOMMU_API
help
DMA-BUFs may be shared between several software clients.
Reference counting the mapping may simplify coordination between
these clients, and decrease latency by preventing multiple
map/unmaps of the same region.
If unsure, say N here.
config QTVM_IOMMU_TRACE_HOOKS
bool "Trace hooks used for QTVM"
depends on QCOM_IOMMU_UTIL
depends on !ANDROID_VENDOR_HOOKS
help
When Android vendor hooks aren't available, such as in a
non-Android environment, enable this config so that the
equivalent hooks are still called. This helps enable certain
features in a non-Android environment.
If unsure, say N here.
endif # IOMMU_SUPPORT


@@ -29,4 +29,12 @@ obj-$(CONFIG_HYPERV_IOMMU) += hyperv-iommu.o
obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
obj-$(CONFIG_IOMMU_SVA) += iommu-sva.o io-pgfault.o
obj-$(CONFIG_SPRD_IOMMU) += sprd-iommu.o
obj-$(CONFIG_QCOM_LAZY_MAPPING) += msm_dma_iommu_mapping.o
obj-$(CONFIG_APPLE_DART) += apple-dart.o
obj-$(CONFIG_QCOM_IOMMU_UTIL) += qcom_iommu_util.o
qcom_iommu_util-y += qcom-iommu-util.o
qcom_iommu_util-$(CONFIG_IOMMU_IO_PGTABLE_FAST) += qcom-dma-iommu-generic.o io-pgtable-fast.o dma-mapping-fast.o
qcom_iommu_util-$(CONFIG_IOMMU_IO_PGTABLE_LPAE) += qcom-io-pgtable-arm.o qcom-io-pgtable-alloc.o
obj-$(CONFIG_QTI_IOMMU_SUPPORT) += iommu-logger.o
obj-$(CONFIG_QCOM_IOMMU_DEBUG) += qcom_iommu_debug.o
qcom_iommu_debug-y += qcom-iommu-debug.o qcom-iommu-debug-user.o


@@ -2,5 +2,5 @@
obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
obj-$(CONFIG_ARM_SMMU) += arm_smmu.o
arm_smmu-objs += arm-smmu.o arm-smmu-impl.o arm-smmu-nvidia.o
arm_smmu-$(CONFIG_ARM_SMMU_QCOM) += arm-smmu-qcom.o
arm_smmu-$(CONFIG_ARM_SMMU_QCOM) += arm-smmu-qcom.o arm-smmu-qcom-pm.o
arm_smmu-$(CONFIG_ARM_SMMU_QCOM_DEBUG) += arm-smmu-qcom-debug.o


@@ -219,6 +219,12 @@ struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu)
of_device_is_compatible(np, "nvidia,tegra186-smmu"))
return nvidia_smmu_impl_init(smmu);
if (of_device_is_compatible(smmu->dev->of_node, "qcom,qsmmu-v500"))
return qsmmuv500_impl_init(smmu);
if (of_device_is_compatible(smmu->dev->of_node, "qcom,smmu-v2"))
return qsmmuv2_impl_init(smmu);
if (IS_ENABLED(CONFIG_ARM_SMMU_QCOM))
smmu = qcom_smmu_impl_init(smmu);


@@ -0,0 +1,337 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2021, The Linux Foundation. All rights reserved.
* Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#include <linux/clk.h>
#include <linux/regulator/consumer.h>
#include <linux/interconnect.h>
#include <linux/of_platform.h>
#include <linux/iopoll.h>
#include "arm-smmu.h"
#define ARM_SMMU_ICC_AVG_BW 0
#define ARM_SMMU_ICC_PEAK_BW_HIGH 1000
#define ARM_SMMU_ICC_PEAK_BW_LOW 0
#define ARM_SMMU_ICC_ACTIVE_ONLY_TAG 0x3
/*
* Theoretically, our interconnect does not guarantee the order between
* writes to different "register blocks" even with device memory type.
* It does guarantee that the completion of a read to a particular
* register block implies that previously issued writes to that
* register block have completed, with device memory type.
*
* In particular, we need to ensure that writes to iommu registers
* complete before we turn off the power.
*/
static void arm_smmu_arch_write_sync(struct arm_smmu_device *smmu)
{
u32 id;
if (!smmu)
return;
/* Read to complete prior write transactions */
id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID0);
/* Wait for read to complete before off */
rmb();
}
static int arm_smmu_prepare_clocks(struct arm_smmu_power_resources *pwr)
{
int i, ret = 0;
for (i = 0; i < pwr->num_clocks; ++i) {
ret = clk_prepare(pwr->clocks[i]);
if (ret) {
dev_err(pwr->dev, "Couldn't prepare clock #%d\n", i);
while (i--)
clk_unprepare(pwr->clocks[i]);
break;
}
}
return ret;
}
static void arm_smmu_unprepare_clocks(struct arm_smmu_power_resources *pwr)
{
int i;
for (i = pwr->num_clocks; i; --i)
clk_unprepare(pwr->clocks[i - 1]);
}
static int arm_smmu_enable_clocks(struct arm_smmu_power_resources *pwr)
{
int i, ret = 0;
for (i = 0; i < pwr->num_clocks; ++i) {
ret = clk_enable(pwr->clocks[i]);
if (ret) {
dev_err(pwr->dev, "Couldn't enable clock #%d\n", i);
while (i--)
clk_disable(pwr->clocks[i]);
break;
}
}
return ret;
}
static void arm_smmu_disable_clocks(struct arm_smmu_power_resources *pwr)
{
int i;
for (i = pwr->num_clocks; i; --i)
clk_disable(pwr->clocks[i - 1]);
}
static int arm_smmu_raise_interconnect_bw(struct arm_smmu_power_resources *pwr)
{
if (!pwr->icc_path)
return 0;
return icc_set_bw(pwr->icc_path, ARM_SMMU_ICC_AVG_BW,
ARM_SMMU_ICC_PEAK_BW_HIGH);
}
static void arm_smmu_lower_interconnect_bw(struct arm_smmu_power_resources *pwr)
{
if (!pwr->icc_path)
return;
WARN_ON(icc_set_bw(pwr->icc_path, ARM_SMMU_ICC_AVG_BW,
ARM_SMMU_ICC_PEAK_BW_LOW));
}
static int arm_smmu_enable_regulators(struct arm_smmu_power_resources *pwr)
{
struct regulator_bulk_data *consumers;
int num_consumers, ret;
int i;
num_consumers = pwr->num_gdscs;
consumers = pwr->gdscs;
for (i = 0; i < num_consumers; i++) {
ret = regulator_enable(consumers[i].consumer);
if (ret)
goto out;
}
return 0;
out:
i -= 1;
for (; i >= 0; i--)
regulator_disable(consumers[i].consumer);
return ret;
}
int arm_smmu_power_on(struct arm_smmu_power_resources *pwr)
{
int ret;
mutex_lock(&pwr->power_lock);
if (pwr->power_count > 0) {
pwr->power_count += 1;
mutex_unlock(&pwr->power_lock);
return 0;
}
ret = arm_smmu_raise_interconnect_bw(pwr);
if (ret)
goto out_unlock;
ret = arm_smmu_enable_regulators(pwr);
if (ret)
goto out_disable_bus;
ret = arm_smmu_prepare_clocks(pwr);
if (ret)
goto out_disable_regulators;
ret = arm_smmu_enable_clocks(pwr);
if (ret)
goto out_unprepare_clocks;
if (pwr->resume) {
ret = pwr->resume(pwr);
if (ret)
goto out_disable_clocks;
}
pwr->power_count = 1;
mutex_unlock(&pwr->power_lock);
return 0;
out_disable_clocks:
arm_smmu_disable_clocks(pwr);
out_unprepare_clocks:
arm_smmu_unprepare_clocks(pwr);
out_disable_regulators:
regulator_bulk_disable(pwr->num_gdscs, pwr->gdscs);
out_disable_bus:
arm_smmu_lower_interconnect_bw(pwr);
out_unlock:
mutex_unlock(&pwr->power_lock);
return ret;
}
/*
* Needing to pass smmu to this api for arm_smmu_arch_write_sync is awkward.
*/
void arm_smmu_power_off(struct arm_smmu_device *smmu,
struct arm_smmu_power_resources *pwr)
{
mutex_lock(&pwr->power_lock);
if (pwr->power_count == 0) {
WARN(1, "%s: Bad power count\n", dev_name(pwr->dev));
mutex_unlock(&pwr->power_lock);
return;
} else if (pwr->power_count > 1) {
pwr->power_count--;
mutex_unlock(&pwr->power_lock);
return;
}
if (pwr->suspend)
pwr->suspend(pwr);
arm_smmu_arch_write_sync(smmu);
arm_smmu_disable_clocks(pwr);
arm_smmu_unprepare_clocks(pwr);
regulator_bulk_disable(pwr->num_gdscs, pwr->gdscs);
arm_smmu_lower_interconnect_bw(pwr);
pwr->power_count = 0;
mutex_unlock(&pwr->power_lock);
}
static int arm_smmu_init_clocks(struct arm_smmu_power_resources *pwr)
{
const char *cname;
struct property *prop;
int i;
struct device *dev = pwr->dev;
pwr->num_clocks =
of_property_count_strings(dev->of_node, "clock-names");
if (pwr->num_clocks < 1) {
pwr->num_clocks = 0;
return 0;
}
pwr->clocks = devm_kzalloc(
dev, sizeof(*pwr->clocks) * pwr->num_clocks,
GFP_KERNEL);
if (!pwr->clocks)
return -ENOMEM;
i = 0;
of_property_for_each_string(dev->of_node, "clock-names",
prop, cname) {
struct clk *c = devm_clk_get(dev, cname);
if (IS_ERR(c)) {
dev_err(dev, "Couldn't get clock: %s\n",
cname);
return PTR_ERR(c);
}
if (clk_get_rate(c) == 0) {
long rate = clk_round_rate(c, 1000);
clk_set_rate(c, rate);
}
pwr->clocks[i] = c;
++i;
}
return 0;
}
static int arm_smmu_init_regulators(struct arm_smmu_power_resources *pwr)
{
const char *cname;
struct property *prop;
int i;
struct device *dev = pwr->dev;
pwr->num_gdscs =
of_property_count_strings(dev->of_node, "qcom,regulator-names");
if (pwr->num_gdscs < 1) {
pwr->num_gdscs = 0;
return 0;
}
pwr->gdscs = devm_kzalloc(
dev, sizeof(*pwr->gdscs) * pwr->num_gdscs, GFP_KERNEL);
if (!pwr->gdscs)
return -ENOMEM;
i = 0;
of_property_for_each_string(dev->of_node, "qcom,regulator-names",
prop, cname)
pwr->gdscs[i++].supply = cname;
return devm_regulator_bulk_get(dev, pwr->num_gdscs, pwr->gdscs);
}
static int arm_smmu_init_interconnect(struct arm_smmu_power_resources *pwr)
{
struct device *dev = pwr->dev;
/* We don't want the interconnect APIs to print an error message */
if (!of_find_property(dev->of_node, "interconnects", NULL)) {
dev_dbg(dev, "No interconnect info\n");
return 0;
}
pwr->icc_path = devm_of_icc_get(dev, NULL);
if (IS_ERR_OR_NULL(pwr->icc_path)) {
if (PTR_ERR(pwr->icc_path) != -EPROBE_DEFER)
dev_err(dev, "Unable to read interconnect path from devicetree rc: %ld\n",
PTR_ERR(pwr->icc_path));
return pwr->icc_path ? PTR_ERR(pwr->icc_path) : -EINVAL;
}
if (of_property_read_bool(dev->of_node, "qcom,active-only"))
icc_set_tag(pwr->icc_path, ARM_SMMU_ICC_ACTIVE_ONLY_TAG);
return 0;
}
/*
* Cleanup done by devm. Any non-devm resources must clean up themselves.
*/
struct arm_smmu_power_resources *arm_smmu_init_power_resources(
struct device *dev)
{
struct arm_smmu_power_resources *pwr;
int ret;
pwr = devm_kzalloc(dev, sizeof(*pwr), GFP_KERNEL);
if (!pwr)
return ERR_PTR(-ENOMEM);
pwr->dev = dev;
mutex_init(&pwr->power_lock);
ret = arm_smmu_init_clocks(pwr);
if (ret)
return ERR_PTR(ret);
ret = arm_smmu_init_regulators(pwr);
if (ret)
return ERR_PTR(ret);
ret = arm_smmu_init_interconnect(pwr);
if (ret)
return ERR_PTR(ret);
return pwr;
}
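For illustration, a minimal sketch of how a caller might pair the power helpers above around register access; example_read_id() is a hypothetical function, not part of this commit, and assumes smmu->pwr was set up via arm_smmu_init_power_resources():

/* Hypothetical caller: bracket register access with power on/off. */
static int example_read_id(struct arm_smmu_device *smmu)
{
	u32 id;
	int ret;

	ret = arm_smmu_power_on(smmu->pwr);
	if (ret)
		return ret;

	id = arm_smmu_gr0_read(smmu, ARM_SMMU_GR0_ID0);
	dev_info(smmu->dev, "IDR0 = 0x%x\n", id);

	arm_smmu_power_off(smmu, smmu->pwr);
	return 0;
}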

File diff suppressed because it is too large.


@@ -0,0 +1,200 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2019, 2021 The Linux Foundation. All rights reserved.
* Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#undef TRACE_SYSTEM
#define TRACE_SYSTEM arm_smmu
#if !defined(_TRACE_ARM_SMMU_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_ARM_SMMU_H
#include <linux/types.h>
#include <linux/tracepoint.h>
#include <linux/scatterlist.h>
#include "arm-smmu.h"
struct device;
DECLARE_EVENT_CLASS(iommu_tlbi,
TP_PROTO(struct arm_smmu_domain *domain),
TP_ARGS(domain),
TP_STRUCT__entry(
__string(group_name, dev_name(domain->dev))
),
TP_fast_assign(
__assign_str(group_name, dev_name(domain->dev));
),
TP_printk("group=%s",
__get_str(group_name)
)
);
DEFINE_EVENT(iommu_tlbi, tlbi_start,
TP_PROTO(struct arm_smmu_domain *domain),
TP_ARGS(domain)
);
DEFINE_EVENT(iommu_tlbi, tlbi_end,
TP_PROTO(struct arm_smmu_domain *domain),
TP_ARGS(domain)
);
DECLARE_EVENT_CLASS(iommu_pgtable,
TP_PROTO(struct arm_smmu_domain *domain, unsigned long iova,
unsigned long long ipa, size_t granule),
TP_ARGS(domain, iova, ipa, granule),
TP_STRUCT__entry(
__string(group_name, dev_name(domain->dev))
__field(unsigned long, iova)
__field(unsigned long long, ipa)
__field(size_t, granule)
),
TP_fast_assign(
__assign_str(group_name, dev_name(domain->dev));
__entry->iova = iova;
__entry->ipa = ipa;
__entry->granule = granule;
),
TP_printk("group=%s table_base_iova=%lx table_ipa=%llx table_size=%zx",
__get_str(group_name), __entry->iova,
__entry->ipa, __entry->granule
)
);
DEFINE_EVENT(iommu_pgtable, iommu_pgtable_add,
TP_PROTO(struct arm_smmu_domain *domain, unsigned long iova,
unsigned long long ipa, size_t granule),
TP_ARGS(domain, iova, ipa, granule)
);
DEFINE_EVENT(iommu_pgtable, iommu_pgtable_remove,
TP_PROTO(struct arm_smmu_domain *domain, unsigned long iova,
unsigned long long ipa, size_t granule),
TP_ARGS(domain, iova, ipa, granule)
);
DECLARE_EVENT_CLASS(iommu_map_pages,
TP_PROTO(struct arm_smmu_domain *domain, unsigned long iova,
size_t pgsize, size_t pgcount),
TP_ARGS(domain, iova, pgsize, pgcount),
TP_STRUCT__entry(
__string(group_name, dev_name(domain->dev))
__field(unsigned long, iova)
__field(size_t, pgsize)
__field(size_t, pgcount)
),
TP_fast_assign(
__assign_str(group_name, dev_name(domain->dev));
__entry->iova = iova;
__entry->pgsize = pgsize;
__entry->pgcount = pgcount;
),
TP_printk("group=%s iova=%lx size=%zx pgsize=%zx pgcount=%zx",
__get_str(group_name), __entry->iova,
__entry->pgsize * __entry->pgcount,
__entry->pgsize, __entry->pgcount
)
);
DEFINE_EVENT(iommu_map_pages, map_pages,
TP_PROTO(struct arm_smmu_domain *domain, unsigned long iova,
size_t pgsize, size_t pgcount),
TP_ARGS(domain, iova, pgsize, pgcount)
);
DEFINE_EVENT(iommu_map_pages, unmap_pages,
TP_PROTO(struct arm_smmu_domain *domain, unsigned long iova,
size_t pgsize, size_t pgcount),
TP_ARGS(domain, iova, pgsize, pgcount)
);
/* Refer to samples/ftrace_events */
#ifndef __TRACE_EVENT_ARM_SMMU_HELPER_FUNCTIONS
#define __TRACE_EVENT_ARM_SMMU_HELPER_FUNCTIONS
static inline unsigned long sum_scatterlist_length(struct scatterlist *sgl,
unsigned int nents)
{
int i = 0;
unsigned long sum = 0;
for (i = 0; i < nents; i++, sgl = sg_next(sgl))
sum += sgl->length;
return sum;
}
#endif
TRACE_EVENT(tlbsync_timeout,
TP_PROTO(struct device *dev),
TP_ARGS(dev),
TP_STRUCT__entry(
__string(device, dev_name(dev))
),
TP_fast_assign(
__assign_str(device, dev_name(dev));
),
TP_printk("smmu=%s",
__get_str(device)
)
);
TRACE_EVENT(smmu_init,
TP_PROTO(u64 time),
TP_ARGS(time),
TP_STRUCT__entry(
__field(u64, time)
),
TP_fast_assign(
__entry->time = time;
),
TP_printk("ARM SMMU init latency: %lld us", __entry->time)
);
#endif /* _TRACE_ARM_SMMU_H */
#undef TRACE_INCLUDE_PATH
#define TRACE_INCLUDE_PATH ../../drivers/iommu/arm/arm-smmu
#undef TRACE_INCLUDE_FILE
#define TRACE_INCLUDE_FILE arm-smmu-trace
/* This part must be outside protection */
#include <trace/define_trace.h>
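As a usage illustration, each DEFINE_EVENT()/TRACE_EVENT() above expands to a trace_<event>() helper. A hypothetical call site (not part of this commit) wrapping a TLB invalidation might look like:

/* Hypothetical call site for the tlbi_start/tlbi_end events above. */
static void example_tlb_inv_context(struct arm_smmu_domain *smmu_domain)
{
	trace_tlbi_start(smmu_domain);
	/* ... issue TLBIALL/TLBIASID and wait for TLBSYNC here ... */
	trace_tlbi_end(smmu_domain);
}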

File diff suppressed because it is too large.


@@ -5,6 +5,8 @@
* Copyright (C) 2013 ARM Limited
*
* Author: Will Deacon <will.deacon@arm.com>
*
* Copyright (c) 2022-2024 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#ifndef _ARM_SMMU_H
@@ -22,10 +24,14 @@
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/qcom-iommu-util.h>
#include <linux/qcom-io-pgtable.h>
/* Configuration registers */
#define ARM_SMMU_GR0_sCR0 0x0
#define ARM_SMMU_sCR0_VMID16EN BIT(31)
#define ARM_SMMU_sCR0_SHCFG GENMASK(23, 22)
#define ARM_SMMU_sCR0_SHCFG_NSH 0x3
#define ARM_SMMU_sCR0_BSU GENMASK(15, 14)
#define ARM_SMMU_sCR0_FB BIT(13)
#define ARM_SMMU_sCR0_PTM BIT(12)
@@ -117,6 +123,8 @@ enum arm_smmu_s2cr_type {
S2CR_TYPE_FAULT,
};
#define ARM_SMMU_S2CR_EXIDVALID BIT(10)
#define ARM_SMMU_S2CR_SHCFG GENMASK(9, 8)
#define ARM_SMMU_S2CR_SHCFG_NSH 0x3
#define ARM_SMMU_S2CR_CBNDX GENMASK(7, 0)
/* Context bank attribute registers */
@@ -136,12 +144,23 @@ enum arm_smmu_cbar_type {
#define ARM_SMMU_CBAR_VMID GENMASK(7, 0)
#define ARM_SMMU_GR1_CBFRSYNRA(n) (0x400 + ((n) << 2))
#define CBFRSYNRA_SID_MASK (0xffff)
#define ARM_SMMU_GR1_CBA2R(n) (0x800 + ((n) << 2))
#define ARM_SMMU_CBA2R_VMID16 GENMASK(31, 16)
#define ARM_SMMU_CBA2R_VA64 BIT(0)
#define ARM_SMMU_CB_SCTLR 0x0
#define ARM_SMMU_SCTLR_WACFG GENMASK(27, 26)
#define ARM_SMMU_SCTLR_WACFG_WA 0x2
#define ARM_SMMU_SCTLR_RACFG GENMASK(25, 24)
#define ARM_SMMU_SCTLR_RACFG_RA 0x2
#define ARM_SMMU_SCTLR_SHCFG GENMASK(23, 22)
#define ARM_SMMU_SCTLR_SHCFG_OSH 0x1
#define ARM_SMMU_SCTLR_SHCFG_NSH 0x3
#define ARM_SMMU_SCTLR_MTCFG BIT(20)
#define ARM_SMMU_SCTLR_MEM_ATTR GENMASK(19, 16)
#define ARM_SMMU_SCTLR_MEM_ATTR_OISH_WB_CACHE 0xf
#define ARM_SMMU_SCTLR_S1_ASIDPNE BIT(12)
#define ARM_SMMU_SCTLR_CFCFG BIT(7)
#define ARM_SMMU_SCTLR_HUPCF BIT(8)
@@ -156,6 +175,7 @@ enum arm_smmu_cbar_type {
#define ARM_SMMU_CB_RESUME 0x8
#define ARM_SMMU_RESUME_TERMINATE BIT(0)
#define ARM_SMMU_RESUME_RESUME 0
#define ARM_SMMU_CB_TCR2 0x10
#define ARM_SMMU_TCR2_SEP GENMASK(17, 15)
@@ -219,10 +239,19 @@ enum arm_smmu_cbar_type {
ARM_SMMU_FSR_TF | \
ARM_SMMU_FSR_IGN)
#define ARM_SMMU_CB_FSRRESTORE 0x5c
#define ARM_SMMU_CB_FAR 0x60
#define ARM_SMMU_CB_FSYNR0 0x68
#define ARM_SMMU_FSYNR0_WNR BIT(4)
#define ARM_SMMU_FSYNR0_PNU BIT(5)
#define ARM_SMMU_FSYNR0_IND BIT(6)
#define ARM_SMMU_FSYNR0_NSATTR BIT(8)
#define ARM_SMMU_CB_FSYNR1 0x6c
#define ARM_SMMU_FSYNR1_BID GENMASK(15, 13)
#define ARM_SMMU_FSYNR1_PID GENMASK(12, 8)
#define ARM_SMMU_FSYNR1_MID GENMASK(7, 0)
#define ARM_SMMU_CB_FSYNR1 0x6c
@@ -235,6 +264,24 @@ enum arm_smmu_cbar_type {
#define ARM_SMMU_CB_TLBSTATUS 0x7f4
#define ARM_SMMU_CB_ATS1PR 0x800
/* Implementation Defined Register Space 5 registers */
/* Relative to IMPL_DEF5 page */
#define ARM_SMMU_STATS_SYNC_INV_TBU_ACK 0x5dc
#define TBU_SYNC_ACK GENMASK(31, 17)
#define TBU_SYNC_REQ BIT(16)
#define TBU_INV_ACK GENMASK(15, 1)
#define TBU_INV_REQ BIT(0)
#define APPS_SMMU_TBU_REG_ACCESS_REQ_NS 0x5f8
#define APPS_SMMU_TBU_REG_ACCESS_ACK_NS 0x5fc
/* Relative to SMMU_BASE */
#define ARM_SMMU_TBU_PWR_STATUS 0x2204
/* Relative to SMMU_BASE */
#define ARM_SMMU_MMU2QSS_AND_SAFE_WAIT_CNTR 0x2670
#define TCU_SYNC_IN_PRGSS BIT(20)
#define TCU_INV_IN_PRGSS BIT(16)
#define ARM_SMMU_CB_ATSR 0x8f0
#define ARM_SMMU_ATSR_ACTIVE BIT(0)
@@ -242,8 +289,9 @@ enum arm_smmu_cbar_type {
/* Maximum number of context banks per SMMU */
#define ARM_SMMU_MAX_CBS 128
#define TLB_LOOP_TIMEOUT 1000000 /* 1s! */
#define TLB_LOOP_TIMEOUT 500000 /* 500ms */
#define TLB_SPIN_COUNT 10
#define TLB_LOOP_INC_MAX 1000 /* 1ms */
/* Shared driver definitions */
enum arm_smmu_arch_version {
@@ -257,6 +305,33 @@ enum arm_smmu_implementation {
ARM_MMU500,
CAVIUM_SMMUV2,
QCOM_SMMUV2,
QCOM_SMMUV500,
};
/*
* Describes resources required for on/off power operation.
* Separate reference count is provided for atomic/nonatomic
* operations.
* gdscs - on kernel 6.6, power domains are used instead. This
field can be removed once no legacy targets use it.
*/
struct arm_smmu_power_resources {
struct device *dev;
struct clk **clocks;
int num_clocks;
struct regulator_bulk_data *gdscs;
int num_gdscs;
struct icc_path *icc_path;
/* Protects power_count */
struct mutex power_lock;
int power_count;
int (*resume)(struct arm_smmu_power_resources *pwr);
void (*suspend)(struct arm_smmu_power_resources *pwr);
};
struct arm_smmu_s2cr {
@@ -265,6 +340,7 @@ struct arm_smmu_s2cr {
enum arm_smmu_s2cr_type type;
enum arm_smmu_s2cr_privcfg privcfg;
u8 cbndx;
bool pinned;
};
struct arm_smmu_smr {
@@ -272,6 +348,7 @@ struct arm_smmu_smr {
u16 id;
bool valid;
bool pinned;
bool used;
};
struct arm_smmu_device {
@@ -297,6 +374,14 @@ struct arm_smmu_device {
#define ARM_SMMU_FEAT_EXIDS (1 << 12)
u32 features;
#define ARM_SMMU_OPT_FATAL_ASF (1 << 0)
#define ARM_SMMU_OPT_3LVL_TABLES (1 << 2)
#define ARM_SMMU_OPT_NO_ASID_RETENTION (1 << 3)
#define ARM_SMMU_OPT_DISABLE_ATOS (1 << 4)
#define ARM_SMMU_OPT_CONTEXT_FAULT_RETRY (1 << 5)
#define ARM_SMMU_OPT_MULTI_MATCH_HANDOFF_SMR (1 << 6)
#define ARM_SMMU_OPT_IGNORE_NUMPAGENDXB (1 << 7)
u32 options;
enum arm_smmu_arch_version version;
enum arm_smmu_implementation model;
const struct arm_smmu_impl *impl;
@@ -328,6 +413,17 @@ struct arm_smmu_device {
/* IOMMU core code handle */
struct iommu_device iommu;
/* Specific to QCOM */
struct arm_smmu_impl_def_reg *impl_def_attach_registers;
unsigned int num_impl_def_attach_registers;
struct arm_smmu_power_resources *pwr;
/* used for qsmmuv500 scm_io_readl */
phys_addr_t phys_addr;
unsigned long sync_timed_out;
};
enum arm_smmu_context_fmt {
@@ -344,6 +440,19 @@ struct arm_smmu_cfg {
u16 asid;
u16 vmid;
};
u32 procid;
struct {
u32 wacfg:2;
u32 racfg:2;
u32 shcfg:2;
u32 mtcfg:1;
u32 memattr:4;
u32 hupcf:1;
u32 cfcfg:1;
u32 cfre:1;
u32 m:1;
} sctlr;
enum arm_smmu_cbar_type cbar;
enum arm_smmu_context_fmt fmt;
bool flush_walk_prefer_tlbiasid;
@@ -354,6 +463,7 @@ struct arm_smmu_cb {
u64 ttbr[2];
u32 tcr[2];
u32 mair[2];
u32 sctlr;
struct arm_smmu_cfg *cfg;
};
@@ -364,16 +474,63 @@ enum arm_smmu_domain_stage {
ARM_SMMU_DOMAIN_BYPASS,
};
struct arm_smmu_fault_model {
char non_fatal : 1;
char no_cfre : 1;
char no_stall : 1;
char hupcf : 1;
};
struct arm_smmu_mapping_cfg {
char s1_bypass : 1;
char atomic : 1;
char fast : 1;
};
struct qcom_iommu_fault_param {
struct device *dev;
fault_handler_irq_t handler;
void *token;
};
struct arm_smmu_domain {
struct arm_smmu_device *smmu;
struct device *dev;
struct io_pgtable_ops *pgtbl_ops;
unsigned long pgtbl_quirks;
bool force_coherent_walk;
const struct iommu_flush_ops *flush_ops;
struct arm_smmu_cfg cfg;
enum arm_smmu_domain_stage stage;
struct mutex init_mutex; /* Protects smmu pointer */
spinlock_t cb_lock; /* Serialises ATS1* ops and TLB syncs */
spinlock_t cb_lock; /* Serialises ATS1* ops */
spinlock_t sync_lock; /* Serialises TLB syncs */
struct arm_smmu_fault_model fault_model;
struct arm_smmu_mapping_cfg mapping_cfg;
bool delayed_s1_trans_enable;
u32 secure_vmid;
fault_handler_irq_t fault_handler_irq;
void *handler_irq_token;
struct qcom_iommu_fault_param fault_param;
/*
* Track PMDs which require tlb invalidate prior to being
* freed, or before their iovas can be reused by iommu_map().
*/
spinlock_t iotlb_gather_lock;
struct list_head *freelist;
bool deferred_flush;
struct iommu_domain domain;
/* mapping_cfg.atomic indicates that runtime power management should be disabled. */
bool rpm_always_on;
/* Skip TLB management. */
bool skip_tlb_management;
#ifdef CONFIG_ARM_SMMU_CONTEXT_FAULT_RETRY
u64 prev_fault_address;
u32 fault_retry_counter;
#endif
};
struct arm_smmu_master_cfg {
@@ -420,7 +577,28 @@ static inline u32 arm_smmu_lpae_vtcr(const struct io_pgtable_cfg *cfg)
FIELD_PREP(ARM_SMMU_VTCR_T0SZ, cfg->arm_lpae_s2_cfg.vtcr.tsz);
}
static inline u32 arm_smmu_lpae_sctlr(struct arm_smmu_cfg *cfg)
{
bool stage1 = cfg->cbar != CBAR_TYPE_S2_TRANS;
return FIELD_PREP(ARM_SMMU_SCTLR_WACFG, cfg->sctlr.wacfg) |
FIELD_PREP(ARM_SMMU_SCTLR_RACFG, cfg->sctlr.racfg) |
FIELD_PREP(ARM_SMMU_SCTLR_SHCFG, cfg->sctlr.shcfg) |
FIELD_PREP(ARM_SMMU_SCTLR_MTCFG, cfg->sctlr.mtcfg) |
FIELD_PREP(ARM_SMMU_SCTLR_MEM_ATTR, cfg->sctlr.memattr) |
FIELD_PREP(ARM_SMMU_SCTLR_S1_ASIDPNE, stage1) |
FIELD_PREP(ARM_SMMU_SCTLR_HUPCF, cfg->sctlr.hupcf) |
FIELD_PREP(ARM_SMMU_SCTLR_CFCFG, cfg->sctlr.cfcfg) |
ARM_SMMU_SCTLR_CFIE |
FIELD_PREP(ARM_SMMU_SCTLR_CFRE, cfg->sctlr.cfre) |
FIELD_PREP(ARM_SMMU_SCTLR_E, IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) |
ARM_SMMU_SCTLR_AFE |
ARM_SMMU_SCTLR_TRE |
FIELD_PREP(ARM_SMMU_SCTLR_M, cfg->sctlr.m);
}
/* Implementation details, yay! */
struct arm_smmu_impl {
u32 (*read_reg)(struct arm_smmu_device *smmu, int page, int offset);
void (*write_reg)(struct arm_smmu_device *smmu, int page, int offset,
@@ -432,6 +610,13 @@ struct arm_smmu_impl {
int (*reset)(struct arm_smmu_device *smmu);
int (*init_context)(struct arm_smmu_domain *smmu_domain,
struct io_pgtable_cfg *cfg, struct device *dev);
void (*init_context_bank)(struct arm_smmu_domain *smmu_domain,
struct device *dev);
phys_addr_t (*iova_to_phys_hard)(struct arm_smmu_domain *smmu_domain,
struct qcom_iommu_atos_txn *txn);
void (*tlb_sync_timeout)(struct arm_smmu_device *smmu);
void (*device_remove)(struct arm_smmu_device *smmu);
int (*device_group)(struct device *dev, struct iommu_group *group);
void (*tlb_sync)(struct arm_smmu_device *smmu, int page, int sync,
int status);
int (*def_domain_type)(struct device *dev);
@@ -503,6 +688,15 @@ static inline void arm_smmu_writeq(struct arm_smmu_device *smmu, int page,
#define ARM_SMMU_GR0 0
#define ARM_SMMU_GR1 1
/*
* Implementation defined space starts after SMMU GR space, so IMPL_DEF page n
* is page n + 2 in the SMMU register space.
*/
#define ARM_SMMU_IMPL_DEF0 2
#define ARM_SMMU_IMPL_DEF4 6
#define ARM_SMMU_IMPL_DEF5 7
#define ARM_SMMU_CB(s, n) ((s)->numpage + (n))
#define arm_smmu_gr0_read(s, o) \
@@ -527,8 +721,22 @@ static inline void arm_smmu_writeq(struct arm_smmu_device *smmu, int page,
struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu);
struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu);
struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu);
struct arm_smmu_device *qsmmuv500_impl_init(struct arm_smmu_device *smmu);
struct arm_smmu_device *qsmmuv2_impl_init(struct arm_smmu_device *smmu);
struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu);
void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx);
int arm_mmu500_reset(struct arm_smmu_device *smmu);
int arm_smmu_power_on(struct arm_smmu_power_resources *pwr);
void arm_smmu_power_off(struct arm_smmu_device *smmu,
struct arm_smmu_power_resources *pwr);
struct arm_smmu_power_resources *arm_smmu_init_power_resources(
struct device *dev);
extern struct platform_driver qsmmuv500_tbu_driver;
/* Misc. constants */
#define ARM_MMU500_ACR_CACHE_LOCK (1 << 26)
#endif /* _ARM_SMMU_H */
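To illustrate the implementation-defined register pages added above, a hypothetical helper (assuming the arm_smmu_readl() accessor from this header) could read the TBU sync/invalidate ack register relative to the IMPL_DEF5 page:

/* Hypothetical helper: IMPL_DEF5 maps to SMMU page 7 per the n + 2 rule above. */
static inline u32 example_read_tbu_sync_ack(struct arm_smmu_device *smmu)
{
	return arm_smmu_readl(smmu, ARM_SMMU_IMPL_DEF5,
			      ARM_SMMU_STATS_SYNC_INV_TBU_ACK);
}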

File diff suppressed because it is too large.


@@ -0,0 +1,841 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2016-2021, The Linux Foundation. All rights reserved.
* Copyright (c) 2022-2024 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#define pr_fmt(fmt) "io-pgtable-fast: " fmt
#include <linux/iommu.h>
#include <linux/kernel.h>
#include <linux/scatterlist.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/io-pgtable.h>
#include <linux/io-pgtable-fast.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/dma-mapping.h>
#include <linux/qcom-iommu-util.h>
#include <linux/qcom-io-pgtable.h>
#define AV8L_FAST_MAX_ADDR_BITS 48
/* Page table bits */
#define AV8L_FAST_PTE_TYPE_SHIFT 0
#define AV8L_FAST_PTE_TYPE_MASK 0x3
#define AV8L_FAST_PTE_TYPE_BLOCK 1
#define AV8L_FAST_PTE_TYPE_TABLE 3
#define AV8L_FAST_PTE_TYPE_PAGE 3
#define AV8L_FAST_PTE_NSTABLE (((av8l_fast_iopte)1) << 63)
#define AV8L_FAST_PTE_XN (((av8l_fast_iopte)3) << 53)
#define AV8L_FAST_PTE_AF (((av8l_fast_iopte)1) << 10)
#define AV8L_FAST_PTE_SH_NS (((av8l_fast_iopte)0) << 8)
#define AV8L_FAST_PTE_SH_OS (((av8l_fast_iopte)2) << 8)
#define AV8L_FAST_PTE_SH_IS (((av8l_fast_iopte)3) << 8)
#define AV8L_FAST_PTE_SH_MASK (((av8l_fast_iopte)3) << 8)
#define AV8L_FAST_PTE_NS (((av8l_fast_iopte)1) << 5)
#define AV8L_FAST_PTE_VALID (((av8l_fast_iopte)1) << 0)
#define AV8L_FAST_PTE_ATTR_LO_MASK (((av8l_fast_iopte)0x3ff) << 2)
/* Ignore the contiguous bit for block splitting */
#define AV8L_FAST_PTE_ATTR_HI_MASK (((av8l_fast_iopte)6) << 52)
#define AV8L_FAST_PTE_ATTR_MASK (AV8L_FAST_PTE_ATTR_LO_MASK | \
AV8L_FAST_PTE_ATTR_HI_MASK)
#define AV8L_FAST_PTE_ADDR_MASK ((av8l_fast_iopte)0xfffffffff000)
/* Stage-1 PTE */
#define AV8L_FAST_PTE_AP_UNPRIV (((av8l_fast_iopte)1) << 6)
#define AV8L_FAST_PTE_AP_RDONLY (((av8l_fast_iopte)2) << 6)
#define AV8L_FAST_PTE_ATTRINDX_SHIFT 2
#define AV8L_FAST_PTE_ATTRINDX_MASK 0x7
#define AV8L_FAST_PTE_nG (((av8l_fast_iopte)1) << 11)
/* Stage-2 PTE */
#define AV8L_FAST_PTE_HAP_FAULT (((av8l_fast_iopte)0) << 6)
#define AV8L_FAST_PTE_HAP_READ (((av8l_fast_iopte)1) << 6)
#define AV8L_FAST_PTE_HAP_WRITE (((av8l_fast_iopte)2) << 6)
#define AV8L_FAST_PTE_MEMATTR_OIWB (((av8l_fast_iopte)0xf) << 2)
#define AV8L_FAST_PTE_MEMATTR_NC (((av8l_fast_iopte)0x5) << 2)
#define AV8L_FAST_PTE_MEMATTR_DEV (((av8l_fast_iopte)0x1) << 2)
/* Register bits */
#define ARM_32_LPAE_TCR_EAE (1 << 31)
#define ARM_64_LPAE_S2_TCR_RES1 (1 << 31)
#define AV8L_FAST_TCR_TG0_4K (0 << 14)
#define AV8L_FAST_TCR_TG0_64K (1 << 14)
#define AV8L_FAST_TCR_TG0_16K (2 << 14)
#define AV8L_FAST_TCR_SH0_SHIFT 12
#define AV8L_FAST_TCR_SH0_MASK 0x3
#define AV8L_FAST_TCR_SH_NS 0
#define AV8L_FAST_TCR_SH_OS 2
#define AV8L_FAST_TCR_SH_IS 3
#define AV8L_FAST_TCR_ORGN0_SHIFT 10
#define AV8L_FAST_TCR_IRGN0_SHIFT 8
#define AV8L_FAST_TCR_RGN_MASK 0x3
#define AV8L_FAST_TCR_RGN_NC 0
#define AV8L_FAST_TCR_RGN_WBWA 1
#define AV8L_FAST_TCR_RGN_WT 2
#define AV8L_FAST_TCR_RGN_WB 3
#define AV8L_FAST_TCR_SL0_SHIFT 6
#define AV8L_FAST_TCR_SL0_MASK 0x3
#define AV8L_FAST_TCR_T0SZ_SHIFT 0
#define AV8L_FAST_TCR_SZ_MASK 0xf
#define AV8L_FAST_TCR_PS_SHIFT 16
#define AV8L_FAST_TCR_PS_MASK 0x7
#define AV8L_FAST_TCR_IPS_SHIFT 32
#define AV8L_FAST_TCR_IPS_MASK 0x7
#define AV8L_FAST_TCR_PS_32_BIT 0x0ULL
#define AV8L_FAST_TCR_PS_36_BIT 0x1ULL
#define AV8L_FAST_TCR_PS_40_BIT 0x2ULL
#define AV8L_FAST_TCR_PS_42_BIT 0x3ULL
#define AV8L_FAST_TCR_PS_44_BIT 0x4ULL
#define AV8L_FAST_TCR_PS_48_BIT 0x5ULL
#define AV8L_FAST_TCR_EPD1_SHIFT 23
#define AV8L_FAST_TCR_EPD1_FAULT 1
#define AV8L_FAST_MAIR_ATTR_SHIFT(n) ((n) << 3)
#define AV8L_FAST_MAIR_ATTR_MASK 0xff
#define AV8L_FAST_MAIR_ATTR_DEVICE 0x04
#define AV8L_FAST_MAIR_ATTR_NC 0x44
#define AV8L_FAST_MAIR_ATTR_WBRWA 0xff
#define AV8L_FAST_MAIR_ATTR_UPSTREAM 0xf4
#define AV8L_FAST_MAIR_ATTR_IDX_NC 0
#define AV8L_FAST_MAIR_ATTR_IDX_CACHE 1
#define AV8L_FAST_MAIR_ATTR_IDX_DEV 2
#define AV8L_FAST_MAIR_ATTR_IDX_UPSTREAM 3
#define AV8L_FAST_PAGE_SHIFT 12
#define PTE_MAIR_IDX(pte) \
((pte >> AV8L_FAST_PTE_ATTRINDX_SHIFT) & \
AV8L_FAST_PTE_ATTRINDX_MASK)
#define PTE_SH_IDX(pte) (pte & AV8L_FAST_PTE_SH_MASK)
#define iopte_pmd_offset(pmds, base, iova) (pmds + ((iova - base) >> 12))
static inline dma_addr_t av8l_dma_addr(void *addr)
{
if (is_vmalloc_addr(addr))
return page_to_phys(vmalloc_to_page(addr)) +
offset_in_page(addr);
return virt_to_phys(addr);
}
static void __av8l_clean_range(struct device *dev, void *start, void *end)
{
size_t size;
void *region_end;
unsigned long page_end;
if (is_vmalloc_addr(start)) {
while (start < end) {
page_end = round_down((unsigned long)start + PAGE_SIZE,
PAGE_SIZE);
region_end = min_t(void *, end, page_end);
size = region_end - start;
dma_sync_single_for_device(dev, av8l_dma_addr(start),
size, DMA_TO_DEVICE);
start = region_end;
}
} else {
size = end - start;
dma_sync_single_for_device(dev, av8l_dma_addr(start), size,
DMA_TO_DEVICE);
}
}
static void av8l_clean_range(struct io_pgtable_cfg *cfg, av8l_fast_iopte *start,
av8l_fast_iopte *end)
{
if (!cfg->coherent_walk)
__av8l_clean_range(cfg->iommu_dev, start, end);
}
#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST_PROVE_TLB
#include <linux/notifier.h>
static ATOMIC_NOTIFIER_HEAD(av8l_notifier_list);
void av8l_register_notify(struct notifier_block *nb)
{
atomic_notifier_chain_register(&av8l_notifier_list, nb);
}
EXPORT_SYMBOL(av8l_register_notify);
static void __av8l_check_for_stale_tlb(av8l_fast_iopte *ptep)
{
if (unlikely(*ptep)) {
atomic_notifier_call_chain(
&av8l_notifier_list, MAPPED_OVER_STALE_TLB,
(void *) ptep);
pr_err("Tried to map over a non-vacant pte: 0x%llx @ %p\n",
*ptep, ptep);
pr_err("Nearby memory:\n");
print_hex_dump(KERN_ERR, "pgtbl: ", DUMP_PREFIX_ADDRESS,
32, 8, ptep - 16, 32 * sizeof(*ptep), false);
}
}
void av8l_fast_clear_stale_ptes(struct io_pgtable_ops *ops, u64 base,
u64 end, bool skip_sync)
{
int i;
struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
struct io_pgtable *iop = iof_pgtable_ops_to_pgtable(ops);
av8l_fast_iopte *pmdp = iopte_pmd_offset(data->pmds, data->base, base);
for (i = base >> AV8L_FAST_PAGE_SHIFT;
i <= (end >> AV8L_FAST_PAGE_SHIFT); ++i) {
if (!(*pmdp & AV8L_FAST_PTE_VALID)) {
*pmdp = 0;
if (!skip_sync)
av8l_clean_range(&iop->cfg, pmdp, pmdp + 1);
}
pmdp++;
}
}
#else
static void __av8l_check_for_stale_tlb(av8l_fast_iopte *ptep)
{
}
#endif
static av8l_fast_iopte
av8l_fast_prot_to_pte(struct av8l_fast_io_pgtable *data, int prot)
{
av8l_fast_iopte pte = AV8L_FAST_PTE_TYPE_PAGE
| AV8L_FAST_PTE_AF
| AV8L_FAST_PTE_nG;
if (!(prot & IOMMU_WRITE) && (prot & IOMMU_READ))
pte |= AV8L_FAST_PTE_AP_RDONLY;
if (!(prot & IOMMU_PRIV))
pte |= AV8L_FAST_PTE_AP_UNPRIV;
if (prot & IOMMU_MMIO)
pte |= (AV8L_FAST_MAIR_ATTR_IDX_DEV
<< AV8L_FAST_PTE_ATTRINDX_SHIFT);
else if (prot & IOMMU_CACHE)
pte |= (AV8L_FAST_MAIR_ATTR_IDX_CACHE
<< AV8L_FAST_PTE_ATTRINDX_SHIFT);
else if (prot & IOMMU_SYS_CACHE)
pte |= (AV8L_FAST_MAIR_ATTR_IDX_UPSTREAM
<< AV8L_FAST_PTE_ATTRINDX_SHIFT);
if (prot & IOMMU_CACHE)
pte |= AV8L_FAST_PTE_SH_IS;
else
pte |= AV8L_FAST_PTE_SH_OS;
if (prot & IOMMU_NOEXEC)
pte |= AV8L_FAST_PTE_XN;
return pte;
}
static int av8l_fast_map(struct io_pgtable_ops *ops, unsigned long iova,
phys_addr_t paddr, size_t size, int prot, gfp_t gfp)
{
struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
struct io_pgtable *iop = iof_pgtable_ops_to_pgtable(ops);
av8l_fast_iopte *ptep = iopte_pmd_offset(data->pmds, data->base, iova);
unsigned long i, nptes = size >> AV8L_FAST_PAGE_SHIFT;
av8l_fast_iopte pte;
pte = av8l_fast_prot_to_pte(data, prot);
paddr &= AV8L_FAST_PTE_ADDR_MASK;
for (i = 0; i < nptes; i++, paddr += SZ_4K) {
__av8l_check_for_stale_tlb(ptep + i);
*(ptep + i) = pte | paddr;
}
av8l_clean_range(&iop->cfg, ptep, ptep + nptes);
return 0;
}
int av8l_fast_map_public(struct io_pgtable_ops *ops, unsigned long iova,
phys_addr_t paddr, size_t size, int prot)
{
return av8l_fast_map(ops, iova, paddr, size, prot, GFP_ATOMIC);
}
static int av8l_fast_map_pages(struct io_pgtable_ops *ops, unsigned long iova, phys_addr_t paddr,
size_t pgsize, size_t pgcount, int prot, gfp_t gfp,
size_t *mapped)
{
int ret = av8l_fast_map(ops, iova, paddr, pgsize * pgcount, prot, gfp);
if (!ret)
*mapped = pgsize * pgcount;
return ret;
}
static size_t
__av8l_fast_unmap(struct io_pgtable_ops *ops, unsigned long iova,
size_t size, bool allow_stale_tlb)
{
struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
struct io_pgtable *iop = iof_pgtable_ops_to_pgtable(ops);
unsigned long nptes;
av8l_fast_iopte *ptep;
int val = allow_stale_tlb
? AV8L_FAST_PTE_UNMAPPED_NEED_TLBI
: 0;
ptep = iopte_pmd_offset(data->pmds, data->base, iova);
nptes = size >> AV8L_FAST_PAGE_SHIFT;
memset(ptep, val, sizeof(*ptep) * nptes);
av8l_clean_range(&iop->cfg, ptep, ptep + nptes);
if (!allow_stale_tlb)
io_pgtable_tlb_flush_all(&data->iop);
return size;
}
/* caller must take care of tlb cache maintenance */
void av8l_fast_unmap_public(struct io_pgtable_ops *ops, unsigned long iova,
size_t size)
{
__av8l_fast_unmap(ops, iova, size, true);
}
static size_t av8l_fast_unmap_pages(struct io_pgtable_ops *ops, unsigned long iova, size_t pgsize,
size_t pgcount, struct iommu_iotlb_gather *gather)
{
return __av8l_fast_unmap(ops, iova, pgsize * pgcount, false);
}
/* TODO: Add this back in android-mainline */
static int __maybe_unused av8l_fast_map_sg(struct io_pgtable_ops *ops,
unsigned long iova, struct scatterlist *sgl,
unsigned int nents, int prot, gfp_t gfp, size_t *mapped)
{
struct scatterlist *sg;
int i;
for_each_sg(sgl, sg, nents, i) {
av8l_fast_map(ops, iova, sg_phys(sg), sg->length, prot, gfp);
iova += sg->length;
*mapped += sg->length;
}
return 0;
}
int av8l_fast_map_sg_public(struct io_pgtable_ops *ops,
unsigned long iova, struct scatterlist *sgl,
unsigned int nents, int prot, size_t *mapped)
{
return av8l_fast_map_sg(ops, iova, sgl, nents, prot, GFP_ATOMIC, mapped);
}
#if defined(CONFIG_ARM64)
#define FAST_PGDNDX(va) (((va) & 0x7fc0000000) >> 27)
#elif defined(CONFIG_ARM)
#define FAST_PGDNDX(va) (((va) & 0xc0000000) >> 27)
#endif
static phys_addr_t av8l_fast_iova_to_phys(struct io_pgtable_ops *ops,
unsigned long iova)
{
struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
av8l_fast_iopte pte, *pgdp, *pudp, *pmdp;
unsigned long pgd;
phys_addr_t phys;
const unsigned long pts = AV8L_FAST_PTE_TYPE_SHIFT;
const unsigned long ptm = AV8L_FAST_PTE_TYPE_MASK;
const unsigned long ptt = AV8L_FAST_PTE_TYPE_TABLE;
const unsigned long ptp = AV8L_FAST_PTE_TYPE_PAGE;
const av8l_fast_iopte am = AV8L_FAST_PTE_ADDR_MASK;
/* TODO: clean up some of these magic numbers... */
pgd = (unsigned long)data->pgd | FAST_PGDNDX(iova);
pgdp = (av8l_fast_iopte *)pgd;
pte = *pgdp;
if (((pte >> pts) & ptm) != ptt)
return 0;
pudp = phys_to_virt((pte & am) | ((iova & 0x3fe00000) >> 18));
pte = *pudp;
if (((pte >> pts) & ptm) != ptt)
return 0;
pmdp = phys_to_virt((pte & am) | ((iova & 0x1ff000) >> 9));
pte = *pmdp;
if (((pte >> pts) & ptm) != ptp)
return 0;
phys = pte & am;
return phys | (iova & 0xfff);
}
phys_addr_t av8l_fast_iova_to_phys_public(struct io_pgtable_ops *ops,
unsigned long iova)
{
return av8l_fast_iova_to_phys(ops, iova);
}
static bool av8l_fast_iova_coherent(struct io_pgtable_ops *ops,
unsigned long iova)
{
struct av8l_fast_io_pgtable *data = iof_pgtable_ops_to_data(ops);
av8l_fast_iopte *ptep = iopte_pmd_offset(data->pmds, data->base, iova);
return ((PTE_MAIR_IDX(*ptep) == AV8L_FAST_MAIR_ATTR_IDX_CACHE) &&
((PTE_SH_IDX(*ptep) == AV8L_FAST_PTE_SH_OS) ||
(PTE_SH_IDX(*ptep) == AV8L_FAST_PTE_SH_IS)));
}
bool av8l_fast_iova_coherent_public(struct io_pgtable_ops *ops,
unsigned long iova)
{
return av8l_fast_iova_coherent(ops, iova);
}
static struct av8l_fast_io_pgtable *
av8l_fast_alloc_pgtable_data(struct io_pgtable_cfg *cfg)
{
struct av8l_fast_io_pgtable *data;
data = kmalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return NULL;
data->iop.ops = (struct io_pgtable_ops) {
.map_pages = av8l_fast_map_pages,
.unmap_pages = av8l_fast_unmap_pages,
.iova_to_phys = av8l_fast_iova_to_phys,
};
return data;
}
/*
* We need max 1 page for the pgd, 4 pages for puds (1GB VA per pud page) and
* 2048 pages for pmds (each pud page contains 512 table entries, each
* pointing to a pmd).
*/
#define NUM_PGD_PAGES 1
#define NUM_PUD_PAGES 4
#define NUM_PMD_PAGES 2048
#define NUM_PGTBL_PAGES (NUM_PGD_PAGES + NUM_PUD_PAGES + NUM_PMD_PAGES)
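Spelling out the arithmetic behind these limits, assuming the 32-bit VA / 4K page restriction imposed elsewhere in this file:

/*
 * For reference: a 32-bit (4 GB) VA space with 4 KB pages needs
 * 4 GB / 4 KB = 1,048,576 leaf PTEs. At 512 eight-byte entries per
 * 4 KB page that is 2048 last-level (pmd) pages; their 2048 table
 * entries need 4 pud pages (512 entries each); and the 4 pud entries
 * fit in a single pgd page, giving NUM_PGTBL_PAGES = 2053.
 */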
/* Undefine arch-specific definitions which depend on the page table format */
#undef pud_index
#undef pud_mask
#undef pud_next
#undef pmd_index
#undef pmd_mask
#undef pmd_next
#define pud_index(addr) (((addr) >> 30) & 0x3)
#define pud_mask(addr) ((addr) & ~((1UL << 30) - 1))
#define pud_next(addr, end) \
({ unsigned long __boundary = pud_mask(addr + (1UL << 30));\
(__boundary - 1 < (end) - 1) ? __boundary : (end); \
})
#define pmd_index(addr) (((addr) >> 21) & 0x1ff)
#define pmd_mask(addr) ((addr) & ~((1UL << 21) - 1))
#define pmd_next(addr, end) \
({ unsigned long __boundary = pmd_mask(addr + (1UL << 21));\
(__boundary - 1 < (end) - 1) ? __boundary : (end); \
})
static int
av8l_fast_prepopulate_pgtables(struct av8l_fast_io_pgtable *data,
struct io_pgtable_cfg *cfg, void *cookie)
{
int i, j, pg = 0;
struct page **pages, *page;
struct qcom_io_pgtable_info *pgtbl_info = to_qcom_io_pgtable_info(cfg);
dma_addr_t pud, pmd;
int pmd_pg_index;
dma_addr_t base = pgtbl_info->iova_base;
dma_addr_t end = pgtbl_info->iova_end;
pages = kmalloc(sizeof(*pages) * NUM_PGTBL_PAGES, __GFP_NOWARN |
__GFP_NORETRY);
if (!pages)
pages = vmalloc(sizeof(*pages) * NUM_PGTBL_PAGES);
if (!pages)
return -ENOMEM;
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page)
goto err_free_pages_arr;
pages[pg++] = page;
data->pgd = page_address(page);
/*
* We need max 2048 entries at level 2 to map 4GB of VA space. A page
* can hold 512 entries, so we need max 4 pages.
*/
for (i = pud_index(base), pud = base; pud < end;
++i, pud = pud_next(pud, end)) {
av8l_fast_iopte pte, *ptep;
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page)
goto err_free_pages;
pages[pg++] = page;
data->puds[i] = page_address(page);
pte = page_to_phys(page) | AV8L_FAST_PTE_TYPE_TABLE;
ptep = ((av8l_fast_iopte *)data->pgd) + i;
*ptep = pte;
}
av8l_clean_range(cfg, data->pgd, data->pgd + 4);
/*
* We have max 4 puds, each of which can point to 512 pmds, so we'll
* have max 2048 pmds, each of which can hold 512 ptes, for a grand
* total of 2048*512=1048576 PTEs.
*/
pmd_pg_index = pg;
for (i = pud_index(base), pud = base; pud < end;
++i, pud = pud_next(pud, end)) {
for (j = pmd_index(pud), pmd = pud; pmd < pud_next(pud, end);
++j, pmd = pmd_next(pmd, end)) {
av8l_fast_iopte pte, *pudp;
void *addr;
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!page)
goto err_free_pages;
pages[pg++] = page;
addr = page_address(page);
av8l_clean_range(cfg, addr, addr + SZ_4K);
pte = page_to_phys(page) | AV8L_FAST_PTE_TYPE_TABLE;
pudp = data->puds[i] + j;
*pudp = pte;
}
av8l_clean_range(cfg, data->puds[i], data->puds[i] + 512);
}
/*
* We map the pmds into a virtually contiguous space so that we
* don't have to traverse the first two levels of the page tables
* to find the appropriate pud. Instead, it will be a simple
* offset from the virtual base of the pmds.
*/
data->pmds = vmap(&pages[pmd_pg_index], pg - pmd_pg_index,
VM_IOREMAP, PAGE_KERNEL);
if (!data->pmds)
goto err_free_pages;
data->pages = pages;
data->base = base;
data->end = end;
data->nr_pages = pg;
return 0;
err_free_pages:
for (i = 0; i < pg; ++i)
__free_page(pages[i]);
err_free_pages_arr:
kvfree(pages);
return -ENOMEM;
}
static struct io_pgtable *
av8l_fast_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
u64 reg;
struct av8l_fast_io_pgtable *data =
av8l_fast_alloc_pgtable_data(cfg);
typeof(&cfg->arm_lpae_s1_cfg.tcr) tcr = &cfg->arm_lpae_s1_cfg.tcr;
if (!data)
return NULL;
/* restrict according to the fast map requirements */
cfg->ias = 32;
cfg->pgsize_bitmap = SZ_4K;
/* TCR */
if (cfg->coherent_walk) {
tcr->sh = AV8L_FAST_TCR_SH_IS;
tcr->irgn = AV8L_FAST_TCR_RGN_WBWA;
tcr->orgn = AV8L_FAST_TCR_RGN_WBWA;
if (WARN_ON(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
goto out_free_data;
} else {
tcr->sh = AV8L_FAST_TCR_SH_OS;
tcr->irgn = AV8L_FAST_TCR_RGN_NC;
if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
tcr->orgn = AV8L_FAST_TCR_RGN_NC;
else
tcr->orgn = AV8L_FAST_TCR_RGN_WBWA;
}
tcr->tg = AV8L_FAST_TCR_TG0_4K;
switch (cfg->oas) {
case 32:
tcr->ips = AV8L_FAST_TCR_PS_32_BIT;
break;
case 36:
tcr->ips = AV8L_FAST_TCR_PS_36_BIT;
break;
case 40:
tcr->ips = AV8L_FAST_TCR_PS_40_BIT;
break;
case 42:
tcr->ips = AV8L_FAST_TCR_PS_42_BIT;
break;
case 44:
tcr->ips = AV8L_FAST_TCR_PS_44_BIT;
break;
case 48:
tcr->ips = AV8L_FAST_TCR_PS_48_BIT;
break;
default:
goto out_free_data;
}
tcr->tsz = 64ULL - cfg->ias;
/* MAIRs */
reg = (AV8L_FAST_MAIR_ATTR_NC
<< AV8L_FAST_MAIR_ATTR_SHIFT(AV8L_FAST_MAIR_ATTR_IDX_NC)) |
(AV8L_FAST_MAIR_ATTR_WBRWA
<< AV8L_FAST_MAIR_ATTR_SHIFT(AV8L_FAST_MAIR_ATTR_IDX_CACHE)) |
(AV8L_FAST_MAIR_ATTR_DEVICE
<< AV8L_FAST_MAIR_ATTR_SHIFT(AV8L_FAST_MAIR_ATTR_IDX_DEV)) |
(AV8L_FAST_MAIR_ATTR_UPSTREAM
<< AV8L_FAST_MAIR_ATTR_SHIFT(AV8L_FAST_MAIR_ATTR_IDX_UPSTREAM));
cfg->arm_lpae_s1_cfg.mair = reg;
/* Allocate all page table memory! */
if (av8l_fast_prepopulate_pgtables(data, cfg, cookie))
goto out_free_data;
/* TTBRs */
cfg->arm_lpae_s1_cfg.ttbr = virt_to_phys(data->pgd);
return &data->iop;
out_free_data:
kfree(data);
return NULL;
}
static void av8l_fast_free_pgtable(struct io_pgtable *iop)
{
int i;
struct av8l_fast_io_pgtable *data = iof_pgtable_to_data(iop);
vunmap(data->pmds);
for (i = 0; i < data->nr_pages; ++i)
__free_page(data->pages[i]);
kvfree(data->pages);
kfree(data);
}
struct io_pgtable_init_fns io_pgtable_av8l_fast_init_fns = {
.alloc = av8l_fast_alloc_pgtable,
.free = av8l_fast_free_pgtable,
};
#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST_SELFTEST
#include <linux/dma-map-ops.h>
static struct io_pgtable_cfg *cfg_cookie;
static void dummy_tlb_flush_all(void *cookie)
{
WARN_ON(cookie != cfg_cookie);
}
static void dummy_tlb_flush(unsigned long iova, size_t size, size_t granule,
void *cookie)
{
WARN_ON(cookie != cfg_cookie);
}
static void dummy_tlb_add_page(struct iommu_iotlb_gather *gather,
unsigned long iova, size_t granule, void *cookie)
{
dummy_tlb_flush(iova, granule, granule, cookie);
}
static struct iommu_flush_ops dummy_tlb_ops __initdata = {
.tlb_flush_all = dummy_tlb_flush_all,
.tlb_flush_walk = dummy_tlb_flush,
.tlb_add_page = dummy_tlb_add_page,
};
/*
* Returns true if the iova range is successfully mapped to the contiguous
* phys range in ops.
*/
static bool av8l_fast_range_has_specific_mapping(struct io_pgtable_ops *ops,
const unsigned long iova_start,
const phys_addr_t phys_start,
const size_t size)
{
u64 iova = iova_start;
phys_addr_t phys = phys_start;
while (iova < (iova_start + size)) {
/* + 42 just to make sure offsetting is working */
if (ops->iova_to_phys(ops, iova + 42) != (phys + 42))
return false;
iova += SZ_4K;
phys += SZ_4K;
}
return true;
}
static int __init av8l_fast_positive_testing(void)
{
int failed = 0;
u64 iova;
struct io_pgtable_ops *ops;
struct qcom_io_pgtable_info pgtable_info;
struct av8l_fast_io_pgtable *data;
av8l_fast_iopte *pmds;
u64 max = SZ_1G * 4ULL - 1;
u64 base = 0;
pgtable_info.iova_base = base;
pgtable_info.iova_end = max;
pgtable_info.pgtbl_cfg = (struct io_pgtable_cfg) {
.quirks = 0,
.tlb = &dummy_tlb_ops,
.ias = 32,
.oas = 32,
.pgsize_bitmap = SZ_4K,
.coherent_walk = true,
};
cfg_cookie = &pgtable_info.pgtbl_cfg;
ops = alloc_io_pgtable_ops(ARM_V8L_FAST, &pgtable_info.pgtbl_cfg,
&pgtable_info.pgtbl_cfg);
if (WARN_ON(!ops))
return 1;
data = iof_pgtable_ops_to_data(ops);
pmds = data->pmds;
/* map the entire 4GB VA space with 4K map calls */
for (iova = base; iova < max; iova += SZ_4K) {
if (WARN_ON(ops->map(ops, iova, iova, SZ_4K, IOMMU_READ))) {
failed++;
continue;
}
}
if (WARN_ON(!av8l_fast_range_has_specific_mapping(ops, base,
base, max - base)))
failed++;
/* unmap it all */
for (iova = base; iova < max; iova += SZ_4K) {
if (WARN_ON(ops->unmap(ops, iova, SZ_4K, NULL) != SZ_4K))
failed++;
}
/* sweep up TLB proving PTEs */
av8l_fast_clear_stale_ptes(ops, base, max, false);
/* map the entire 4GB VA space with 8K map calls */
for (iova = base; iova < max; iova += SZ_8K) {
if (WARN_ON(ops->map(ops, iova, iova, SZ_8K, IOMMU_READ))) {
failed++;
continue;
}
}
if (WARN_ON(!av8l_fast_range_has_specific_mapping(ops, base,
base, max - base)))
failed++;
/* unmap it all with 8K unmap calls */
for (iova = base; iova < max; iova += SZ_8K) {
if (WARN_ON(ops->unmap(ops, iova, SZ_8K, NULL) != SZ_8K))
failed++;
}
/* sweep up TLB proving PTEs */
av8l_fast_clear_stale_ptes(ops, base, max, false);
/* map the entire 4GB VA space with 16K map calls */
for (iova = base; iova < max; iova += SZ_16K) {
if (WARN_ON(ops->map(ops, iova, iova, SZ_16K, IOMMU_READ))) {
failed++;
continue;
}
}
if (WARN_ON(!av8l_fast_range_has_specific_mapping(ops, base,
base, max - base)))
failed++;
/* unmap it all */
for (iova = base; iova < max; iova += SZ_16K) {
if (WARN_ON(ops->unmap(ops, iova, SZ_16K, NULL) != SZ_16K))
failed++;
}
/* sweep up TLB proving PTEs */
av8l_fast_clear_stale_ptes(ops, base, max, false);
/* map the entire 4GB VA space with 64K map calls */
for (iova = base; iova < max; iova += SZ_64K) {
if (WARN_ON(ops->map(ops, iova, iova, SZ_64K, IOMMU_READ))) {
failed++;
continue;
}
}
if (WARN_ON(!av8l_fast_range_has_specific_mapping(ops, base,
base, max - base)))
failed++;
/* unmap it all at once */
if (WARN_ON(ops->unmap(ops, base, max - base, NULL) != (max - base)))
failed++;
free_io_pgtable_ops(ops);
return failed;
}
static int __init av8l_fast_do_selftests(void)
{
int failed = 0;
failed += av8l_fast_positive_testing();
pr_err("selftest: completed with %d failures\n", failed);
return 0;
}
subsys_initcall(av8l_fast_do_selftests);
#endif
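A rough sketch of how the *_public helpers above might be exercised by a caller such as a DMA mapping layer; example_fast_map_one() is hypothetical and not part of this commit:

/* Hypothetical caller: map one 4K page, check it, then unmap it. */
static int example_fast_map_one(struct io_pgtable_ops *ops,
				unsigned long iova, phys_addr_t paddr)
{
	int ret;

	ret = av8l_fast_map_public(ops, iova, paddr, SZ_4K,
				   IOMMU_READ | IOMMU_WRITE);
	if (ret)
		return ret;

	WARN_ON(av8l_fast_iova_to_phys_public(ops, iova) != paddr);

	/* Leaves a stale TLB entry; the caller owns TLB maintenance. */
	av8l_fast_unmap_public(ops, iova, SZ_4K);
	return 0;
}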


@@ -0,0 +1,188 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
* Copyright (c) 2022-2024 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#include <linux/bitfield.h>
#include <linux/module.h>
#include <linux/iommu.h>
#include <linux/qcom-io-pgtable.h>
#include <linux/slab.h>
#include "iommu-logger.h"
static DEFINE_MUTEX(iommu_debug_attachments_lock);
static LIST_HEAD(iommu_debug_attachments);
static unsigned int iommu_logger_pgtable_levels(struct io_pgtable *iop)
{
unsigned int va_bits, pte_size, bits_per_level, pg_shift;
unsigned long ias = iop->cfg.ias;
switch ((u32)iop->fmt) {
case ARM_32_LPAE_S1:
case ARM_64_LPAE_S1:
#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST
case ARM_V8L_FAST:
#endif
case QCOM_ARM_64_LPAE_S1:
pte_size = sizeof(u64);
break;
default:
return 0;
}
pg_shift = __ffs(iop->cfg.pgsize_bitmap);
bits_per_level = pg_shift - ilog2(pte_size);
va_bits = ias - pg_shift;
return DIV_ROUND_UP(va_bits, bits_per_level);
}
static enum iommu_logger_pgtable_fmt iommu_logger_pgtable_fmt_lut(
enum io_pgtable_fmt fmt)
{
switch ((u32)fmt) {
case ARM_32_LPAE_S1:
return IOMMU_LOGGER_ARM_32_LPAE_S1;
case ARM_64_LPAE_S1:
#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST
case ARM_V8L_FAST:
#endif
case QCOM_ARM_64_LPAE_S1:
return IOMMU_LOGGER_ARM_64_LPAE_S1;
default:
return IOMMU_LOGGER_MAX_PGTABLE_FMTS;
}
}
static int iommu_logger_domain_ttbrs(struct io_pgtable *iop, void **ttbr0_ptr,
void **ttbr1_ptr)
{
int ret;
u64 ttbr0;
switch ((u32)iop->fmt) {
case ARM_32_LPAE_S1:
case ARM_64_LPAE_S1:
#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST
case ARM_V8L_FAST:
#endif
case QCOM_ARM_64_LPAE_S1:
ttbr0 = iop->cfg.arm_lpae_s1_cfg.ttbr;
ret = 0;
break;
default:
ret = -EINVAL;
}
if (!ret) {
*ttbr0_ptr = phys_to_virt(ttbr0);
/*
* FIXME - fix ttbr1 retrieval later. In this kernel version
* struct io_pgtable no longer contains this information.
*/
*ttbr1_ptr = NULL;
}
return ret;
}
static struct iommu_debug_attachment *iommu_logger_init(
struct iommu_domain *domain,
struct device *dev,
struct io_pgtable *iop)
{
struct iommu_debug_attachment *logger;
char *client_name;
struct iommu_group *group;
unsigned int levels = iommu_logger_pgtable_levels(iop);
enum iommu_logger_pgtable_fmt fmt = iommu_logger_pgtable_fmt_lut(
iop->fmt);
void *ttbr0, *ttbr1;
int ret;
if (!levels || fmt == IOMMU_LOGGER_MAX_PGTABLE_FMTS)
return ERR_PTR(-EINVAL);
ret = iommu_logger_domain_ttbrs(iop, &ttbr0, &ttbr1);
if (ret)
return ERR_PTR(ret);
logger = kzalloc(sizeof(*logger), GFP_KERNEL);
if (!logger)
return ERR_PTR(-ENOMEM);
client_name = kasprintf(GFP_KERNEL, "%s", kobject_name(&dev->kobj));
if (!client_name) {
kfree(logger);
return ERR_PTR(-ENOMEM);
}
group = iommu_group_get(dev);
iommu_group_put(group);
INIT_LIST_HEAD(&logger->list);
logger->domain = domain;
logger->group = group;
logger->client_name = client_name;
logger->fmt = fmt;
logger->levels = levels;
logger->ttbr0 = ttbr0;
logger->ttbr1 = ttbr1;
logger->dev = dev;
return logger;
}
int iommu_logger_register(struct iommu_domain *domain, struct device *dev,
struct io_pgtable_ops *ops)
{
struct iommu_debug_attachment *logger;
struct io_pgtable *iop;
int ret = 0;
/* qcom,iommu-dma = "disabled" causes ops to be NULL */
if (!ops)
return 0;
if (!domain || !dev)
return -EINVAL;
iop = io_pgtable_ops_to_pgtable(ops);
mutex_lock(&iommu_debug_attachments_lock);
list_for_each_entry(logger, &iommu_debug_attachments, list)
if (logger->dev == dev && logger->domain == domain)
goto out;
logger = iommu_logger_init(domain, dev, iop);
if (IS_ERR(logger)) {
ret = PTR_ERR(logger);
goto out;
}
list_add(&logger->list, &iommu_debug_attachments);
out:
mutex_unlock(&iommu_debug_attachments_lock);
return ret;
}
EXPORT_SYMBOL(iommu_logger_register);
void iommu_logger_unregister(struct device *dev, struct iommu_domain *domain)
{
struct iommu_debug_attachment *logger, *tmp;
mutex_lock(&iommu_debug_attachments_lock);
list_for_each_entry_safe(logger, tmp, &iommu_debug_attachments, list) {
if (logger->dev == dev || logger->domain == domain) {
list_del(&logger->list);
kfree(logger->client_name);
kfree(logger);
}
}
mutex_unlock(&iommu_debug_attachments_lock);
}
EXPORT_SYMBOL(iommu_logger_unregister);
MODULE_DESCRIPTION("QTI IOMMU SUPPORT");
MODULE_LICENSE("GPL");
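To show where the logger hooks in, a hypothetical attach-path snippet (not part of this commit) that records a new attachment for debug tools:

/* Hypothetical attach path: register the attachment with the logger. */
static int example_attach_and_log(struct iommu_domain *domain,
				  struct device *dev,
				  struct io_pgtable_ops *pgtbl_ops)
{
	int ret;

	ret = iommu_attach_device(domain, dev);
	if (ret)
		return ret;

	/* pgtbl_ops may be NULL when qcom,iommu-dma is "disabled". */
	ret = iommu_logger_register(domain, dev, pgtbl_ops);
	if (ret)
		dev_warn(dev, "IOMMU logger registration failed: %d\n", ret);

	return 0;
}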


@@ -0,0 +1,53 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
* Copyright (c) 2023-2024 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#ifndef __LINUX_QTI_IOMMU_LOGGER_H
#define __LINUX_QTI_IOMMU_LOGGER_H
#include <linux/io-pgtable.h>
enum iommu_logger_pgtable_fmt {
IOMMU_LOGGER_ARM_32_LPAE_S1,
IOMMU_LOGGER_ARM_64_LPAE_S1,
IOMMU_LOGGER_MAX_PGTABLE_FMTS,
};
/*
* Each group may have more than one domain; but each domain may
* only have one group.
*/
struct iommu_debug_attachment {
struct iommu_domain *domain;
struct iommu_group *group;
char *client_name;
enum iommu_logger_pgtable_fmt fmt;
unsigned int levels;
/*
* Virtual addresses of the top-level page tables are stored here,
* as they are more useful for debug tools than physical addresses.
*/
void *ttbr0;
void *ttbr1;
struct list_head list;
struct device *dev;
};
#if IS_ENABLED(CONFIG_QTI_IOMMU_SUPPORT)
int iommu_logger_register(struct iommu_domain *domain, struct device *dev,
struct io_pgtable_ops *ops);
void iommu_logger_unregister(struct device *dev, struct iommu_domain *domain);
#else
static inline int iommu_logger_register(struct iommu_domain *domain,
struct device *dev,
struct io_pgtable_ops *ops)
{
return 0;
}
static inline void iommu_logger_unregister(struct device *dev, struct iommu_domain *domain) {}
#endif /* CONFIG_QTI_IOMMU_SUPPORT */
#endif /* __LINUX_QTI_IOMMU_LOGGER_H */


@@ -0,0 +1,481 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2015-2019, 2021, The Linux Foundation. All rights reserved.
* Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#include <linux/dma-map-ops.h>
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/err.h>
#include <asm/barrier.h>
#include <linux/msm_dma_iommu_mapping.h>
#include <linux/qcom-dma-mapping.h>
/**
* struct msm_iommu_map - represents a mapping of an ion buffer to an iommu
* @lnode - list node to exist in the buffer's list of iommu mappings
* @dev - Device this is mapped to. Used as key
* @sgl - The scatterlist for this mapping
* @nents - Number of entries in sgl
* @dir - The direction for the map.
@meta - Backpointer to the meta this mapping belongs to.
* @ref - for reference counting this mapping
* @attrs - dma mapping attributes
* @buf_start_addr - address of start of buffer
*
* Represents a mapping of one dma_buf buffer to a particular device
* and address range. There may exist other mappings of this buffer in
* different devices. All mappings will have the same cacheability and security.
*/
struct msm_iommu_map {
struct list_head lnode;
struct rb_node node;
struct device *dev;
struct scatterlist *sgl;
unsigned int nents;
enum dma_data_direction dir;
struct msm_iommu_meta *meta;
struct kref ref;
unsigned long attrs;
dma_addr_t buf_start_addr;
};
struct msm_iommu_meta {
struct rb_node node;
struct list_head iommu_maps;
struct kref ref;
struct mutex lock;
void *buffer;
};
static struct rb_root iommu_root;
static DEFINE_MUTEX(msm_iommu_map_mutex);
static void msm_iommu_meta_add(struct msm_iommu_meta *meta)
{
struct rb_root *root = &iommu_root;
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct msm_iommu_meta *entry;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct msm_iommu_meta, node);
if (meta->buffer < entry->buffer)
p = &(*p)->rb_left;
else if (meta->buffer > entry->buffer)
p = &(*p)->rb_right;
else
pr_err("%s: dma_buf %pK already exists\n", __func__,
entry->buffer);
}
rb_link_node(&meta->node, parent, p);
rb_insert_color(&meta->node, root);
}
static struct msm_iommu_meta *msm_iommu_meta_lookup(void *buffer)
{
struct rb_root *root = &iommu_root;
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
struct msm_iommu_meta *entry = NULL;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct msm_iommu_meta, node);
if (buffer < entry->buffer)
p = &(*p)->rb_left;
else if (buffer > entry->buffer)
p = &(*p)->rb_right;
else
return entry;
}
return NULL;
}
static void msm_iommu_add(struct msm_iommu_meta *meta,
struct msm_iommu_map *iommu)
{
INIT_LIST_HEAD(&iommu->lnode);
list_add(&iommu->lnode, &meta->iommu_maps);
}
static struct msm_iommu_map *msm_iommu_lookup(struct msm_iommu_meta *meta,
struct device *dev)
{
struct msm_iommu_map *entry;
list_for_each_entry(entry, &meta->iommu_maps, lnode) {
if (entry->dev == dev)
return entry;
}
return NULL;
}
static struct msm_iommu_meta *msm_iommu_meta_create(struct dma_buf *dma_buf)
{
struct msm_iommu_meta *meta;
meta = kzalloc(sizeof(*meta), GFP_KERNEL);
if (!meta)
return ERR_PTR(-ENOMEM);
INIT_LIST_HEAD(&meta->iommu_maps);
meta->buffer = dma_buf->priv;
kref_init(&meta->ref);
mutex_init(&meta->lock);
msm_iommu_meta_add(meta);
return meta;
}
static void msm_iommu_meta_put(struct msm_iommu_meta *meta);
static struct scatterlist *clone_sgl(struct scatterlist *sg, int nents)
{
struct scatterlist *next, *s;
int i;
struct sg_table table;
if (sg_alloc_table(&table, nents, GFP_KERNEL))
return NULL;
next = table.sgl;
for_each_sg(sg, s, nents, i) {
*next = *s;
next = sg_next(next);
}
return table.sgl;
}
static inline int __msm_dma_map_sg(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir,
struct dma_buf *dma_buf,
unsigned long attrs)
{
struct msm_iommu_map *iommu_map;
struct msm_iommu_meta *iommu_meta = NULL;
int ret = 0;
bool extra_meta_ref_taken = false;
int late_unmap = !(attrs & DMA_ATTR_NO_DELAYED_UNMAP);
mutex_lock(&msm_iommu_map_mutex);
iommu_meta = msm_iommu_meta_lookup(dma_buf->priv);
if (!iommu_meta) {
iommu_meta = msm_iommu_meta_create(dma_buf);
if (IS_ERR(iommu_meta)) {
mutex_unlock(&msm_iommu_map_mutex);
ret = PTR_ERR(iommu_meta);
goto out;
}
if (late_unmap) {
kref_get(&iommu_meta->ref);
extra_meta_ref_taken = true;
}
} else {
kref_get(&iommu_meta->ref);
}
mutex_unlock(&msm_iommu_map_mutex);
mutex_lock(&iommu_meta->lock);
iommu_map = msm_iommu_lookup(iommu_meta, dev);
if (!iommu_map) {
iommu_map = kmalloc(sizeof(*iommu_map), GFP_KERNEL);
if (!iommu_map) {
ret = -ENOMEM;
goto out_unlock;
}
ret = dma_map_sg_attrs(dev, sg, nents, dir, attrs);
if (!ret) {
kfree(iommu_map);
goto out_unlock;
}
iommu_map->sgl = clone_sgl(sg, nents);
if (!iommu_map->sgl) {
kfree(iommu_map);
ret = -ENOMEM;
goto out_unlock;
}
iommu_map->nents = nents;
iommu_map->dev = dev;
iommu_map->dir = dir;
iommu_map->attrs = attrs;
iommu_map->buf_start_addr = sg_phys(sg);
kref_init(&iommu_map->ref);
if (late_unmap)
kref_get(&iommu_map->ref);
iommu_map->meta = iommu_meta;
msm_iommu_add(iommu_meta, iommu_map);
} else {
if (nents == iommu_map->nents &&
dir == iommu_map->dir &&
(attrs & ~DMA_ATTR_SKIP_CPU_SYNC) ==
(iommu_map->attrs & ~DMA_ATTR_SKIP_CPU_SYNC) &&
sg_phys(sg) == iommu_map->buf_start_addr) {
struct scatterlist *sg_tmp = sg;
struct scatterlist *map_sg;
int i;
for_each_sg(iommu_map->sgl, map_sg, nents, i) {
sg_dma_address(sg_tmp) = sg_dma_address(map_sg);
sg_dma_len(sg_tmp) = sg_dma_len(map_sg);
if (sg_dma_len(map_sg) == 0)
break;
sg_tmp = sg_next(sg_tmp);
if (sg_tmp == NULL)
break;
}
kref_get(&iommu_map->ref);
if ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0)
dma_sync_sg_for_device(dev, iommu_map->sgl,
iommu_map->nents, iommu_map->dir);
if (dev_is_dma_coherent(dev))
/*
* Ensure all outstanding changes for coherent
* buffers are applied to the cache before any
* DMA occurs.
*/
dmb(ish);
ret = nents;
} else {
bool start_diff = (sg_phys(sg) !=
iommu_map->buf_start_addr);
dev_err(dev, "lazy map request differs:\n"
"req dir:%d, original dir:%d\n"
"req nents:%d, original nents:%d\n"
"req map attrs:%lu, original map attrs:%lu\n"
"req buffer start address differs:%d\n",
dir, iommu_map->dir, nents,
iommu_map->nents, attrs, iommu_map->attrs,
start_diff);
ret = -EINVAL;
}
}
mutex_unlock(&iommu_meta->lock);
return ret;
out_unlock:
mutex_unlock(&iommu_meta->lock);
out:
if (!IS_ERR(iommu_meta)) {
if (extra_meta_ref_taken)
msm_iommu_meta_put(iommu_meta);
msm_iommu_meta_put(iommu_meta);
}
return ret;
}
/*
* We are not taking a reference to the dma_buf here. It is expected that
* clients hold reference to the dma_buf until they are done with mapping and
* unmapping.
*/
int msm_dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, struct dma_buf *dma_buf,
unsigned long attrs)
{
int ret;
if (IS_ERR_OR_NULL(dev)) {
pr_err("%s: dev pointer is invalid\n", __func__);
return -EINVAL;
}
if (IS_ERR_OR_NULL(sg)) {
pr_err("%s: sg table pointer is invalid\n", __func__);
return -EINVAL;
}
if (IS_ERR_OR_NULL(dma_buf)) {
pr_err("%s: dma_buf pointer is invalid\n", __func__);
return -EINVAL;
}
ret = __msm_dma_map_sg(dev, sg, nents, dir, dma_buf, attrs);
return ret;
}
EXPORT_SYMBOL(msm_dma_map_sg_attrs);
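/*
 * Illustrative usage sketch: a typical client pairing of the lazy-mapping
 * entry points in this file. The dma_buf attachment and sg_table are
 * assumed to have been set up by the caller (e.g. via
 * dma_buf_map_attachment()); the "example_" name is hypothetical.
 */
static int example_lazy_map_and_unmap(struct device *dev,
				      struct dma_buf *dmabuf,
				      struct sg_table *sgt)
{
	int nents;

	/* First call per (dev, buffer) maps; later calls reuse the mapping. */
	nents = msm_dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents,
				     DMA_BIDIRECTIONAL, dmabuf, 0);
	if (nents <= 0)
		return nents ? nents : -ENOMEM;

	/* ... program the device with the DMA addresses now in sgt->sgl ... */

	/*
	 * Drops this client's reference. Without DMA_ATTR_NO_DELAYED_UNMAP
	 * the IOMMU mapping itself is kept until msm_dma_buf_freed() runs.
	 */
	msm_dma_unmap_sg_attrs(dev, sgt->sgl, sgt->orig_nents,
			       DMA_BIDIRECTIONAL, dmabuf, 0);
	return 0;
}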
static void msm_iommu_meta_destroy(struct kref *kref)
{
struct msm_iommu_meta *meta = container_of(kref, struct msm_iommu_meta,
ref);
if (!list_empty(&meta->iommu_maps)) {
WARN(1, "%s: DMA Buffer %pK being destroyed with outstanding iommu mappings!\n",
__func__, meta->buffer);
}
rb_erase(&meta->node, &iommu_root);
kfree(meta);
}
static void msm_iommu_meta_put(struct msm_iommu_meta *meta)
{
/*
* Need to lock here to prevent race against map/unmap
*/
mutex_lock(&msm_iommu_map_mutex);
kref_put(&meta->ref, msm_iommu_meta_destroy);
mutex_unlock(&msm_iommu_map_mutex);
}
static void msm_iommu_map_release(struct kref *kref)
{
struct msm_iommu_map *map = container_of(kref, struct msm_iommu_map,
ref);
struct sg_table table;
table.nents = table.orig_nents = map->nents;
table.sgl = map->sgl;
list_del(&map->lnode);
/* Skip an additional cache maintenance on the dma unmap path */
if (!(map->attrs & DMA_ATTR_SKIP_CPU_SYNC))
map->attrs |= DMA_ATTR_SKIP_CPU_SYNC;
dma_unmap_sg_attrs(map->dev, map->sgl, map->nents, map->dir,
map->attrs);
sg_free_table(&table);
kfree(map);
}
void msm_dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
int nents, enum dma_data_direction dir,
struct dma_buf *dma_buf, unsigned long attrs)
{
struct msm_iommu_map *iommu_map;
struct msm_iommu_meta *meta;
mutex_lock(&msm_iommu_map_mutex);
meta = msm_iommu_meta_lookup(dma_buf->priv);
if (!meta) {
WARN(1, "%s: (%pK) was never mapped\n", __func__, dma_buf);
mutex_unlock(&msm_iommu_map_mutex);
goto out;
}
mutex_unlock(&msm_iommu_map_mutex);
mutex_lock(&meta->lock);
iommu_map = msm_iommu_lookup(meta, dev);
if (!iommu_map) {
WARN(1, "%s: (%pK) was never mapped for device %p\n", __func__,
dma_buf, dev);
mutex_unlock(&meta->lock);
goto out;
}
if (dir != iommu_map->dir)
WARN(1, "%s: (%pK) dir:%d differs from original dir:%d\n",
__func__, dma_buf, dir, iommu_map->dir);
if (attrs && ((attrs & DMA_ATTR_SKIP_CPU_SYNC) == 0))
dma_sync_sg_for_cpu(dev, iommu_map->sgl, iommu_map->nents, dir);
iommu_map->attrs = attrs;
kref_put(&iommu_map->ref, msm_iommu_map_release);
mutex_unlock(&meta->lock);
msm_iommu_meta_put(meta);
out:
return;
}
EXPORT_SYMBOL(msm_dma_unmap_sg_attrs);
int msm_dma_unmap_all_for_dev(struct device *dev)
{
int ret = 0;
struct msm_iommu_meta *meta;
struct rb_root *root;
struct rb_node *meta_node;
mutex_lock(&msm_iommu_map_mutex);
root = &iommu_root;
meta_node = rb_first(root);
while (meta_node) {
struct msm_iommu_map *iommu_map;
struct msm_iommu_map *iommu_map_next;
meta = rb_entry(meta_node, struct msm_iommu_meta, node);
mutex_lock(&meta->lock);
list_for_each_entry_safe(iommu_map, iommu_map_next,
&meta->iommu_maps, lnode)
if (iommu_map->dev == dev)
if (!kref_put(&iommu_map->ref,
msm_iommu_map_release))
ret = -EINVAL;
mutex_unlock(&meta->lock);
meta_node = rb_next(meta_node);
}
mutex_unlock(&msm_iommu_map_mutex);
return ret;
}
EXPORT_SYMBOL(msm_dma_unmap_all_for_dev);
/*
* Only to be called by ION code when a buffer is freed
*/
void msm_dma_buf_freed(void *buffer)
{
struct msm_iommu_map *iommu_map;
struct msm_iommu_map *iommu_map_next;
struct msm_iommu_meta *meta;
mutex_lock(&msm_iommu_map_mutex);
meta = msm_iommu_meta_lookup(buffer);
if (!meta) {
/* Already unmapped (assuming no late unmapping) */
mutex_unlock(&msm_iommu_map_mutex);
return;
}
mutex_unlock(&msm_iommu_map_mutex);
mutex_lock(&meta->lock);
list_for_each_entry_safe(iommu_map, iommu_map_next, &meta->iommu_maps,
lnode)
kref_put(&iommu_map->ref, msm_iommu_map_release);
if (!list_empty(&meta->iommu_maps)) {
WARN(1, "%s: DMA buffer %pK destroyed with outstanding iommu mappings\n",
__func__, meta->buffer);
}
INIT_LIST_HEAD(&meta->iommu_maps);
mutex_unlock(&meta->lock);
msm_iommu_meta_put(meta);
}
EXPORT_SYMBOL(msm_dma_buf_freed);
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,815 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2014, 2020-2021, The Linux Foundation. All rights reserved.
* Contiguous Memory Allocator for DMA mapping framework
* Copyright (c) 2010-2011 by Samsung Electronics.
* Written by:
* Marek Szyprowski <m.szyprowski@samsung.com>
* Michal Nazarewicz <mina86@mina86.com>
* Copyright (C) 2012, 2014-2015 ARM Ltd.
* Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/genalloc.h>
#include <linux/dma-direct.h>
#include <linux/cma.h>
#include <linux/iova.h>
#include <linux/dma-map-ops.h>
#include <linux/dma-mapping.h>
#include <linux/qcom-dma-mapping.h>
#include <linux/of_reserved_mem.h>
#include <linux/iommu.h>
#include <linux/qcom-iommu-util.h>
#include <linux/workqueue.h>
#include <linux/debugfs.h>
#include "qcom-dma-iommu-generic.h"
static bool probe_finished;
static struct device *qcom_dma_iommu_dev;
static struct cma *qcom_dma_contiguous_default_area;
struct pci_host_bridge *qcom_pci_find_host_bridge(struct pci_bus *bus)
{
while (bus->parent)
bus = bus->parent;
return to_pci_host_bridge(bus->bridge);
}
/*
* This avoids arch-specific assembly, but may be slower since it calls
* back into the dma layer again.
*/
void qcom_arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
dma_addr_t dma_addr = phys_to_dma(qcom_dma_iommu_dev, paddr);
dma_sync_single_for_device(qcom_dma_iommu_dev,
dma_addr, size, dir);
}
void qcom_arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
enum dma_data_direction dir)
{
dma_addr_t dma_addr = phys_to_dma(qcom_dma_iommu_dev, paddr);
dma_sync_single_for_cpu(qcom_dma_iommu_dev,
dma_addr, size, dir);
}
void qcom_arch_dma_prep_coherent(struct page *page, size_t size)
{
phys_addr_t phys = page_to_phys(page);
dma_addr_t dma_addr = phys_to_dma(qcom_dma_iommu_dev, phys);
dma_sync_single_for_device(qcom_dma_iommu_dev,
dma_addr, size, DMA_TO_DEVICE);
}
static struct cma *qcom_dev_get_cma_area(struct device *dev)
{
if (dev && dev->cma_area)
return dev->cma_area;
return qcom_dma_contiguous_default_area;
}
struct page *qcom_dma_alloc_from_contiguous(struct device *dev, size_t count,
unsigned int align, bool no_warn)
{
if (align > CONFIG_CMA_ALIGNMENT)
align = CONFIG_CMA_ALIGNMENT;
return cma_alloc(qcom_dev_get_cma_area(dev), count, align, no_warn);
}
bool qcom_dma_release_from_contiguous(struct device *dev, struct page *pages,
int count)
{
return cma_release(qcom_dev_get_cma_area(dev), pages, count);
}
static struct page *cma_alloc_aligned(struct cma *cma, size_t size, gfp_t gfp)
{
unsigned int align = min(get_order(size), CONFIG_CMA_ALIGNMENT);
return cma_alloc(cma, size >> PAGE_SHIFT, align, gfp & __GFP_NOWARN);
}
struct page *qcom_dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
{
/* CMA can be used only in the context which permits sleeping */
if (!gfpflags_allow_blocking(gfp))
return NULL;
if (dev->cma_area)
return cma_alloc_aligned(dev->cma_area, size, gfp);
if (size <= PAGE_SIZE || !qcom_dma_contiguous_default_area)
return NULL;
return cma_alloc_aligned(qcom_dma_contiguous_default_area, size, gfp);
}
void qcom_dma_free_contiguous(struct device *dev, struct page *page, size_t size)
{
if (!cma_release(qcom_dev_get_cma_area(dev), page,
PAGE_ALIGN(size) >> PAGE_SHIFT))
__free_pages(page, get_order(size));
}
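/*
 * Illustrative usage sketch: allocating and releasing a physically
 * contiguous buffer with the CMA helpers above. "example_" names are
 * hypothetical; dev is assumed to be a valid, probed device.
 */
static struct page *example_alloc_contig(struct device *dev, size_t size)
{
	/*
	 * May return NULL (non-blocking gfp, or small sizes without a
	 * per-device CMA area); callers then fall back to alloc_pages().
	 */
	return qcom_dma_alloc_contiguous(dev, PAGE_ALIGN(size), GFP_KERNEL);
}

static void example_free_contig(struct device *dev, struct page *page,
				size_t size)
{
	/* Falls back to __free_pages() if the page was not from CMA. */
	qcom_dma_free_contiguous(dev, page, PAGE_ALIGN(size));
}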
/*
* find_vm_area is not exported. Some dma apis expect that an array of
* struct pages can be saved in the vm_area, and retrieved at a later time.
*/
struct rb_root _root;
struct rb_root *root = &_root;
DEFINE_MUTEX(rbtree_lock);
struct qcom_iommu_dma_area {
struct rb_node node;
unsigned long addr;
struct page **pages;
};
static void qcom_insert_vm_area(struct qcom_iommu_dma_area *area)
{
struct rb_node **new, *parent;
mutex_lock(&rbtree_lock);
parent = NULL;
new = &root->rb_node;
while (*new) {
struct qcom_iommu_dma_area *entry;
entry = rb_entry(*new,
struct qcom_iommu_dma_area,
node);
parent = *new;
if (area->addr < entry->addr)
new = &((*new)->rb_left);
else if (area->addr > entry->addr)
new = &((*new)->rb_right);
else {
mutex_unlock(&rbtree_lock);
WARN_ON(1);
return;
}
}
rb_link_node(&area->node, parent, new);
rb_insert_color(&area->node, root);
mutex_unlock(&rbtree_lock);
}
static struct qcom_iommu_dma_area *qcom_find_vm_area(const void *cpu_addr)
{
struct rb_node *node;
struct qcom_iommu_dma_area *entry;
unsigned long addr = (unsigned long)cpu_addr;
mutex_lock(&rbtree_lock);
node = root->rb_node;
while (node) {
entry = rb_entry(node,
struct qcom_iommu_dma_area,
node);
if (addr < entry->addr)
node = node->rb_left;
else if (addr > entry->addr)
node = node->rb_right;
else {
mutex_unlock(&rbtree_lock);
return entry;
}
}
mutex_unlock(&rbtree_lock);
return NULL;
}
struct page **qcom_dma_common_find_pages(void *cpu_addr)
{
struct qcom_iommu_dma_area *area = qcom_find_vm_area(cpu_addr);
if (!area)
return NULL;
return area->pages;
}
/*
* Remaps an array of PAGE_SIZE pages into another vm_area.
* Cannot be used in non-sleeping contexts
*/
void *qcom_dma_common_pages_remap(struct page **pages, size_t size,
pgprot_t prot, const void *caller)
{
struct qcom_iommu_dma_area *area;
void *vaddr;
area = kzalloc(sizeof(*area), GFP_KERNEL);
if (!area)
return NULL;
vaddr = vmap(pages, PAGE_ALIGN(size) >> PAGE_SHIFT,
VM_DMA_COHERENT, prot);
if (!vaddr) {
kfree(area);
return NULL;
}
area->pages = pages;
area->addr = (unsigned long)vaddr;
qcom_insert_vm_area(area);
return vaddr;
}
/*
* Remaps an allocated contiguous region into another vm_area.
* Cannot be used in non-sleeping contexts
*/
void *qcom_dma_common_contiguous_remap(struct page *page, size_t size,
pgprot_t prot, const void *caller)
{
int count = PAGE_ALIGN(size) >> PAGE_SHIFT;
struct page **pages;
void *vaddr;
int i;
pages = kmalloc_array(count, sizeof(struct page *), GFP_KERNEL);
if (!pages)
return NULL;
for (i = 0; i < count; i++)
pages[i] = nth_page(page, i);
vaddr = vmap(pages, count, VM_DMA_COHERENT, prot);
kfree(pages);
return vaddr;
}
/*
* Unmaps a range previously mapped by dma_common_contiguous_remap or
* dma_common_pages_remap. Note that dma_common_contiguous_remap does
* not insert an rb_tree entry since there is no pages array to save.
*/
void qcom_dma_common_free_remap(void *cpu_addr, size_t size)
{
struct qcom_iommu_dma_area *area;
/* qcom_dma_common_contiguous_remap doesn't save the pages array */
area = qcom_find_vm_area(cpu_addr);
if (area) {
mutex_lock(&rbtree_lock);
rb_erase(&area->node, root);
mutex_unlock(&rbtree_lock);
kfree(area);
}
vunmap(cpu_addr);
}
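/*
 * Illustrative usage sketch: remapping a caller-owned array of PAGE_SIZE
 * pages and tearing the mapping down again with the helpers above. The
 * pages array is assumed to hold nr_pages valid pages; "example_" names
 * are hypothetical.
 */
static void *example_remap_pages(struct page **pages, unsigned long nr_pages)
{
	return qcom_dma_common_pages_remap(pages, nr_pages << PAGE_SHIFT,
					   PAGE_KERNEL,
					   __builtin_return_address(0));
}

static void example_unmap_pages(void *vaddr, unsigned long nr_pages)
{
	/* The pages array stashed at remap time stays retrievable until here. */
	WARN_ON(!qcom_dma_common_find_pages(vaddr));
	qcom_dma_common_free_remap(vaddr, nr_pages << PAGE_SHIFT);
}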
static struct gen_pool *atomic_pool __ro_after_init;
static size_t atomic_pool_size;
static unsigned long current_pool_size;
/* Dynamic background expansion when the atomic pool is near capacity */
static struct work_struct atomic_pool_work;
static void dma_atomic_pool_debugfs_init(void)
{
struct dentry *root;
root = debugfs_create_dir("qcom_dma_pools", NULL);
if (IS_ERR_OR_NULL(root))
return;
debugfs_create_ulong("pool_size", 0400, root, &current_pool_size);
}
static void dma_atomic_pool_size_add(gfp_t gfp, size_t size)
{
current_pool_size += size;
}
static int atomic_pool_expand(struct gen_pool *pool, size_t pool_size,
gfp_t gfp)
{
unsigned int order;
struct page *page = NULL;
void *addr;
int ret = -ENOMEM;
/* Cannot allocate larger than MAX_ORDER - 1 */
order = min(get_order(pool_size), MAX_ORDER - 1);
do {
pool_size = 1 << (PAGE_SHIFT + order);
if (qcom_dev_get_cma_area(NULL))
page = qcom_dma_alloc_from_contiguous(NULL, 1 << order,
order, false);
else
page = alloc_pages(gfp, order);
} while (!page && order-- > 0);
if (!page)
goto out;
qcom_arch_dma_prep_coherent(page, pool_size);
addr = qcom_dma_common_contiguous_remap(page, pool_size,
pgprot_dmacoherent(PAGE_KERNEL),
__builtin_return_address(0));
if (!addr)
goto free_page;
ret = gen_pool_add_virt(pool, (unsigned long)addr, page_to_phys(page),
pool_size, NUMA_NO_NODE);
if (ret)
goto remove_mapping;
dma_atomic_pool_size_add(gfp, pool_size);
return 0;
remove_mapping:
qcom_dma_common_free_remap(addr, pool_size);
free_page:
if (!qcom_dma_release_from_contiguous(NULL, page, 1 << order))
__free_pages(page, order);
out:
return ret;
}
static void atomic_pool_resize(struct gen_pool *pool, gfp_t gfp)
{
if (pool && gen_pool_avail(pool) < atomic_pool_size)
atomic_pool_expand(pool, gen_pool_size(pool), gfp);
}
static void atomic_pool_work_fn(struct work_struct *work)
{
atomic_pool_resize(atomic_pool, GFP_KERNEL);
}
static struct gen_pool *__dma_atomic_pool_init(size_t pool_size, gfp_t gfp)
{
struct gen_pool *pool;
int ret;
pool = gen_pool_create(PAGE_SHIFT, NUMA_NO_NODE);
if (!pool)
return NULL;
gen_pool_set_algo(pool, gen_pool_first_fit_order_align, NULL);
ret = atomic_pool_expand(pool, pool_size, gfp);
if (ret) {
gen_pool_destroy(pool);
pr_err("DMA: failed to allocate %zu KiB %pGg pool for atomic allocation\n",
pool_size >> 10, &gfp);
return NULL;
}
pr_info("DMA preallocated %zu KiB %pGg pool for atomic allocations\n",
gen_pool_size(pool) >> 10, &gfp);
return pool;
}
static int dma_atomic_pool_init(struct device *dev)
{
int ret = 0;
unsigned long pages;
/* Default the pool size to 128KB per 1 GB of memory, min 128 KB, max MAX_ORDER - 1. */
pages = totalram_pages() / (SZ_1G / SZ_128K);
pages = min_t(unsigned long, pages, MAX_ORDER_NR_PAGES);
atomic_pool_size = max_t(size_t, pages << PAGE_SHIFT, SZ_128K);
INIT_WORK(&atomic_pool_work, atomic_pool_work_fn);
atomic_pool = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL);
if (!atomic_pool)
return -ENOMEM;
dma_atomic_pool_debugfs_init();
return ret;
}
/*
* Couldn't implement this via dma_alloc_attrs(qcom_iommu_dma_dev, GFP_ATOMIC)
* due to dma_free_from_pool only passing in cpu_addr & not dma_handle.
*/
void *qcom_dma_alloc_from_pool(struct device *dev, size_t size,
struct page **ret_page, gfp_t flags)
{
unsigned long val;
void *ptr = NULL;
if (!atomic_pool) {
WARN(1, "coherent pool not initialised!\n");
return NULL;
}
val = gen_pool_alloc(atomic_pool, size);
if (val) {
phys_addr_t phys = gen_pool_virt_to_phys(atomic_pool, val);
*ret_page = pfn_to_page(__phys_to_pfn(phys));
ptr = (void *)val;
memset(ptr, 0, size);
}
if (gen_pool_avail(atomic_pool) < atomic_pool_size)
schedule_work(&atomic_pool_work);
return ptr;
}
bool qcom_dma_free_from_pool(struct device *dev, void *start, size_t size)
{
if (!atomic_pool || !gen_pool_has_addr(atomic_pool, (unsigned long)start, size))
return false;
gen_pool_free(atomic_pool, (unsigned long)start, size);
return true;
}
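/*
 * Illustrative usage sketch: an atomic-context allocation built on the pool
 * helpers above. The caller is assumed to map the returned page into its
 * own IOMMU domain; "example_" names are hypothetical.
 */
static void *example_atomic_alloc(struct device *dev, size_t size,
				  struct page **ret_page)
{
	/* Never sleeps; schedules background pool expansion when running low. */
	return qcom_dma_alloc_from_pool(dev, PAGE_ALIGN(size), ret_page,
					GFP_ATOMIC);
}

static void example_atomic_free(struct device *dev, void *vaddr, size_t size)
{
	if (!qcom_dma_free_from_pool(dev, vaddr, PAGE_ALIGN(size)))
		WARN_ONCE(1, "%p was not allocated from the atomic pool\n",
			  vaddr);
}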
static void qcom_dma_atomic_pool_exit(struct device *dev)
{
unsigned long nr_pages = atomic_pool_size >> PAGE_SHIFT;
void *addr;
struct page *page;
/*
* Find the starting address. The pool is expected to be unused.
*
* Using the initial size here is safe even though the pool can grow: it
* only grows when an allocation is satisfied from it, and this function is
* only called before the pool has ever been used, so its size is still the
* initial size.
*/
addr = (void *)gen_pool_alloc(atomic_pool, atomic_pool_size);
if (!addr) {
WARN_ON(1);
return;
}
gen_pool_free(atomic_pool, (unsigned long)addr, atomic_pool_size);
gen_pool_destroy(atomic_pool);
page = vmalloc_to_page(addr);
qcom_dma_common_free_remap(addr, atomic_pool_size);
qcom_dma_release_from_contiguous(dev, page, nr_pages);
}
/*
* struct dma_coherent_mem is private, so we can't access it. 0 indicates
* an error condition for dma_mmap_from_dev_coherent.
*/
int qcom_dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
void *vaddr, size_t size, int *ret)
{
return 0;
}
/*
* Return the page attributes used for mapping dma_alloc_* memory, either in
* kernel space if remapping is needed, or to userspace through dma_mmap_*.
*/
pgprot_t qcom_dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
if (dev_is_dma_coherent(dev))
return prot;
#ifdef CONFIG_ARCH_HAS_DMA_WRITE_COMBINE
if (attrs & DMA_ATTR_WRITE_COMBINE)
return pgprot_writecombine(prot);
#endif
return pgprot_dmacoherent(prot);
}
/**
* dma_info_to_prot - Translate DMA API directions and attributes to IOMMU API
* page flags.
* @dir: Direction of DMA transfer
* @coherent: Is the DMA master cache-coherent?
* @attrs: DMA attributes for the mapping
*
* Return: corresponding IOMMU API page protection flags
*/
int qcom_dma_info_to_prot(enum dma_data_direction dir, bool coherent,
unsigned long attrs)
{
int prot = coherent ? IOMMU_CACHE : 0;
if (attrs & DMA_ATTR_PRIVILEGED)
prot |= IOMMU_PRIV;
if (attrs & DMA_ATTR_SYS_CACHE)
prot |= IOMMU_SYS_CACHE;
if (attrs & DMA_ATTR_SYS_CACHE_NWA)
prot |= IOMMU_SYS_CACHE_NWA;
switch (dir) {
case DMA_BIDIRECTIONAL:
return prot | IOMMU_READ | IOMMU_WRITE;
case DMA_TO_DEVICE:
return prot | IOMMU_READ;
case DMA_FROM_DEVICE:
return prot | IOMMU_WRITE;
default:
return 0;
}
}
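/*
 * Illustrative sketch: what the translation above yields for a common case.
 * A cache-coherent master doing device-read-only DMA with a privileged
 * mapping gets read, cacheable and privileged IOMMU permissions; the
 * "example_" name is hypothetical.
 */
static void example_check_prot_translation(void)
{
	int prot = qcom_dma_info_to_prot(DMA_TO_DEVICE, true,
					 DMA_ATTR_PRIVILEGED);

	/* DMA_TO_DEVICE is read-only from the device's point of view. */
	WARN_ON(prot != (IOMMU_READ | IOMMU_CACHE | IOMMU_PRIV));
}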
/*
* The DMA API client is passing in a scatterlist which could describe
* any old buffer layout, but the IOMMU API requires everything to be
* aligned to IOMMU pages. Hence the need for this complicated bit of
* impedance-matching, to be able to hand off a suitably-aligned list,
* but still preserve the original offsets and sizes for the caller.
*/
size_t qcom_iommu_dma_prepare_map_sg(struct device *dev, struct iova_domain *iovad,
struct scatterlist *sg, int nents)
{
struct scatterlist *s, *prev = NULL;
size_t iova_len = 0;
unsigned long mask = dma_get_seg_boundary(dev);
int i;
/*
* Work out how much IOVA space we need, and align the segments to
* IOVA granules for the IOMMU driver to handle. With some clever
* trickery we can modify the list in-place, but reversibly, by
* stashing the unaligned parts in the as-yet-unused DMA fields.
*/
for_each_sg(sg, s, nents, i) {
size_t s_iova_off = iova_offset(iovad, s->offset);
size_t s_length = s->length;
size_t pad_len = (mask - iova_len + 1) & mask;
sg_dma_address(s) = s_iova_off;
sg_dma_len(s) = s_length;
s->offset -= s_iova_off;
s_length = iova_align(iovad, s_length + s_iova_off);
s->length = s_length;
/*
* Due to the alignment of our single IOVA allocation, we can
* depend on these assumptions about the segment boundary mask:
* - If mask size >= IOVA size, then the IOVA range cannot
* possibly fall across a boundary, so we don't care.
* - If mask size < IOVA size, then the IOVA range must start
* exactly on a boundary, therefore we can lay things out
* based purely on segment lengths without needing to know
* the actual addresses beforehand.
* - The mask must be a power of 2, so pad_len == 0 if
* iova_len == 0, thus we cannot dereference prev the first
* time through here (i.e. before it has a meaningful value).
*/
if (pad_len && pad_len < s_length - 1) {
prev->length += pad_len;
iova_len += pad_len;
}
iova_len += s_length;
prev = s;
}
return iova_len;
}
/*
* Prepare a successfully-mapped scatterlist to give back to the caller.
*
* At this point the segments are already laid out by iommu_dma_map_sg() to
* avoid individually crossing any boundaries, so we merely need to check a
* segment's start address to avoid concatenating across one.
*/
int qcom_iommu_dma_finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
dma_addr_t dma_addr)
{
struct scatterlist *s, *cur = sg;
unsigned long seg_mask = dma_get_seg_boundary(dev);
unsigned int cur_len = 0, max_len = dma_get_max_seg_size(dev);
int i, count = 0;
for_each_sg(sg, s, nents, i) {
/* Restore this segment's original unaligned fields first */
unsigned int s_iova_off = sg_dma_address(s);
unsigned int s_length = sg_dma_len(s);
unsigned int s_iova_len = s->length;
s->offset += s_iova_off;
s->length = s_length;
sg_dma_address(s) = DMA_MAPPING_ERROR;
sg_dma_len(s) = 0;
/*
* Now fill in the real DMA data. If...
* - there is a valid output segment to append to
* - and this segment starts on an IOVA page boundary
* - but doesn't fall at a segment boundary
* - and wouldn't make the resulting output segment too long
*/
if (cur_len && !s_iova_off && (dma_addr & seg_mask) &&
(max_len - cur_len >= s_length)) {
/* ...then concatenate it with the previous one */
cur_len += s_length;
} else {
/* Otherwise start the next output segment */
if (i > 0)
cur = sg_next(cur);
cur_len = s_length;
count++;
sg_dma_address(cur) = dma_addr + s_iova_off;
}
sg_dma_len(cur) = cur_len;
dma_addr += s_iova_len;
if (s_length + s_iova_off < s_iova_len)
cur_len = 0;
}
return count;
}
/*
* If mapping failed, then just restore the original list,
* but making sure the DMA fields are invalidated.
*/
void qcom_iommu_dma_invalidate_sg(struct scatterlist *sg, int nents)
{
struct scatterlist *s;
int i;
for_each_sg(sg, s, nents, i) {
if (sg_dma_address(s) != DMA_MAPPING_ERROR)
s->offset += sg_dma_address(s);
if (sg_dma_len(s))
s->length = sg_dma_len(s);
sg_dma_address(s) = DMA_MAPPING_ERROR;
sg_dma_len(s) = 0;
}
}
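/*
 * Illustrative sketch: the calling sequence the three scatterlist helpers
 * above are designed for. The IOVA and IOMMU domains are assumed to be
 * owned by the caller, iommu_map_sg() is assumed to take a gfp_t as on
 * recent kernels, and "example_map_sg" is hypothetical.
 */
static dma_addr_t example_map_sg(struct device *dev,
				 struct iommu_domain *domain,
				 struct iova_domain *iovad,
				 struct scatterlist *sg, int nents, int prot)
{
	unsigned long shift = iova_shift(iovad);
	unsigned long pfn;
	size_t iova_len;
	dma_addr_t iova;
	ssize_t mapped;

	/* Align segments to IOVA granules and compute the total IOVA span. */
	iova_len = qcom_iommu_dma_prepare_map_sg(dev, iovad, sg, nents);

	pfn = alloc_iova_fast(iovad, iova_len >> shift,
			      dma_get_mask(dev) >> shift, true);
	if (!pfn)
		goto restore;
	iova = (dma_addr_t)pfn << shift;

	mapped = iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC);
	if (mapped < (ssize_t)iova_len) {
		free_iova_fast(iovad, pfn, iova_len >> shift);
		goto restore;
	}

	/* Rebuild dma_address/dma_length; the return value is the DMA nents. */
	qcom_iommu_dma_finalise_sg(dev, sg, nents, iova);
	return iova;

restore:
	/* Put the scatterlist back exactly as the caller handed it over. */
	qcom_iommu_dma_invalidate_sg(sg, nents);
	return DMA_MAPPING_ERROR;
}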
/**
* __iommu_dma_mmap - Map a buffer into provided user VMA
* @pages: Array representing buffer from __iommu_dma_alloc()
* @size: Size of buffer in bytes
* @vma: VMA describing requested userspace mapping
*
* Maps the pages of the buffer in @pages into @vma. The caller is responsible
* for verifying the correct size and protection of @vma beforehand.
*/
static int __qcom_iommu_dma_mmap(struct page **pages, size_t size,
struct vm_area_struct *vma)
{
return vm_map_pages(vma, pages, PAGE_ALIGN(size) >> PAGE_SHIFT);
}
int qcom_iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT;
unsigned long pfn, off = vma->vm_pgoff;
int ret;
vma->vm_page_prot = qcom_dma_pgprot(dev, vma->vm_page_prot, attrs);
if (qcom_dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;
if (off >= nr_pages || vma_pages(vma) > nr_pages - off)
return -ENXIO;
if (is_vmalloc_addr(cpu_addr)) {
struct page **pages = qcom_dma_common_find_pages(cpu_addr);
if (pages)
return __qcom_iommu_dma_mmap(pages, size, vma);
pfn = vmalloc_to_pfn(cpu_addr);
} else {
pfn = page_to_pfn(virt_to_page(cpu_addr));
}
return remap_pfn_range(vma, vma->vm_start, pfn + off,
vma->vm_end - vma->vm_start,
vma->vm_page_prot);
}
int qcom_iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs)
{
struct page *page;
int ret;
if (is_vmalloc_addr(cpu_addr)) {
struct page **pages = qcom_dma_common_find_pages(cpu_addr);
if (pages) {
return sg_alloc_table_from_pages(sgt, pages,
PAGE_ALIGN(size) >> PAGE_SHIFT,
0, size, GFP_KERNEL);
}
page = vmalloc_to_page(cpu_addr);
} else {
page = virt_to_page(cpu_addr);
}
ret = sg_alloc_table(sgt, 1, GFP_KERNEL);
if (!ret)
sg_set_page(sgt->sgl, page, PAGE_ALIGN(size), 0);
return ret;
}
static int qcom_dma_iommu_probe(struct platform_device *pdev)
{
int ret;
struct device *dev = &pdev->dev;
qcom_dma_iommu_dev = dev;
if (dev_is_dma_coherent(dev)) {
dev_err(dev, "Cannot be dma-coherent\n");
return -EINVAL;
}
/* Should be connected to linux,cma-default node */
ret = of_reserved_mem_device_init_by_idx(dev, dev->of_node, 0);
if (ret)
return ret;
qcom_dma_contiguous_default_area = dev->cma_area;
if (!qcom_dma_contiguous_default_area) {
dev_err(dev, "Unable to find cma area\n");
return -EINVAL;
}
ret = dma_atomic_pool_init(dev);
if (ret)
goto out_iova_cache;
ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
if (ret)
goto out_atomic_pool;
probe_finished = true;
return 0;
out_atomic_pool:
qcom_dma_atomic_pool_exit(dev);
out_iova_cache:
return ret;
}
bool qcom_dma_iommu_is_ready(void)
{
if (!probe_finished)
return false;
return true;
}
EXPORT_SYMBOL(qcom_dma_iommu_is_ready);
static int qcom_dma_iommu_remove(struct platform_device *pdev)
{
qcom_dma_atomic_pool_exit(&pdev->dev);
return 0;
}
static const struct of_device_id qcom_dma_iommu_of_match[] = {
{.compatible = "qcom,iommu-dma"},
{}
};
MODULE_DEVICE_TABLE(of, qcom_dma_iommu_of_match);
static struct platform_driver qcom_dma_iommu_driver = {
.probe = qcom_dma_iommu_probe,
.remove = qcom_dma_iommu_remove,
.driver = {
.name = "qcom_dma_iommu",
.of_match_table = qcom_dma_iommu_of_match,
.suppress_bind_attrs = true,
},
};
int __init qcom_dma_iommu_generic_driver_init(void)
{
return platform_driver_register(&qcom_dma_iommu_driver);
}
void qcom_dma_iommu_generic_driver_exit(void)
{
platform_driver_unregister(&qcom_dma_iommu_driver);
}

View File

@@ -0,0 +1,86 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2020-2021, The Linux Foundation. All rights reserved.
* Copyright (c) 2023 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#ifndef __QCOM_DMA_IOMMU_GENERIC_H
#define __QCOM_DMA_IOMMU_GENERIC_H
#include <linux/device.h>
#include <linux/dma-direction.h>
#include <linux/pci.h>
#ifdef CONFIG_IOMMU_IO_PGTABLE_FAST
bool qcom_dma_iommu_is_ready(void);
extern int __init qcom_dma_iommu_generic_driver_init(void);
extern void qcom_dma_iommu_generic_driver_exit(void);
struct pci_host_bridge *qcom_pci_find_host_bridge(struct pci_bus *bus);
void qcom_arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
enum dma_data_direction dir);
void qcom_arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
enum dma_data_direction dir);
void qcom_arch_dma_prep_coherent(struct page *page, size_t size);
/* kernel/dma/contiguous.c */
struct page *qcom_dma_alloc_from_contiguous(struct device *dev, size_t count,
unsigned int align, bool no_warn);
bool qcom_dma_release_from_contiguous(struct device *dev, struct page *pages,
int count);
struct page *qcom_dma_alloc_contiguous(struct device *dev, size_t size,
gfp_t gfp);
void qcom_dma_free_contiguous(struct device *dev, struct page *page,
size_t size);
/* kernel/dma/remap.c */
struct page **qcom_dma_common_find_pages(void *cpu_addr);
void *qcom_dma_common_pages_remap(struct page **pages, size_t size,
pgprot_t prot, const void *caller);
void *qcom_dma_common_contiguous_remap(struct page *page, size_t size,
pgprot_t prot, const void *caller);
void qcom_dma_common_free_remap(void *cpu_addr, size_t size);
void *qcom_dma_alloc_from_pool(struct device *dev, size_t size,
struct page **ret_page, gfp_t flags);
bool qcom_dma_free_from_pool(struct device *dev, void *start, size_t size);
int qcom_dma_mmap_from_dev_coherent(struct device *dev,
struct vm_area_struct *vma, void *vaddr, size_t size, int *ret);
/* kernel/dma/mapping.c */
pgprot_t qcom_dma_pgprot(struct device *dev, pgprot_t prot,
unsigned long attrs);
/* DMA-IOMMU utilities */
int qcom_dma_info_to_prot(enum dma_data_direction dir, bool coherent,
unsigned long attrs);
size_t qcom_iommu_dma_prepare_map_sg(struct device *dev, struct iova_domain *iovad,
struct scatterlist *sg, int nents);
int qcom_iommu_dma_finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
dma_addr_t dma_addr);
void qcom_iommu_dma_invalidate_sg(struct scatterlist *sg, int nents);
int qcom_iommu_dma_mmap(struct device *dev, struct vm_area_struct *vma,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs);
int qcom_iommu_dma_get_sgtable(struct device *dev, struct sg_table *sgt,
void *cpu_addr, dma_addr_t dma_addr, size_t size,
unsigned long attrs);
#else /*CONFIG_IOMMU_IO_PGTABLE_FAST*/
static inline bool qcom_dma_iommu_is_ready(void)
{
return true;
}
static inline int __init qcom_dma_iommu_generic_driver_init(void)
{
return 0;
}
static inline void qcom_dma_iommu_generic_driver_exit(void) {}
#endif /*CONFIG_IOMMU_IO_PGTABLE_FAST*/
#endif /* __QCOM_DMA_IOMMU_GENERIC_H */

View File

@@ -0,0 +1,323 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2021 Qualcomm Innovation Center, Inc. All rights reserved.
* Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#include <linux/module.h>
#include <linux/shrinker.h>
#include <linux/slab.h>
#include <linux/firmware/qcom/qcom_scm.h>
#include <soc/qcom/secure_buffer.h>
struct io_pgtable_pool {
u32 vmid;
struct kref ref;
spinlock_t pool_lock;
struct list_head page_pool;
};
static DEFINE_MUTEX(page_pool_xa_lock);
static DEFINE_XARRAY(page_pool_xa);
static atomic_long_t page_pool_count = ATOMIC_LONG_INIT(0);
static bool is_secure_vmid(u32 vmid)
{
return !!vmid;
}
static int io_pgtable_hyp_assign_page(u32 vmid, struct page *page)
{
struct qcom_scm_vmperm dst_vmids[] = {{QCOM_SCM_VMID_HLOS,
PERM_READ | PERM_WRITE},
{vmid, PERM_READ}};
u64 src_vmid_list = BIT(QCOM_SCM_VMID_HLOS);
phys_addr_t page_addr = page_to_phys(page);
int ret;
ret = qcom_scm_assign_mem(page_to_phys(page), PAGE_SIZE, &src_vmid_list,
dst_vmids, ARRAY_SIZE(dst_vmids));
if (ret)
pr_err("failed qcom_assign for %pa address of size %zx - subsys VMid %d rc:%d\n",
&page_addr, PAGE_SIZE, vmid, ret);
WARN(ret, "failed to assign memory to VMID: %u rc:%d\n", vmid, ret);
return ret ? -EADDRNOTAVAIL : 0;
}
static int io_pgtable_hyp_unassign_page(u32 vmid, struct page *page)
{
struct qcom_scm_vmperm dst_vmids[] = {{QCOM_SCM_VMID_HLOS,
PERM_READ | PERM_WRITE | PERM_EXEC}};
u64 src_vmid_list = BIT(QCOM_SCM_VMID_HLOS) | BIT(vmid);
phys_addr_t page_addr = page_to_phys(page);
int ret;
ret = qcom_scm_assign_mem(page_to_phys(page), PAGE_SIZE, &src_vmid_list,
dst_vmids, ARRAY_SIZE(dst_vmids));
if (ret)
pr_err("failed qcom_assign for unassigning %pa address of size %zx - subsys VMid %d rc:%d\n",
&page_addr, PAGE_SIZE, vmid, ret);
WARN(ret, "failed to unassign memory from VMID: %u rc: %d\n", vmid, ret);
return ret ? -EADDRNOTAVAIL : 0;
}
static struct page *__alloc_page_from_pool(struct list_head *page_pool)
{
struct page *page;
page = list_first_entry_or_null(page_pool, struct page, lru);
if (page) {
list_del(&page->lru);
atomic_long_dec(&page_pool_count);
dec_node_page_state(page, NR_KERNEL_MISC_RECLAIMABLE);
}
return page;
}
static struct page *alloc_page_from_pool(u32 vmid)
{
struct io_pgtable_pool *pool = xa_load(&page_pool_xa, vmid);
struct page *page;
unsigned long flags;
spin_lock_irqsave(&pool->pool_lock, flags);
page = __alloc_page_from_pool(&pool->page_pool);
spin_unlock_irqrestore(&pool->pool_lock, flags);
return page;
}
static void free_page_to_pool(struct page *page)
{
u32 vmid = page_private(page);
struct io_pgtable_pool *pool = xa_load(&page_pool_xa, vmid);
unsigned long flags;
clear_page(page_address(page));
spin_lock_irqsave(&pool->pool_lock, flags);
list_add(&page->lru, &pool->page_pool);
atomic_long_inc(&page_pool_count);
inc_node_page_state(page, NR_KERNEL_MISC_RECLAIMABLE);
spin_unlock_irqrestore(&pool->pool_lock, flags);
}
/* Assumes that page_pool_xa_lock is held. */
static void io_pgtable_pool_release(struct kref *ref)
{
struct io_pgtable_pool *pool = container_of(ref, struct io_pgtable_pool, ref);
struct page *page;
bool secure_vmid = is_secure_vmid(pool->vmid);
xa_erase(&page_pool_xa, pool->vmid);
/*
* There's no need to take the pool lock, as the pool is no longer accessible to other
* IOMMU clients. There's no possibility for concurrent access either as this
* function is only invoked when the last reference is removed.
*/
page = __alloc_page_from_pool(&pool->page_pool);
while (page) {
if (!secure_vmid || !io_pgtable_hyp_unassign_page(pool->vmid, page))
__free_page(page);
page = __alloc_page_from_pool(&pool->page_pool);
}
kfree(pool);
}
/*
* qcom_io_pgtable_allocator_register: Register with the io-pgtable allocator interface.
*
* @vmid: The VMID that io-pgtable memory needs to be shared with when allocated. If VMID
* is 0, then page table memory will not be shared with any other VMs.
*
* On success, 0 is returned and there will be a reference held for metadata associated with
* @vmid. Otherwise, an error code will be returned.
*/
int qcom_io_pgtable_allocator_register(u32 vmid)
{
struct io_pgtable_pool *pool;
int ret = 0;
mutex_lock(&page_pool_xa_lock);
pool = xa_load(&page_pool_xa, vmid);
if (pool) {
kref_get(&pool->ref);
goto out;
}
pool = kmalloc(sizeof(*pool), GFP_KERNEL);
if (!pool) {
ret = -ENOMEM;
goto out;
}
pool->vmid = vmid;
kref_init(&pool->ref);
spin_lock_init(&pool->pool_lock);
INIT_LIST_HEAD(&pool->page_pool);
ret = xa_err(xa_store(&page_pool_xa, vmid, pool, GFP_KERNEL));
if (ret < 0)
kfree(pool);
out:
mutex_unlock(&page_pool_xa_lock);
return ret;
}
/*
* qcom_io_pgtable_allocator_unregister: Unregister with the io-pgtable allocator interface.
*
* @vmid: The VMID that was used when registering with the interface with
* qcom_io_pgtable_allocator_register().
*
* Decrements the references to allocator metadata for @vmid.
*
* If this call results in references to @vmid dropping to 0, then all metadata and pages
* associated with @vmid are released.
*/
void qcom_io_pgtable_allocator_unregister(u32 vmid)
{
struct io_pgtable_pool *pool;
mutex_lock(&page_pool_xa_lock);
pool = xa_load(&page_pool_xa, vmid);
kref_put(&pool->ref, io_pgtable_pool_release);
mutex_unlock(&page_pool_xa_lock);
}
/*
* qcom_io_pgtable_alloc_page: Allocate page table memory from the io-pgtable allocator.
*
* @vmid: The VMID that the page table memory should be shared with.
* @gfp: The GFP flags to be used for allocating the page table memory.
*
* This function may sleep if memory needs to be shared with other VMs.
*
* On success, a page will be returned. The page will also have been shared with other
* VMs--if any. In case of an error, this function returns NULL.
*/
struct page *qcom_io_pgtable_alloc_page(u32 vmid, gfp_t gfp)
{
struct page *page;
/*
* Mapping memory for secure domains may result in having to assign page table
* memory to another VMID, which can sleep. Atomic and secure domains are
* not a legal combination. We can use the GFP flags to detect atomic domains,
* as they will have GFP_ATOMIC set.
*/
BUG_ON(!gfpflags_allow_blocking(gfp) && is_secure_vmid(vmid));
page = alloc_page_from_pool(vmid);
if (page)
return page;
page = alloc_page(gfp);
if (!page)
return NULL;
/* The page may be inaccessible if this is true, so leak it. */
else if (is_secure_vmid(vmid) && io_pgtable_hyp_assign_page(vmid, page))
return NULL;
set_page_private(page, (unsigned long)vmid);
return page;
}
/*
* qcom_io_pgtable_free_page: Frees page table memory.
*
* @page: The page to be freed.
*
* We cache pages in their respective page pools to improve performance
* for future allocations.
*
* Export this symbol for the IOMMU driver, since it decides when
* page table memory is freed after TLB maintenance.
*/
void qcom_io_pgtable_free_page(struct page *page)
{
free_page_to_pool(page);
}
EXPORT_SYMBOL(qcom_io_pgtable_free_page);
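/*
 * Illustrative usage sketch: the register/alloc/free/unregister cycle the
 * allocator above expects, using the non-secure VMID 0 so no hyp-assign is
 * involved. The "example_" name is hypothetical.
 */
static int example_pgtable_page_cycle(void)
{
	struct page *page;
	int ret;

	ret = qcom_io_pgtable_allocator_register(0);
	if (ret)
		return ret;

	page = qcom_io_pgtable_alloc_page(0, GFP_KERNEL | __GFP_ZERO);
	if (page) {
		/* ... install and use the table, then tear it down ... */
		qcom_io_pgtable_free_page(page);	/* cached in the pool */
	}

	/* Drops the pool reference; cached pages are released on the last put. */
	qcom_io_pgtable_allocator_unregister(0);
	return page ? 0 : -ENOMEM;
}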
static unsigned long io_pgtable_alloc_count_objects(struct shrinker *shrinker,
struct shrink_control *sc)
{
unsigned long count = atomic_long_read(&page_pool_count);
return count ? count : SHRINK_EMPTY;
}
static unsigned long scan_page_pool(struct io_pgtable_pool *pool, struct list_head *freelist,
unsigned long nr_to_scan)
{
struct page *page;
unsigned long count = 0, flags;
spin_lock_irqsave(&pool->pool_lock, flags);
while (count < nr_to_scan) {
page = __alloc_page_from_pool(&pool->page_pool);
if (page) {
list_add(&page->lru, freelist);
count++;
} else {
break;
}
}
spin_unlock_irqrestore(&pool->pool_lock, flags);
return count;
}
static unsigned long io_pgtable_alloc_scan_objects(struct shrinker *shrinker,
struct shrink_control *sc)
{
struct page *page, *tmp;
struct io_pgtable_pool *pool;
unsigned long index;
unsigned long nr_to_scan = sc->nr_to_scan, count = 0;
u32 vmid;
LIST_HEAD(freelist);
mutex_lock(&page_pool_xa_lock);
xa_for_each(&page_pool_xa, index, pool) {
count += scan_page_pool(pool, &freelist, nr_to_scan - count);
if (count >= nr_to_scan)
break;
}
mutex_unlock(&page_pool_xa_lock);
list_for_each_entry_safe(page, tmp, &freelist, lru) {
vmid = page_private(page);
list_del(&page->lru);
if (!is_secure_vmid(vmid) || !io_pgtable_hyp_unassign_page(vmid, page))
__free_page(page);
else
count--;
}
return count;
}
static struct shrinker io_pgtable_alloc_shrinker = {
.count_objects = io_pgtable_alloc_count_objects,
.scan_objects = io_pgtable_alloc_scan_objects,
.seeks = DEFAULT_SEEKS,
};
int qcom_io_pgtable_alloc_init(void)
{
return register_shrinker(&io_pgtable_alloc_shrinker, "io_pgtable_alloc");
}
void qcom_io_pgtable_alloc_exit(void)
{
unregister_shrinker(&io_pgtable_alloc_shrinker);
}

View File

@@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2021,2023 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#ifndef __QCOM_IO_PGTABLE_ALLOC_H
#define __QCOM_IO_PGTABLE_ALLOC_H
int qcom_io_pgtable_allocator_register(u32 vmid);
void qcom_io_pgtable_allocator_unregister(u32 vmid);
struct page *qcom_io_pgtable_alloc_page(u32 vmid, gfp_t gfp);
void qcom_io_pgtable_free_page(struct page *page);
int qcom_io_pgtable_alloc_init(void);
void qcom_io_pgtable_alloc_exit(void);
#endif /* __QCOM_IO_PGTABLE_ALLOC_H */

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,389 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2015-2021, The Linux Foundation. All rights reserved.
* Copyright (c) 2022-2023 Qualcomm Innovation Center, Inc. All rights reserved.
*/
#define pr_fmt(fmt) "iommu-debug: %s: " fmt, __func__
#include <linux/bitfield.h>
#include <linux/debugfs.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/of_platform.h>
#include <linux/qcom-iommu-util.h>
#include "qcom-iommu-debug.h"
#define USECASE_SWITCH_TIMEOUT_MSECS (500)
static int iommu_debug_nr_iters_set(void *data, u64 val)
{
struct iommu_debug_device *ddev = data;
if (!val)
val = 1;
if (val > 10000)
val = 10000;
ddev->nr_iters = (u32)val;
return 0;
}
static int iommu_debug_nr_iters_get(void *data, u64 *val)
{
struct iommu_debug_device *ddev = data;
*val = ddev->nr_iters;
return 0;
}
DEFINE_DEBUGFS_ATTRIBUTE(iommu_debug_nr_iters_fops,
iommu_debug_nr_iters_get,
iommu_debug_nr_iters_set,
"%llu\n");
int iommu_debug_check_mapping_flags(struct device *dev, dma_addr_t iova, size_t size,
phys_addr_t expected_pa, u32 flags)
{
struct qcom_iommu_atos_txn txn;
struct iommu_fwspec *fwspec;
struct iommu_domain *domain;
domain = iommu_get_domain_for_dev(dev);
if (!domain) {
dev_err(dev, "iommu_get_domain_for_dev() failed\n");
return -EINVAL;
}
fwspec = dev_iommu_fwspec_get(dev);
if (!fwspec) {
dev_err(dev, "dev_iommu_fwspec_get() failed\n");
return -EINVAL;
}
txn.addr = iova;
txn.id = FIELD_GET(ARM_SMMU_SMR_ID, (fwspec->ids[0]));
txn.flags = flags;
size = PAGE_ALIGN(size);
while (size) {
phys_addr_t walk_pa, atos_pa;
atos_pa = qcom_iommu_iova_to_phys_hard(domain, &txn);
walk_pa = iommu_iova_to_phys(domain, iova);
if (expected_pa != atos_pa || expected_pa != walk_pa) {
dev_err_ratelimited(dev,
"Bad translation for %pad! Expected: %pa Got: %pa (ATOS) %pa (Table Walk) sid=%08x\n",
&iova, &expected_pa, &atos_pa, &walk_pa, txn.id);
return -EINVAL;
}
size -= PAGE_SIZE;
iova += PAGE_SIZE;
expected_pa += PAGE_SIZE;
}
return 0;
}
int iommu_debug_check_mapping_sg_flags(struct device *dev, struct scatterlist *sgl,
unsigned int pgoffset, unsigned int dma_nents,
unsigned int nents, u32 flags)
{
int ret;
struct sg_page_iter piter;
struct sg_dma_page_iter diter;
for (__sg_page_iter_start(&piter, sgl, nents, pgoffset),
__sg_page_iter_start(&diter.base, sgl, dma_nents, pgoffset);
__sg_page_iter_next(&piter) && __sg_page_iter_dma_next(&diter);) {
struct page *page = sg_page_iter_page(&piter);
dma_addr_t dma_addr = sg_page_iter_dma_address(&diter);
ret = iommu_debug_check_mapping_flags(dev, dma_addr, PAGE_SIZE,
page_to_phys(page), flags);
if (ret)
return ret;
}
return 0;
}
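/*
 * Illustrative sketch: how a test in this file might use the checker above
 * to verify one streaming mapping on the currently selected usecase device.
 * The "example_" name is hypothetical.
 */
#include <linux/dma-mapping.h>	/* for dma_map_page() in this sketch */

static int example_verify_single_map(struct device *test_dev)
{
	struct page *page;
	dma_addr_t iova;
	int ret;

	page = alloc_page(GFP_KERNEL);
	if (!page)
		return -ENOMEM;

	iova = dma_map_page(test_dev, page, 0, PAGE_SIZE, DMA_TO_DEVICE);
	if (dma_mapping_error(test_dev, iova)) {
		__free_page(page);
		return -ENOMEM;
	}

	/* Compares the hardware ATOS result and a table walk against the PA. */
	ret = iommu_debug_check_mapping(test_dev, iova, PAGE_SIZE,
					page_to_phys(page));

	dma_unmap_page(test_dev, iova, PAGE_SIZE, DMA_TO_DEVICE);
	__free_page(page);
	return ret;
}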
static void iommu_debug_destroy_test_dev(struct iommu_debug_device *ddev)
{
if (ddev->test_dev) {
of_platform_device_destroy(ddev->test_dev, NULL);
ddev->test_dev = NULL;
ddev->domain = NULL;
}
}
/*
* Returns struct device corresponding to the new usecase.
* ddev->test_dev will change - caller must not use old value!
* Caller must hold ddev->state_lock
*/
struct device *
iommu_debug_switch_usecase(struct iommu_debug_device *ddev, u32 usecase_nr)
{
struct platform_device *test_pdev;
struct device_node *child;
const char *str;
int child_nr = 0;
int ret;
if (ddev->test_dev)
iommu_debug_destroy_test_dev(ddev);
if (usecase_nr >= of_get_child_count(ddev->self->of_node)) {
dev_err(ddev->self, "Invalid usecase nr requested: %u\n",
usecase_nr);
return NULL;
}
reinit_completion(&ddev->probe_wait);
for_each_child_of_node(ddev->self->of_node, child) {
if (child_nr == usecase_nr)
break;
child_nr++;
}
test_pdev = of_platform_device_create(child, NULL, ddev->self);
if (!test_pdev) {
dev_err(ddev->self, "Creating platform device failed\n");
return NULL;
}
/*
* Wait for child device's probe function to be called.
* It's very unlikely to be asynchronous.
*/
ret = wait_for_completion_interruptible_timeout(&ddev->probe_wait,
msecs_to_jiffies(USECASE_SWITCH_TIMEOUT_MSECS));
if (ret <= 0) {
dev_err(ddev->self, "Timed out waiting for usecase to register\n");
goto out;
}
if (of_property_read_string(child, "qcom,iommu-dma", &str))
str = "default";
ddev->fastmap_usecase = !strcmp(str, "fastmap");
ddev->usecase_nr = usecase_nr;
ddev->test_dev = &test_pdev->dev;
ddev->domain = iommu_get_domain_for_dev(ddev->test_dev);
if (!ddev->domain) {
dev_err(ddev->self, "Oops, usecase not associated with iommu\n");
goto out;
}
return ddev->test_dev;
out:
iommu_debug_destroy_test_dev(ddev);
return NULL;
}
/*
* Caller must hold ddev->state_lock
*/
struct device *iommu_debug_usecase_reset(struct iommu_debug_device *ddev)
{
return iommu_debug_switch_usecase(ddev, ddev->usecase_nr);
}
static int iommu_debug_usecase_register(struct device *dev)
{
struct iommu_debug_device *ddev = dev_get_drvdata(dev->parent);
complete(&ddev->probe_wait);
return 0;
}
static ssize_t iommu_debug_usecase_read(struct file *file, char __user *ubuf,
size_t count, loff_t *offset)
{
struct iommu_debug_device *ddev = file->private_data;
return simple_read_from_buffer(ubuf, count, offset, ddev->buffer,
strnlen(ddev->buffer, PAGE_SIZE));
}
static ssize_t iommu_debug_usecase_write(struct file *file, const char __user *ubuf,
size_t count, loff_t *offset)
{
struct iommu_debug_device *ddev = file->private_data;
unsigned int usecase_nr;
int ret;
ret = kstrtouint_from_user(ubuf, count, 0, &usecase_nr);
if (ret || usecase_nr >= ddev->nr_children)
return -EINVAL;
mutex_lock(&ddev->state_lock);
if (!iommu_debug_switch_usecase(ddev, usecase_nr)) {
mutex_unlock(&ddev->state_lock);
return -EINVAL;
}
mutex_unlock(&ddev->state_lock);
return count;
}
static const struct file_operations iommu_debug_usecase_fops = {
.open = simple_open,
.read = iommu_debug_usecase_read,
.write = iommu_debug_usecase_write,
.llseek = no_llseek,
};
static int iommu_debug_debugfs_setup(struct iommu_debug_device *ddev)
{
struct dentry *dir;
dir = debugfs_create_dir("iommu-test", NULL);
if (IS_ERR(dir))
return -EINVAL;
ddev->root_dir = dir;
debugfs_create_file("usecase", 0600, dir, ddev, &iommu_debug_usecase_fops);
debugfs_create_file("functional_arm_dma_api", 0400, dir, ddev,
&iommu_debug_functional_arm_dma_api_fops);
debugfs_create_file("functional_fast_dma_api", 0400, dir, ddev,
&iommu_debug_functional_fast_dma_api_fops);
debugfs_create_file("atos", 0600, dir, ddev, &iommu_debug_atos_fops);
debugfs_create_file("map", 0200, dir, ddev, &iommu_debug_map_fops);
debugfs_create_file("unmap", 0200, dir, ddev, &iommu_debug_unmap_fops);
debugfs_create_file("dma_map", 0200, dir, ddev, &iommu_debug_dma_map_fops);
debugfs_create_file("dma_unmap", 0200, dir, ddev, &iommu_debug_dma_unmap_fops);
debugfs_create_file("nr_iters", 0600, dir, ddev, &iommu_debug_nr_iters_fops);
debugfs_create_file("test_virt_addr", 0400, dir, ddev, &iommu_debug_test_virt_addr_fops);
debugfs_create_file("profiling", 0400, dir, ddev, &iommu_debug_profiling_fops);
return 0;
}
static int iommu_debug_probe(struct platform_device *pdev)
{
struct iommu_debug_device *ddev;
struct device *dev = &pdev->dev;
struct device_node *child;
int ret;
int offset = 0;
ddev = devm_kzalloc(dev, sizeof(*ddev), GFP_KERNEL);
if (!ddev)
return -ENOMEM;
ddev->self = dev;
ddev->usecase_nr = U32_MAX;
ddev->nr_iters = 1;
mutex_init(&ddev->state_lock);
init_completion(&ddev->probe_wait);
ddev->buffer = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL);
if (!ddev->buffer) {
ret = -ENOMEM;
goto out;
}
ddev->nr_children = 0;
for_each_child_of_node(dev->of_node, child) {
offset += scnprintf(ddev->buffer + offset, PAGE_SIZE - offset,
"%d: %s\n", ddev->nr_children, child->name);
if (offset + 1 == PAGE_SIZE) {
dev_err(dev, "Too many testcases?\n");
break;
}
ddev->nr_children++;
}
dev_set_drvdata(dev, ddev);
ret = iommu_debug_debugfs_setup(ddev);
if (ret)
goto out;
return 0;
out:
mutex_destroy(&ddev->state_lock);
return ret;
}
static int iommu_debug_remove(struct platform_device *pdev)
{
struct iommu_debug_device *ddev = platform_get_drvdata(pdev);
debugfs_remove_recursive(ddev->root_dir);
if (ddev->test_dev)
of_platform_device_destroy(ddev->test_dev, NULL);
mutex_destroy(&ddev->state_lock);
return 0;
}
static const struct of_device_id iommu_debug_of_match[] = {
{ .compatible = "qcom,iommu-debug-test" },
{ },
};
static struct platform_driver iommu_debug_driver = {
.probe = iommu_debug_probe,
.remove = iommu_debug_remove,
.driver = {
.name = "qcom-iommu-debug",
.of_match_table = iommu_debug_of_match,
},
};
/*
* This isn't really a "driver", we just need something in the device tree
* to hook up to the `iommus' property.
*/
static int iommu_debug_usecase_probe(struct platform_device *pdev)
{
return iommu_debug_usecase_register(&pdev->dev);
}
static const struct of_device_id iommu_debug_usecase_of_match[] = {
{ .compatible = "qcom,iommu-debug-usecase" },
{ },
};
static struct platform_driver iommu_debug_usecase_driver = {
.probe = iommu_debug_usecase_probe,
.driver = {
.name = "qcom-iommu-debug-usecase",
.of_match_table = iommu_debug_usecase_of_match,
},
};
static int iommu_debug_init(void)
{
int ret;
ret = platform_driver_register(&iommu_debug_driver);
if (ret)
return ret;
ret = platform_driver_register(&iommu_debug_usecase_driver);
if (ret)
platform_driver_unregister(&iommu_debug_driver);
return ret;
}
static void iommu_debug_exit(void)
{
platform_driver_unregister(&iommu_debug_usecase_driver);
platform_driver_unregister(&iommu_debug_driver);
}
module_init(iommu_debug_init);
module_exit(iommu_debug_exit);
MODULE_LICENSE("GPL");

View File

@@ -0,0 +1,76 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (c) 2015-2021, The Linux Foundation. All rights reserved.
* Copyright (c) 2022-2023, Qualcomm Innovation Center, Inc. All rights reserved.
*/
#ifndef __DRIVERS_IOMMU_QCOM_IOMMU_DEBUG_H__
#define __DRIVERS_IOMMU_QCOM_IOMMU_DEBUG_H__
#include <linux/device.h>
#include <linux/debugfs.h>
#include <linux/iommu.h>
#include <linux/completion.h>
#include <linux/mutex.h>
#define MSI_IOVA_BASE 0x8000000
#define MSI_IOVA_LENGTH 0x100000
#define ARM_SMMU_SMR_ID GENMASK(15, 0)
struct iommu_debug_device {
struct device *self;
u32 nr_children;
char *buffer;
struct dentry *root_dir;
/* for usecase under test */
struct device *test_dev;
struct iommu_domain *domain;
u32 usecase_nr;
bool fastmap_usecase;
/* Protects test_dev */
struct mutex state_lock;
/* For waiting for child probe to complete */
struct completion probe_wait;
/* Used for atos */
u64 iova;
/* number of iterations */
u32 nr_iters;
};
struct device *iommu_debug_usecase_reset(struct iommu_debug_device *ddev);
struct device *iommu_debug_switch_usecase(struct iommu_debug_device *ddev, u32 usecase_nr);
int iommu_debug_check_mapping_flags(struct device *dev, dma_addr_t iova, size_t size,
phys_addr_t expected_pa, u32 flags);
#define iommu_debug_check_mapping(d, i, s, p) \
iommu_debug_check_mapping_flags(d, i, s, p, 0)
/* Only checks a single page */
#define iommu_debug_check_mapping_fast(d, i, s, p) \
iommu_debug_check_mapping_flags(d, i, PAGE_SIZE, p, 0)
int iommu_debug_check_mapping_sg_flags(struct device *dev, struct scatterlist *sgl,
unsigned int pgoffset, unsigned int dma_nents,
unsigned int nents, u32 flags);
#define iommu_debug_check_mapping_sg(d, s, o, e1, e2) \
iommu_debug_check_mapping_sg_flags(d, s, o, e1, e2, 0)
/* Only checks the last page of first sgl */
static inline int iommu_debug_check_mapping_sg_fast(struct device *dev, struct scatterlist *sgl,
unsigned int pgoffset, unsigned int dma_nents,
unsigned int nents)
{
pgoffset = PAGE_ALIGN(sgl->offset + sgl->length) >> PAGE_SHIFT;
return iommu_debug_check_mapping_sg_flags(dev, sgl, pgoffset - 1, dma_nents, 1, 0);
}
extern const struct file_operations iommu_debug_functional_arm_dma_api_fops;
extern const struct file_operations iommu_debug_functional_fast_dma_api_fops;
extern const struct file_operations iommu_debug_atos_fops;
extern const struct file_operations iommu_debug_map_fops;
extern const struct file_operations iommu_debug_unmap_fops;
extern const struct file_operations iommu_debug_dma_map_fops;
extern const struct file_operations iommu_debug_dma_unmap_fops;
extern const struct file_operations iommu_debug_test_virt_addr_fops;
extern const struct file_operations iommu_debug_profiling_fops;
#endif

File diff suppressed because it is too large Load Diff