ANDROID: Track per-process dmabuf RSS

DMA buffers exist for sharing memory (between processes, drivers, and
hardware), so they are not accounted the same way as user memory present
on an mm's LRUs. The kernel does not maintain per-process attribution of
dmabuf memory, so obtaining it from userspace requires reading several
files from procfs and sysfs every time the information is needed. That
process is slow, which can leave dmabuf accounting information out of
date when it is needed most, such as during low-memory events or
bugreport generation, masking the cause of memory issues.

This patch attributes dmabuf memory to any process that holds a
reference to a buffer. A process can hold a reference to a dmabuf in two
ways:
  1) Through a file descriptor
  2) Through a mapping

A single buffer can be referenced more than once by a single process
with multiple file descriptors for the same buffer, multiple mappings
for the same buffer, or any combination of the two.

The full size of a buffer is effectively pinned until no references
remain, either from any process or from elsewhere in the kernel (such
as drivers that have imported the buffer), even if a partial mapping of
the buffer is the only reference left. Therefore buffer accounting is
always performed in units of the full buffer size, and only once per
process, regardless of the number and type of references a process
holds for a single buffer.
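
As an illustration only (not part of this patch), the sketch below
shows one process creating both kinds of references to a single
buffer. It assumes a DMA-BUF heap exported at /dev/dma_heap/system;
the heap path and the sizes are examples.

  #include <fcntl.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>
  #include <unistd.h>
  #include <linux/dma-heap.h>

  int main(void)
  {
          /* Allocate a 4 MiB dmabuf (heap path is an example). */
          int heap = open("/dev/dma_heap/system", O_RDONLY | O_CLOEXEC);
          struct dma_heap_allocation_data alloc = {
                  .len = 4 << 20,
                  .fd_flags = O_RDWR | O_CLOEXEC,
          };

          if (heap < 0 || ioctl(heap, DMA_HEAP_IOCTL_ALLOC, &alloc) < 0)
                  return 1;

          int fd2 = dup(alloc.fd);                  /* second fd reference */
          void *p = mmap(NULL, 1 << 20, PROT_READ,  /* partial mapping */
                         MAP_SHARED, alloc.fd, 0);

          /*
           * Three references exist (two fds and one 1 MiB mapping), yet
           * this process's dmabuf RSS grows by the full 4 MiB exactly once.
           */
          if (p != MAP_FAILED)
                  munmap(p, 1 << 20);
          close(fd2);
          close(alloc.fd);
          close(heap);
          return 0;
  }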

The /proc/<pid>/dmabuf_rss file in procfs now reports, in bytes, the
sum of the sizes of all buffers referenced by a process. This allows
userspace to obtain per-process dmabuf accounting information quickly,
instead of reconstructing it from multiple sources in procfs and sysfs.
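
For example, a monitoring daemon could read the new file with a small
helper like this sketch (the helper name is illustrative and error
handling is minimal):

  #include <stdio.h>

  /* Returns the dmabuf RSS of @pid in bytes, or -1 on error. */
  static long long read_dmabuf_rss(int pid)
  {
          char path[64];
          long long rss = -1;
          FILE *f;

          snprintf(path, sizeof(path), "/proc/%d/dmabuf_rss", pid);
          f = fopen(path, "r");
          if (!f)
                  return -1;
          if (fscanf(f, "%lld", &rss) != 1)
                  rss = -1;
          fclose(f);
          return rss;
  }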

Note that a dmabuf can be backed by different types of memory such as
system DRAM, GPU VRAM, or others. This patch makes no distinction
between these different types of memory, so on systems with non-unified
memory the reported values should be interpreted with this in mind.

Bug: 424648392
Change-Id: I1de8e937f2971fe714008b459e410dde2a251b90
Signed-off-by: T.J. Mercier <tjmercier@google.com>

@@ -162,9 +162,121 @@ static struct file_system_type dma_buf_fs_type = {
.kill_sb = kill_anon_super,
};
static struct task_dma_buf_record *find_task_dmabuf_record(
struct task_struct *task, struct dma_buf *dmabuf)
{
struct task_dma_buf_record *rec;
lockdep_assert_held(&task->dmabuf_info->lock);
list_for_each_entry(rec, &task->dmabuf_info->dmabufs, node)
if (dmabuf == rec->dmabuf)
return rec;
return NULL;
}
static int new_task_dmabuf_record(struct task_struct *task, struct dma_buf *dmabuf)
{
struct task_dma_buf_record *rec;
lockdep_assert_held(&task->dmabuf_info->lock);
rec = kmalloc(sizeof(*rec), GFP_KERNEL);
if (!rec)
return -ENOMEM;
task->dmabuf_info->rss += dmabuf->size;
rec->dmabuf = dmabuf;
rec->refcnt = 1;
list_add(&rec->node, &task->dmabuf_info->dmabufs);
return 0;
}
/**
* dma_buf_account_task - Account a dmabuf to a task
* @dmabuf: [in] pointer to dma_buf
* @task: [in] pointer to task_struct
*
* When a process obtains a dmabuf file descriptor, or maps a dmabuf, this
* function attributes the provided @dmabuf to the @task. The first time @dmabuf
* is attributed to @task, the buffer's size is added to the @task's dmabuf RSS.
*
* Return:
* * 0 on success
* * A negative error code upon error
*/
int dma_buf_account_task(struct dma_buf *dmabuf, struct task_struct *task)
{
struct task_dma_buf_record *rec;
int ret = 0;
if (!dmabuf || !task)
return -EINVAL;
if (!task->dmabuf_info) {
pr_err("%s dmabuf accounting record was not allocated\n", __func__);
return -ENOMEM;
}
spin_lock(&task->dmabuf_info->lock);
rec = find_task_dmabuf_record(task, dmabuf);
if (!rec)
ret = new_task_dmabuf_record(task, dmabuf);
else
++rec->refcnt;
spin_unlock(&task->dmabuf_info->lock);
return ret;
}
/**
* dma_buf_unaccount_task - Unaccount a dmabuf from a task
* @dmabuf: [in] pointer to dma_buf
* @task: [in] pointer to task_struct
*
* When a process closes a dmabuf file descriptor, or unmaps a dmabuf, this
* function removes the provided @dmabuf attribution from the @task. When all
* references to @dmabuf are removed from @task, the buffer's size is removed
* from the task's dmabuf RSS.
*/
void dma_buf_unaccount_task(struct dma_buf *dmabuf, struct task_struct *task)
{
struct task_dma_buf_record *rec;
if (!dmabuf || !task)
return;
if (!task->dmabuf_info) {
pr_err("%s dmabuf accounting record was not allocated\n", __func__);
return;
}
spin_lock(&task->dmabuf_info->lock);
rec = find_task_dmabuf_record(task, dmabuf);
if (!rec) { /* Failed fd_install? */
pr_err("dmabuf not found in task list\n");
goto err;
}
if (--rec->refcnt == 0) {
list_del(&rec->node);
kfree(rec);
task->dmabuf_info->rss -= dmabuf->size;
}
err:
spin_unlock(&task->dmabuf_info->lock);
}
static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
{
struct dma_buf *dmabuf;
int ret;
if (!is_dma_buf_file(file))
return -EINVAL;
@@ -180,7 +292,15 @@ static int dma_buf_mmap_internal(struct file *file, struct vm_area_struct *vma)
dmabuf->size >> PAGE_SHIFT)
return -EINVAL;
-        return dmabuf->ops->mmap(dmabuf, vma);
+        ret = dma_buf_account_task(dmabuf, current);
+        if (ret)
+                return ret;
+
+        ret = dmabuf->ops->mmap(dmabuf, vma);
+        if (ret)
+                dma_buf_unaccount_task(dmabuf, current);
+        return ret;
}
static loff_t dma_buf_llseek(struct file *file, loff_t offset, int whence)
@@ -557,6 +677,12 @@ static void dma_buf_show_fdinfo(struct seq_file *m, struct file *file)
spin_unlock(&dmabuf->name_lock);
}
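/*
 * ->flush() runs on every close() of a dmabuf fd, unlike ->release()
 * which only runs when the last reference to the file goes away, so it
 * is where one fd reference is dropped from the closing task's accounting.
 */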
static int dma_buf_flush(struct file *file, fl_owner_t id)
{
dma_buf_unaccount_task(file->private_data, current);
return 0;
}
static const struct file_operations dma_buf_fops = {
.release = dma_buf_file_release,
.mmap = dma_buf_mmap_internal,
@@ -565,6 +691,7 @@ static const struct file_operations dma_buf_fops = {
.unlocked_ioctl = dma_buf_ioctl,
.compat_ioctl = compat_ptr_ioctl,
.show_fdinfo = dma_buf_show_fdinfo,
.flush = dma_buf_flush,
};
/*
@@ -1555,6 +1682,8 @@ EXPORT_SYMBOL_GPL(dma_buf_end_cpu_access_partial);
int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
unsigned long pgoff)
{
int ret;
if (WARN_ON(!dmabuf || !vma))
return -EINVAL;
@@ -1575,7 +1704,15 @@ int dma_buf_mmap(struct dma_buf *dmabuf, struct vm_area_struct *vma,
vma_set_file(vma, dmabuf->file);
vma->vm_pgoff = pgoff;
-        return dmabuf->ops->mmap(dmabuf, vma);
+        ret = dma_buf_account_task(dmabuf, current);
+        if (ret)
+                return ret;
+
+        ret = dmabuf->ops->mmap(dmabuf, vma);
+        if (ret)
+                dma_buf_unaccount_task(dmabuf, current);
+        return ret;
}
EXPORT_SYMBOL_NS_GPL(dma_buf_mmap, DMA_BUF);

@@ -20,6 +20,7 @@
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <linux/dma-buf.h>
#include <net/sock.h>
#include "internal.h"
@@ -593,6 +594,9 @@ void fd_install(unsigned int fd, struct file *file)
struct files_struct *files = current->files;
struct fdtable *fdt;
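/*
 * fd_install() cannot fail, so dmabuf accounting is best effort here:
 * log the failure and install the fd anyway.
 */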
if (is_dma_buf_file(file) && dma_buf_account_task(file->private_data, current))
pr_err("FD dmabuf accounting failed\n");
rcu_read_lock_sched();
if (unlikely(files->resize_in_progress)) {

@@ -100,6 +100,7 @@
#include <linux/cn_proc.h>
#include <linux/ksm.h>
#include <linux/cpufreq_times.h>
#include <linux/dma-buf.h>
#include <trace/events/oom.h>
#include <trace/hooks/sched.h>
#include "internal.h"
@@ -3304,6 +3305,24 @@ static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
}
#endif /* CONFIG_STACKLEAK_METRICS */
#ifdef CONFIG_DMA_SHARED_BUFFER
static int proc_dmabuf_rss_show(struct seq_file *m, struct pid_namespace *ns,
struct pid *pid, struct task_struct *task)
{
if (!task->dmabuf_info) {
pr_err("%s dmabuf accounting record was not allocated\n", __func__);
return -ENOMEM;
}
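/* Kernel threads hold no dmabuf fds or mappings, so report 0 for them. */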
if (!(task->flags & PF_KTHREAD))
seq_printf(m, "%lld\n", READ_ONCE(task->dmabuf_info->rss));
else
seq_puts(m, "0\n");
return 0;
}
#endif
/*
* Thread groups
*/
@@ -3427,6 +3446,9 @@ static const struct pid_entry tgid_base_stuff[] = {
ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat),
#endif
#ifdef CONFIG_DMA_SHARED_BUFFER
ONE("dmabuf_rss", S_IRUGO, proc_dmabuf_rss_show),
#endif
};
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)

@@ -24,6 +24,9 @@
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/android_kabi.h>
#ifndef __GENKSYMS__
#include <linux/refcount.h>
#endif
struct device;
struct dma_buf;
@@ -639,6 +642,43 @@ struct dma_buf_export_info {
ANDROID_KABI_RESERVE(2);
};
/**
* struct task_dma_buf_record - Holds the number of (VMA and FD) references to a
* dmabuf by a collection of tasks that share both mm_struct and files_struct.
* This is the list entry type for the @task_dma_buf_info dmabufs list.
*
* @node: Stores the list this record is on.
* @dmabuf: The dmabuf this record is for.
* @refcnt: The number of VMAs and FDs that reference @dmabuf by the tasks that
* share this record.
*/
struct task_dma_buf_record {
struct list_head node;
struct dma_buf *dmabuf;
unsigned long refcnt;
};
/**
* struct task_dma_buf_info - Holds an RSS counter and a list of dmabufs for all
* tasks that share both mm_struct and files_struct.
*
* @rss: The sum, in bytes, of all dmabuf memory referenced by the tasks via
* memory mappings or file descriptors. Buffers referenced more than once
* by the process (multiple mmaps, multiple FDs, or any combination of
* the two) only cause the buffer to be accounted to the process once.
* Partial mappings cause the full size of the buffer to be accounted,
* regardless of the size of the mapping.
* @refcnt: The number of tasks sharing this struct.
* @lock: Lock protecting writes to @rss, and all reads/writes of @dmabufs.
* @dmabufs: List of all dmabufs referenced by the tasks.
*/
struct task_dma_buf_info {
s64 rss;
refcount_t refcnt;
spinlock_t lock;
struct list_head dmabufs;
};
/**
* DEFINE_DMA_BUF_EXPORT_INFO - helper macro for exporters
* @name: export-info name
@@ -741,4 +781,7 @@ int dma_buf_vmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map);
void dma_buf_vunmap_unlocked(struct dma_buf *dmabuf, struct iosys_map *map);
long dma_buf_set_name(struct dma_buf *dmabuf, const char *name);
int dma_buf_get_flags(struct dma_buf *dmabuf, unsigned long *flags);
int dma_buf_account_task(struct dma_buf *dmabuf, struct task_struct *task);
void dma_buf_unaccount_task(struct dma_buf *dmabuf, struct task_struct *task);
#endif /* __DMA_BUF_H__ */

@@ -70,6 +70,7 @@ struct seq_file;
struct sighand_struct;
struct signal_struct;
struct task_delay_info;
struct task_dma_buf_info;
struct task_group;
struct user_event_mm;
@@ -1516,6 +1517,9 @@ struct task_struct {
*/
struct callback_head l1d_flush_kill;
#endif
struct task_dma_buf_info *dmabuf_info;
ANDROID_KABI_RESERVE(1);
ANDROID_KABI_RESERVE(2);
ANDROID_KABI_RESERVE(3);

@@ -214,6 +214,7 @@ struct task_struct init_task
.android_vendor_data1 = {0, },
.android_oem_data1 = {0, },
#endif
.dmabuf_info = NULL,
};
EXPORT_SYMBOL(init_task);

@@ -101,6 +101,7 @@
#include <linux/iommu.h>
#include <linux/tick.h>
#include <linux/cpufreq_times.h>
#include <linux/dma-buf.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -994,12 +995,32 @@ static inline void put_signal_struct(struct signal_struct *sig)
free_signal_struct(sig);
}
static void put_dmabuf_info(struct task_struct *tsk)
{
if (!tsk->dmabuf_info) {
pr_err("%s dmabuf accounting record was not allocated\n", __func__);
return;
}
if (!refcount_dec_and_test(&tsk->dmabuf_info->refcnt))
return;
if (READ_ONCE(tsk->dmabuf_info->rss))
pr_err("%s destroying task with non-zero dmabuf rss\n", __func__);
if (!list_empty(&tsk->dmabuf_info->dmabufs))
pr_err("%s destroying task with non-empty dmabuf list\n", __func__);
kfree(tsk->dmabuf_info);
}
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
put_dmabuf_info(tsk);
io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
@@ -2268,6 +2289,58 @@ static void rv_task_fork(struct task_struct *p)
#define rv_task_fork(p) do {} while (0)
#endif
static int copy_dmabuf_info(u64 clone_flags, struct task_struct *p)
{
struct task_dma_buf_record *rec, *copy;
if (current->dmabuf_info && (clone_flags & (CLONE_VM | CLONE_FILES))
== (CLONE_VM | CLONE_FILES)) {
/*
* Both MM and FD references to dmabufs are shared with the parent, so
* we can share a RSS counter with the parent.
*/
refcount_inc(&current->dmabuf_info->refcnt);
p->dmabuf_info = current->dmabuf_info;
return 0;
}
p->dmabuf_info = kmalloc(sizeof(*p->dmabuf_info), GFP_KERNEL);
if (!p->dmabuf_info)
return -ENOMEM;
refcount_set(&p->dmabuf_info->refcnt, 1);
spin_lock_init(&p->dmabuf_info->lock);
INIT_LIST_HEAD(&p->dmabuf_info->dmabufs);
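/*
 * The child starts with the same buffer references as the parent (its
 * fds and mappings are either shared or duplicated at fork), so seed
 * the new counter with a snapshot of the parent's records and RSS.
 */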
if (current->dmabuf_info) {
spin_lock(&current->dmabuf_info->lock);
p->dmabuf_info->rss = current->dmabuf_info->rss;
list_for_each_entry(rec, &current->dmabuf_info->dmabufs, node) {
copy = kmalloc(sizeof(*copy), GFP_KERNEL);
if (!copy) {
spin_unlock(&current->dmabuf_info->lock);
goto err_list_copy;
}
copy->dmabuf = rec->dmabuf;
copy->refcnt = rec->refcnt;
list_add(&copy->node, &p->dmabuf_info->dmabufs);
}
spin_unlock(&current->dmabuf_info->lock);
} else {
p->dmabuf_info->rss = 0;
}
return 0;
err_list_copy:
list_for_each_entry_safe(rec, copy, &p->dmabuf_info->dmabufs, node) {
list_del(&rec->node);
kfree(rec);
}
kfree(p->dmabuf_info);
return -ENOMEM;
}
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -2509,14 +2582,18 @@ __latent_entropy struct task_struct *copy_process(
p->bpf_ctx = NULL;
#endif
-        /* Perform scheduler related setup. Assign this task to a CPU. */
-        retval = sched_fork(clone_flags, p);
+        retval = copy_dmabuf_info(clone_flags, p);
         if (retval)
                 goto bad_fork_cleanup_policy;
+
+        /* Perform scheduler related setup. Assign this task to a CPU. */
+        retval = sched_fork(clone_flags, p);
+        if (retval)
+                goto bad_fork_cleanup_dmabuf;

         retval = perf_event_init_task(p, clone_flags);
         if (retval)
-                goto bad_fork_cleanup_policy;
+                goto bad_fork_cleanup_dmabuf;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
@@ -2819,6 +2896,8 @@ bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
bad_fork_cleanup_dmabuf:
put_dmabuf_info(p);
bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA

@@ -49,6 +49,7 @@
#include <linux/oom.h>
#include <linux/sched/mm.h>
#include <linux/ksm.h>
#include <linux/dma-buf.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -144,8 +145,11 @@ static void remove_vma(struct vm_area_struct *vma, bool unreachable)
{
might_sleep();
vma_close(vma);
-        if (vma->vm_file)
+        if (vma->vm_file) {
+                if (is_dma_buf_file(vma->vm_file))
+                        dma_buf_unaccount_task(vma->vm_file->private_data, current);
                 fput(vma->vm_file);
+        }
mpol_put(vma_policy(vma));
if (unreachable)
__vm_area_free(vma);
@@ -2417,8 +2421,14 @@ int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
if (err)
goto out_free_mpol;
-        if (new->vm_file)
+        if (new->vm_file) {
                 get_file(new->vm_file);
+                if (is_dma_buf_file(new->vm_file)) {
+                        /* Should never fail since this task already references the buffer */
+                        if (dma_buf_account_task(new->vm_file->private_data, current))
+                                pr_err("%s failed to account dmabuf\n", __func__);
+                }
+        }
if (new->vm_ops && new->vm_ops->open)
new->vm_ops->open(new);