// SPDX-License-Identifier: GPL-2.0

// Copyright (C) 2024 Google LLC.

use kernel::{
    page::PAGE_SIZE,
    prelude::*,
    rbtree::{RBTree, RBTreeNode, RBTreeNodeReservation},
    seq_file::SeqFile,
    seq_print,
    task::Pid,
};

/// Keeps track of allocations in a process' mmap.
///
/// Each process has an mmap where the data for incoming transactions will be placed. This struct
/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that
/// has metadata related to the allocation. We also keep track of available free space.
pub(crate) struct RangeAllocator<T> {
    tree: RBTree<usize, Descriptor<T>>,
    free_tree: RBTree<FreeKey, ()>,
    size: usize,
    free_oneway_space: usize,
    pub(crate) oneway_spam_detected: bool,
}

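// The allocator keeps two views of the same regions: `tree` maps each region's
// offset to its `Descriptor`, while `free_tree` indexes only the free regions
// by `(size, offset)`. Lookups by offset (commit, abort) and best-fit lookups
// by size (`reserve_new`) are then both tree searches; see `find_best_match`
// below.
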
/// Represents a range of pages that have just become completely free.
#[derive(Copy, Clone)]
pub(crate) struct FreedRange {
    pub(crate) start_page_idx: usize,
    pub(crate) end_page_idx: usize,
}

impl FreedRange {
    fn interior_pages(offset: usize, size: usize) -> FreedRange {
        FreedRange {
            // Divide round up
            start_page_idx: (offset + (PAGE_SIZE - 1)) / PAGE_SIZE,
            // Divide round down
            end_page_idx: (offset + size) / PAGE_SIZE,
        }
    }
}

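// For illustration (assuming PAGE_SIZE == 0x1000): a freed range at offset
// 0x1800 with size 0x2800 ends at 0x4000, so `interior_pages` yields
// start_page_idx = 2 and end_page_idx = 4, i.e. pages 2 and 3 lie entirely
// inside the freed bytes. Pages that the range only partially covers are added
// by `reservation_abort` when the neighbouring regions turn out to be free.
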
impl<T> RangeAllocator<T> {
    pub(crate) fn new(size: usize) -> Result<Self> {
        let mut tree = RBTree::new();
        tree.try_create_and_insert(0, Descriptor::new(0, size))?;
        let mut free_tree = RBTree::new();
        free_tree.try_create_and_insert((size, 0), ())?;
        Ok(Self {
            free_oneway_space: size / 2,
            tree,
            free_tree,
            oneway_spam_detected: false,
            size,
        })
    }

    pub(crate) fn debug_print(&self, m: &mut SeqFile) -> Result<()> {
        for desc in self.tree.values() {
            let state = match &desc.state {
                Some(state) => state,
                None => continue,
            };
            seq_print!(
                m,
                " buffer {}: {} size {} pid {} oneway {}",
                0,
                desc.offset,
                desc.size,
                state.pid(),
                state.is_oneway()
            );
            match state {
                DescriptorState::Reserved(_res) => {
                    seq_print!(m, " reserved\n");
                }
                DescriptorState::Allocated(_alloc) => {
                    seq_print!(m, " allocated\n");
                }
            }
        }
        Ok(())
    }

    fn find_best_match(&mut self, size: usize) -> Option<&mut Descriptor<T>> {
        let free_cursor = self.free_tree.cursor_lower_bound(&(size, 0))?;
        let ((_, offset), _) = free_cursor.current();
        self.tree.get_mut(offset)
    }

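    // Best-fit illustration: because `FreeKey` is `(size, offset)` and tuples
    // order lexicographically, `cursor_lower_bound(&(size, 0))` lands on the
    // smallest free region whose size is at least `size`. For example (with
    // purely illustrative numbers), free regions of sizes 0x80, 0x200 and
    // 0x1000 would satisfy a request for 0x100 bytes with the 0x200 region.
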
    /// Try to reserve a new buffer, using the provided allocation if necessary.
    pub(crate) fn reserve_new(
        &mut self,
        size: usize,
        is_oneway: bool,
        pid: Pid,
        alloc: ReserveNewBox<T>,
    ) -> Result<usize> {
        // Compute new value of free_oneway_space, which is set only on success.
        let new_oneway_space = if is_oneway {
            match self.free_oneway_space.checked_sub(size) {
                Some(new_oneway_space) => new_oneway_space,
                None => return Err(ENOSPC),
            }
        } else {
            self.free_oneway_space
        };

        // Start detecting spammers once we have less than 20%
        // of async space left (which is less than 10% of total
        // buffer size).
        //
        // (This will short-circuit, so `low_oneway_space` is
        // only called when necessary.)
        self.oneway_spam_detected =
            is_oneway && new_oneway_space < self.size / 10 && self.low_oneway_space(pid);

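        // Worked example of the thresholds above (illustrative numbers): with a
        // 1 MiB mmap, the oneway space starts at 512 KiB (`size / 2`). Spam
        // detection only runs once less than roughly 100 KiB (`size / 10`) of
        // oneway space would remain, and `low_oneway_space` then flags the
        // caller if it already holds more than 50 oneway buffers or more than
        // 256 KiB (`size / 4`) of oneway allocations.
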
        let (found_size, found_off, tree_node, free_tree_node) = match self.find_best_match(size) {
            None => {
                pr_warn!("ENOSPC from range_alloc.reserve_new - size: {}", size);
                return Err(ENOSPC);
            }
            Some(desc) => {
                let found_size = desc.size;
                let found_offset = desc.offset;

                // In case we need to break up the descriptor
                let new_desc = Descriptor::new(found_offset + size, found_size - size);
                let (tree_node, free_tree_node, desc_node_res) = alloc.initialize(new_desc);

                desc.state = Some(DescriptorState::new(is_oneway, pid, desc_node_res));
                desc.size = size;

                (found_size, found_offset, tree_node, free_tree_node)
            }
        };
        self.free_oneway_space = new_oneway_space;
        self.free_tree.remove(&(found_size, found_off));

        if found_size != size {
            self.tree.insert(tree_node);
            self.free_tree.insert(free_tree_node);
        }

        Ok(found_off)
    }

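    // Sketch of a successful `reserve_new` (illustrative values): if the best
    // match is a free descriptor at offset 0x0 of size 0x1000 and the request
    // is for 0x100 bytes, that descriptor becomes the reservation (offset 0x0,
    // size 0x100) and a new free descriptor (offset 0x100, size 0xf00) is
    // inserted into both trees using the nodes preallocated in the
    // `ReserveNewBox`. If the match is exact, no new descriptor is needed.
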
    pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
        let mut cursor = self.tree.cursor_lower_bound(&offset).ok_or_else(|| {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            EINVAL
        })?;

        let (_, desc) = cursor.current_mut();

        if desc.offset != offset {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            return Err(EINVAL);
        }

        let reservation = desc.try_change_state(|state| match state {
            Some(DescriptorState::Reserved(reservation)) => (None, Ok(reservation)),
            None => {
                pr_warn!(
                    "EINVAL from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (None, Err(EINVAL))
            }
            allocated => {
                pr_warn!(
                    "EPERM from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (allocated, Err(EPERM))
            }
        })?;

        let mut size = desc.size;
        let mut offset = desc.offset;
        let free_oneway_space_add = if reservation.is_oneway { size } else { 0 };

        self.free_oneway_space += free_oneway_space_add;

        let mut freed_range = FreedRange::interior_pages(offset, size);
        // Compute how large the next free region needs to be to include one more page in
        // the newly freed range.
        let add_next_page_needed = match (offset + size) % PAGE_SIZE {
            0 => usize::MAX,
            unalign => PAGE_SIZE - unalign,
        };
        // Compute how large the previous free region needs to be to include one more page
        // in the newly freed range.
        let add_prev_page_needed = match offset % PAGE_SIZE {
            0 => usize::MAX,
            unalign => unalign,
        };

        // Merge next into current if next is free
        let remove_next = match cursor.peek_next() {
            Some((_, next)) if next.state.is_none() => {
                if next.size >= add_next_page_needed {
                    freed_range.end_page_idx += 1;
                }
                self.free_tree.remove(&(next.size, next.offset));
                size += next.size;
                true
            }
            _ => false,
        };

        if remove_next {
            let (_, desc) = cursor.current_mut();
            desc.size = size;
            cursor.remove_next();
        }

        // Merge current into prev if prev is free
        match cursor.peek_prev_mut() {
            Some((_, prev)) if prev.state.is_none() => {
                if prev.size >= add_prev_page_needed {
                    freed_range.start_page_idx -= 1;
                }
                // merge previous with current, remove current
                self.free_tree.remove(&(prev.size, prev.offset));
                offset = prev.offset;
                size += prev.size;
                prev.size = size;
                cursor.remove_current();
            }
            _ => {}
        };

        self.free_tree
            .insert(reservation.free_res.into_node((size, offset), ()));

        Ok(freed_range)
    }

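    // Sketch of the merge logic above (illustrative): aborting a reservation
    // turns its descriptor back into free space, and a free neighbour on
    // either side is coalesced into it, so the trees never hold two adjacent
    // free regions. The returned `FreedRange` starts with the pages wholly
    // inside the aborted buffer and is widened by one page at either end when
    // the merge shows that a partially covered page has become entirely free.
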
    pub(crate) fn reservation_commit(&mut self, offset: usize, data: Option<T>) -> Result {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reservation_commit - offset: {}",
                offset
            );
            ENOENT
        })?;

        desc.try_change_state(|state| match state {
            Some(DescriptorState::Reserved(reservation)) => (
                Some(DescriptorState::Allocated(reservation.allocate(data))),
                Ok(()),
            ),
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reservation_commit - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })
    }

    /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to
    /// [`DescriptorState::Reserved`].
    ///
    /// Returns the size of the existing entry and the data associated with it.
    pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, Option<T>)> {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reserve_existing - offset: {}",
                offset
            );
            ENOENT
        })?;

        let data = desc.try_change_state(|state| match state {
            Some(DescriptorState::Allocated(allocation)) => {
                let (reservation, data) = allocation.deallocate();
                (Some(DescriptorState::Reserved(reservation)), Ok(data))
            }
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reserve_existing - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })?;

        Ok((desc.size, data))
    }

    /// Calls the provided callback for each allocated region.
    ///
    /// This destroys the range allocator. Used only during shutdown.
    pub(crate) fn take_for_each<F: Fn(usize, usize, Option<T>)>(&mut self, callback: F) {
        for (_, desc) in self.tree.iter_mut() {
            if let Some(DescriptorState::Allocated(allocation)) = &mut desc.state {
                callback(desc.offset, desc.size, allocation.take());
            }
        }
    }

    /// Finds the number and total size of buffers allocated by the current caller.
    ///
    /// The idea is that once we cross the threshold, whoever is responsible
    /// for the low async space is likely to try to send another async transaction,
    /// and at some point we'll catch them in the act. This is more efficient
    /// than keeping a map per pid.
    fn low_oneway_space(&self, calling_pid: Pid) -> bool {
        let mut total_alloc_size = 0;
        let mut num_buffers = 0;
        for (_, desc) in self.tree.iter() {
            if let Some(state) = &desc.state {
                if state.is_oneway() && state.pid() == calling_pid {
                    total_alloc_size += desc.size;
                    num_buffers += 1;
                }
            }
        }

        // Warn if this pid has more than 50 transactions, or more than 50% of
        // async space (which is 25% of total buffer size). Oneway spam is only
        // detected when the threshold is exceeded.
        num_buffers > 50 || total_alloc_size > self.size / 4
    }
}

struct Descriptor<T> {
    size: usize,
    offset: usize,
    state: Option<DescriptorState<T>>,
}

impl<T> Descriptor<T> {
    fn new(offset: usize, size: usize) -> Self {
        Self {
            size,
            offset,
            state: None,
        }
    }

    fn try_change_state<F, Data>(&mut self, f: F) -> Result<Data>
    where
        F: FnOnce(Option<DescriptorState<T>>) -> (Option<DescriptorState<T>>, Result<Data>),
    {
        let (new_state, result) = f(self.state.take());
        self.state = new_state;
        result
    }
}

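// The take-and-restore pattern in `try_change_state` lets callers match on the
// owned state and hand back either the old state or a new one in a single
// step. `reservation_commit`, for example, maps `Reserved(r)` to
// `Allocated(r.allocate(data))` and returns any other state unchanged together
// with an error.
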
enum DescriptorState<T> {
    Reserved(Reservation),
    Allocated(Allocation<T>),
}

impl<T> DescriptorState<T> {
    fn new(is_oneway: bool, pid: Pid, free_res: FreeNodeRes) -> Self {
        DescriptorState::Reserved(Reservation {
            is_oneway,
            pid,
            free_res,
        })
    }

    fn pid(&self) -> Pid {
        match self {
            DescriptorState::Reserved(inner) => inner.pid,
            DescriptorState::Allocated(inner) => inner.pid,
        }
    }

    fn is_oneway(&self) -> bool {
        match self {
            DescriptorState::Reserved(inner) => inner.is_oneway,
            DescriptorState::Allocated(inner) => inner.is_oneway,
        }
    }
}

struct Reservation {
    is_oneway: bool,
    pid: Pid,
    free_res: FreeNodeRes,
}

impl Reservation {
    fn allocate<T>(self, data: Option<T>) -> Allocation<T> {
        Allocation {
            data,
            is_oneway: self.is_oneway,
            pid: self.pid,
            free_res: self.free_res,
        }
    }
}

struct Allocation<T> {
    is_oneway: bool,
    pid: Pid,
    free_res: FreeNodeRes,
    data: Option<T>,
}

impl<T> Allocation<T> {
    fn deallocate(self) -> (Reservation, Option<T>) {
        (
            Reservation {
                is_oneway: self.is_oneway,
                pid: self.pid,
                free_res: self.free_res,
            },
            self.data,
        )
    }

    fn take(&mut self) -> Option<T> {
        self.data.take()
    }
}

// Keys of the free tree: (Descriptor.size, Descriptor.offset).
type FreeKey = (usize, usize);
type FreeNodeRes = RBTreeNodeReservation<FreeKey, ()>;

/// An allocation for use by `reserve_new`.
pub(crate) struct ReserveNewBox<T> {
    tree_node_res: RBTreeNodeReservation<usize, Descriptor<T>>,
    free_tree_node_res: FreeNodeRes,
    desc_node_res: FreeNodeRes,
}

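// `ReserveNewBox::try_new` performs all three node allocations up front, so a
// later `reserve_new` call only moves the preallocated nodes into the trees.
// (Presumably this lets the caller allocate memory before entering a context
// where allocating is undesirable; `reserve_new` itself only consumes the
// reservations.)
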
impl<T> ReserveNewBox<T> {
    pub(crate) fn try_new() -> Result<Self> {
        let tree_node_res = RBTree::try_reserve_node()?;
        let free_tree_node_res = RBTree::try_reserve_node()?;
        let desc_node_res = RBTree::try_reserve_node()?;
        Ok(Self {
            tree_node_res,
            free_tree_node_res,
            desc_node_res,
        })
    }

    fn initialize(
        self,
        desc: Descriptor<T>,
    ) -> (
        RBTreeNode<usize, Descriptor<T>>,
        RBTreeNode<FreeKey, ()>,
        FreeNodeRes,
    ) {
        let size = desc.size;
        let offset = desc.offset;
        (
            self.tree_node_res.into_node(offset, desc),
            self.free_tree_node_res.into_node((size, offset), ()),
            self.desc_node_res,
        )
    }
}