// SPDX-License-Identifier: GPL-2.0

// Copyright (C) 2024 Google LLC.

use kernel::{
    page::PAGE_SIZE,
    prelude::*,
    rbtree::{RBTree, RBTreeNode, RBTreeNodeReservation},
    seq_file::SeqFile,
    seq_print,
    task::Pid,
};

/// Keeps track of allocations in a process' mmap.
///
/// Each process has an mmap where the data for incoming transactions will be placed. This struct
/// keeps track of allocations made in the mmap. For each allocation, we store a descriptor that
/// has metadata related to the allocation. We also keep track of available free space.
pub(crate) struct RangeAllocator<T> {
    tree: RBTree<usize, Descriptor<T>>,
    free_tree: RBTree<FreeKey, ()>,
    size: usize,
    free_oneway_space: usize,
    pub(crate) oneway_spam_detected: bool,
}

/// Represents a range of pages that have just become completely free.
#[derive(Copy, Clone)]
pub(crate) struct FreedRange {
    pub(crate) start_page_idx: usize,
    pub(crate) end_page_idx: usize,
}

impl FreedRange {
    fn interior_pages(offset: usize, size: usize) -> FreedRange {
        FreedRange {
            // Divide round up
            start_page_idx: (offset + (PAGE_SIZE - 1)) / PAGE_SIZE,
            // Divide round down
            end_page_idx: (offset + size) / PAGE_SIZE,
        }
    }
}

impl<T> RangeAllocator<T> {
    pub(crate) fn new(size: usize) -> Result<Self> {
        let mut tree = RBTree::new();
        tree.try_create_and_insert(0, Descriptor::new(0, size))?;
        let mut free_tree = RBTree::new();
        free_tree.try_create_and_insert((size, 0), ())?;
        Ok(Self {
            free_oneway_space: size / 2,
            tree,
            free_tree,
            oneway_spam_detected: false,
            size,
        })
    }

    pub(crate) fn debug_print(&self, m: &mut SeqFile) -> Result<()> {
        for desc in self.tree.values() {
            let state = match &desc.state {
                Some(state) => state,
                None => continue,
            };
            seq_print!(
                m,
                "  buffer {}: {} size {} pid {} oneway {}",
                0,
                desc.offset,
                desc.size,
                state.pid(),
                state.is_oneway()
            );
            match state {
                DescriptorState::Reserved(_res) => {
                    seq_print!(m, " reserved\n");
                }
                DescriptorState::Allocated(_alloc) => {
                    seq_print!(m, " allocated\n");
                }
            }
        }
        Ok(())
    }

    fn find_best_match(&mut self, size: usize) -> Option<&mut Descriptor<T>> {
        let free_cursor = self.free_tree.cursor_lower_bound(&(size, 0))?;
        let ((_, offset), _) = free_cursor.current();
        self.tree.get_mut(offset)
    }
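
    // Illustrative note (not from the driver itself): `free_tree` is keyed by
    // `(size, offset)`, so `cursor_lower_bound(&(size, 0))` lands on the
    // smallest free region whose size is at least `size`, i.e. a best-fit
    // search. For example, given free regions keyed as
    //
    //     (16, 0x300), (64, 0x000), (4096, 0x100)
    //
    // a request for `size = 32` takes the lower bound of `(32, 0)`, which is
    // `(64, 0x000)`; the matching descriptor is then fetched from `tree` by
    // its offset.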

    /// Try to reserve a new buffer, using the provided allocation if necessary.
    pub(crate) fn reserve_new(
        &mut self,
        size: usize,
        is_oneway: bool,
        pid: Pid,
        alloc: ReserveNewBox<T>,
    ) -> Result<usize> {
        // Compute new value of free_oneway_space, which is set only on success.
        let new_oneway_space = if is_oneway {
            match self.free_oneway_space.checked_sub(size) {
                Some(new_oneway_space) => new_oneway_space,
                None => return Err(ENOSPC),
            }
        } else {
            self.free_oneway_space
        };

        // Start detecting spammers once we have less than 20%
        // of async space left (which is less than 10% of total
        // buffer size).
        //
        // (This will short-circuit, so `low_oneway_space` is
        // only called when necessary.)
        self.oneway_spam_detected =
            is_oneway && new_oneway_space < self.size / 10 && self.low_oneway_space(pid);

        let (found_size, found_off, tree_node, free_tree_node) = match self.find_best_match(size) {
            None => {
                pr_warn!("ENOSPC from range_alloc.reserve_new - size: {}", size);
                return Err(ENOSPC);
            }
            Some(desc) => {
                let found_size = desc.size;
                let found_offset = desc.offset;

                // In case we need to break up the descriptor
                let new_desc = Descriptor::new(found_offset + size, found_size - size);
                let (tree_node, free_tree_node, desc_node_res) = alloc.initialize(new_desc);

                desc.state = Some(DescriptorState::new(is_oneway, pid, desc_node_res));
                desc.size = size;

                (found_size, found_offset, tree_node, free_tree_node)
            }
        };
        self.free_oneway_space = new_oneway_space;
        self.free_tree.remove(&(found_size, found_off));

        if found_size != size {
            self.tree.insert(tree_node);
            self.free_tree.insert(free_tree_node);
        }

        Ok(found_off)
    }
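
    // Caller-side sketch (hypothetical; `range`, `data`, and
    // `copy_transaction_data` are stand-ins, not items in this file): a
    // reservation is either committed or aborted, and the `ReserveNewBox` is
    // allocated up front so that `reserve_new` itself never needs to allocate:
    //
    //     let alloc = ReserveNewBox::try_new()?;
    //     let offset = range.reserve_new(size, is_oneway, pid, alloc)?;
    //     match copy_transaction_data(offset) {
    //         Ok(()) => range.reservation_commit(offset, Some(data))?,
    //         Err(err) => {
    //             // The returned `FreedRange` lists pages that became
    //             // completely free and may be released.
    //             let _freed = range.reservation_abort(offset)?;
    //             return Err(err);
    //         }
    //     }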

    pub(crate) fn reservation_abort(&mut self, offset: usize) -> Result<FreedRange> {
        let mut cursor = self.tree.cursor_lower_bound(&offset).ok_or_else(|| {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            EINVAL
        })?;

        let (_, desc) = cursor.current_mut();

        if desc.offset != offset {
            pr_warn!(
                "EINVAL from range_alloc.reservation_abort - offset: {}",
                offset
            );
            return Err(EINVAL);
        }

        let reservation = desc.try_change_state(|state| match state {
            Some(DescriptorState::Reserved(reservation)) => (None, Ok(reservation)),
            None => {
                pr_warn!(
                    "EINVAL from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (None, Err(EINVAL))
            }
            allocated => {
                pr_warn!(
                    "EPERM from range_alloc.reservation_abort - offset: {}",
                    offset
                );
                (allocated, Err(EPERM))
            }
        })?;

        let mut size = desc.size;
        let mut offset = desc.offset;
        let free_oneway_space_add = if reservation.is_oneway { size } else { 0 };

        self.free_oneway_space += free_oneway_space_add;

        let mut freed_range = FreedRange::interior_pages(offset, size);
        // Compute how large the next free region needs to be to include one more page in
        // the newly freed range.
        let add_next_page_needed = match (offset + size) % PAGE_SIZE {
            0 => usize::MAX,
            unalign => PAGE_SIZE - unalign,
        };
        // Compute how large the previous free region needs to be to include one more page
        // in the newly freed range.
        let add_prev_page_needed = match offset % PAGE_SIZE {
            0 => usize::MAX,
            unalign => unalign,
        };

        // Merge next into current if next is free
        let remove_next = match cursor.peek_next() {
            Some((_, next)) if next.state.is_none() => {
                if next.size >= add_next_page_needed {
                    freed_range.end_page_idx += 1;
                }
                self.free_tree.remove(&(next.size, next.offset));
                size += next.size;
                true
            }
            _ => false,
        };
        if remove_next {
            let (_, desc) = cursor.current_mut();
            desc.size = size;
            cursor.remove_next();
        }

        // Merge current into prev if prev is free
        match cursor.peek_prev_mut() {
            Some((_, prev)) if prev.state.is_none() => {
                if prev.size >= add_prev_page_needed {
                    freed_range.start_page_idx -= 1;
                }
                // merge previous with current, remove current
                self.free_tree.remove(&(prev.size, prev.offset));
                offset = prev.offset;
                size += prev.size;
                prev.size = size;
                cursor.remove_current();
            }
            _ => {}
        };

        self.free_tree
            .insert(reservation.free_res.into_node((size, offset), ()));

        Ok(freed_range)
    }

    pub(crate) fn reservation_commit(&mut self, offset: usize, data: Option<T>) -> Result {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reservation_commit - offset: {}",
                offset
            );
            ENOENT
        })?;

        desc.try_change_state(|state| match state {
            Some(DescriptorState::Reserved(reservation)) => (
                Some(DescriptorState::Allocated(reservation.allocate(data))),
                Ok(()),
            ),
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reservation_commit - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })
    }

    /// Takes an entry at the given offset from [`DescriptorState::Allocated`] to
    /// [`DescriptorState::Reserved`].
    ///
    /// Returns the size of the existing entry and the data associated with it.
    pub(crate) fn reserve_existing(&mut self, offset: usize) -> Result<(usize, Option<T>)> {
        let desc = self.tree.get_mut(&offset).ok_or_else(|| {
            pr_warn!(
                "ENOENT from range_alloc.reserve_existing - offset: {}",
                offset
            );
            ENOENT
        })?;

        let data = desc.try_change_state(|state| match state {
            Some(DescriptorState::Allocated(allocation)) => {
                let (reservation, data) = allocation.deallocate();
                (Some(DescriptorState::Reserved(reservation)), Ok(data))
            }
            other => {
                pr_warn!(
                    "ENOENT from range_alloc.reserve_existing - offset: {}",
                    offset
                );
                (other, Err(ENOENT))
            }
        })?;

        Ok((desc.size, data))
    }

    /// Call the provided callback at every allocated region.
    ///
    /// This destroys the range allocator. Used only during shutdown.
    pub(crate) fn take_for_each<F: Fn(usize, usize, Option<T>)>(&mut self, callback: F) {
        for (_, desc) in self.tree.iter_mut() {
            if let Some(DescriptorState::Allocated(allocation)) = &mut desc.state {
                callback(desc.offset, desc.size, allocation.take());
            }
        }
    }
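
    // Worked numbers (illustrative only; the mmap size varies by process):
    // with a 1 MiB mmap, `size` is 0x100000, so oneway transactions get
    // `size / 2` = 0x80000 bytes of async space. `reserve_new` starts calling
    // `low_oneway_space` once free async space drops below `size / 10` =
    // 0x19999 bytes, and the check below flags a pid holding more than 50
    // oneway buffers or more than `size / 4` = 0x40000 bytes.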

    /// Find the amount and size of buffers allocated by the current caller.
    ///
    /// The idea is that once we cross the threshold, whoever is responsible
    /// for the low async space is likely to try to send another async transaction,
    /// and at some point we'll catch them in the act. This is more efficient
    /// than keeping a map per pid.
    fn low_oneway_space(&self, calling_pid: Pid) -> bool {
        let mut total_alloc_size = 0;
        let mut num_buffers = 0;
        for (_, desc) in self.tree.iter() {
            if let Some(state) = &desc.state {
                if state.is_oneway() && state.pid() == calling_pid {
                    total_alloc_size += desc.size;
                    num_buffers += 1;
                }
            }
        }

        // Warn if this pid has more than 50 transactions, or more than 50% of
        // async space (which is 25% of total buffer size). Oneway spam is only
        // detected when the threshold is exceeded.
        num_buffers > 50 || total_alloc_size > self.size / 4
    }
}

struct Descriptor<T> {
    size: usize,
    offset: usize,
    state: Option<DescriptorState<T>>,
}

impl<T> Descriptor<T> {
    fn new(offset: usize, size: usize) -> Self {
        Self {
            size,
            offset,
            state: None,
        }
    }

    fn try_change_state<F, Data>(&mut self, f: F) -> Result<Data>
    where
        F: FnOnce(Option<DescriptorState<T>>) -> (Option<DescriptorState<T>>, Result<Data>),
    {
        let (new_state, result) = f(self.state.take());
        self.state = new_state;
        result
    }
}

enum DescriptorState<T> {
    Reserved(Reservation),
    Allocated(Allocation<T>),
}

impl<T> DescriptorState<T> {
    fn new(is_oneway: bool, pid: Pid, free_res: FreeNodeRes) -> Self {
        DescriptorState::Reserved(Reservation {
            is_oneway,
            pid,
            free_res,
        })
    }

    fn pid(&self) -> Pid {
        match self {
            DescriptorState::Reserved(inner) => inner.pid,
            DescriptorState::Allocated(inner) => inner.pid,
        }
    }

    fn is_oneway(&self) -> bool {
        match self {
            DescriptorState::Reserved(inner) => inner.is_oneway,
            DescriptorState::Allocated(inner) => inner.is_oneway,
        }
    }
}

struct Reservation {
    is_oneway: bool,
    pid: Pid,
    free_res: FreeNodeRes,
}

impl Reservation {
    fn allocate<T>(self, data: Option<T>) -> Allocation<T> {
        Allocation {
            data,
            is_oneway: self.is_oneway,
            pid: self.pid,
            free_res: self.free_res,
        }
    }
}

struct Allocation<T> {
    is_oneway: bool,
    pid: Pid,
    free_res: FreeNodeRes,
    data: Option<T>,
}

impl<T> Allocation<T> {
    fn deallocate(self) -> (Reservation, Option<T>) {
        (
            Reservation {
                is_oneway: self.is_oneway,
                pid: self.pid,
                free_res: self.free_res,
            },
            self.data,
        )
    }

    fn take(&mut self) -> Option<T> {
        self.data.take()
    }
}

// (Descriptor.size, Descriptor.offset)
type FreeKey = (usize, usize);
type FreeNodeRes = RBTreeNodeReservation<FreeKey, ()>;

/// An allocation for use by `reserve_new`.
pub(crate) struct ReserveNewBox<T> {
    tree_node_res: RBTreeNodeReservation<usize, Descriptor<T>>,
    free_tree_node_res: FreeNodeRes,
    desc_node_res: FreeNodeRes,
}

impl<T> ReserveNewBox<T> {
    pub(crate) fn try_new() -> Result<Self> {
        let tree_node_res = RBTree::try_reserve_node()?;
        let free_tree_node_res = RBTree::try_reserve_node()?;
        let desc_node_res = RBTree::try_reserve_node()?;
        Ok(Self {
            tree_node_res,
            free_tree_node_res,
            desc_node_res,
        })
    }

    fn initialize(
        self,
        desc: Descriptor<T>,
    ) -> (
        RBTreeNode<usize, Descriptor<T>>,
        RBTreeNode<FreeKey, ()>,
        FreeNodeRes,
    ) {
        let size = desc.size;
        let offset = desc.offset;
        (
            self.tree_node_res.into_node(offset, desc),
            self.free_tree_node_res.into_node((size, offset), ()),
            self.desc_node_res,
        )
    }
}
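
// Informal state-machine summary (commentary, not driver code): a `Descriptor`
// moves from `None` (free) to `Reserved` via `reserve_new`, from `Reserved` to
// `Allocated` via `reservation_commit`, back to `Reserved` via
// `reserve_existing`, and back to `None` via `reservation_abort`.
// `try_change_state` keeps each transition consistent even on failure by
// taking the state out, letting the closure choose the new state, and writing
// it back on both paths, e.g.:
//
//     desc.try_change_state(|state| match state {
//         Some(DescriptorState::Reserved(r)) => (None, Ok(r)), // consume it
//         other => (other, Err(EINVAL)),                       // restore it
//     })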