mod mapping;
mod page_fault;

use super::access::KernelPageAccess;
use super::address::{VAddrExt as _, VRangeExt as _};
use super::page_alloc::GlobalPageAlloc;
use super::paging::{AllocZeroed as _, PageUnmanaged};
use super::{AsMemoryBlock, MMArea, Page};
use crate::{prelude::*, sync::ArcSwap};
use alloc::collections::btree_set::BTreeSet;
use arch::DefaultPagingMode;
use bindings::{EEXIST, EFAULT, EINVAL, ENOMEM};
use core::fmt;
use core::sync::atomic::{AtomicUsize, Ordering};
use eonix_mm::address::{Addr as _, PAddr};
use eonix_mm::page_table::{PageAttribute, PagingMode};
use eonix_mm::paging::PFN;
use eonix_mm::{
    address::{AddrOps as _, VAddr, VRange},
    page_table::{PageTable, RawAttribute, PTE},
    paging::PAGE_SIZE,
};
use eonix_runtime::task::Task;
use eonix_sync::{LazyLock, Mutex};

pub use mapping::{FileMapping, Mapping};
pub use page_fault::handle_page_fault;

pub static EMPTY_PAGE: LazyLock<Page> = LazyLock::new(|| Page::zeroed());

static KERNEL_ROOT_TABLE_PAGE: LazyLock<PageUnmanaged> = LazyLock::new(|| unsafe {
    // SAFETY: The kernel page table is always valid.
    PageUnmanaged::from_raw_unchecked(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN)
});

#[derive(Debug, Clone, Copy)]
pub struct Permission {
    pub write: bool,
    pub execute: bool,
}

struct MMListInner<'a> {
    areas: BTreeSet<MMArea>,
    page_table: PageTable<'a, DefaultPagingMode, GlobalPageAlloc, KernelPageAccess>,
    break_start: Option<VRange>,
    break_pos: Option<VAddr>,
}

pub struct MMList {
    inner: ArcSwap<Mutex<MMListInner<'static>>>,
    user_count: AtomicUsize,
    /// Only used in kernel space to switch page tables on context switch.
    root_page_table: AtomicUsize,
}

impl MMListInner<'_> {
    fn overlapping_addr(&self, addr: VAddr) -> Option<&MMArea> {
        self.areas.get(&VRange::from(addr))
    }

    fn check_overlapping_addr(&self, addr: VAddr) -> bool {
        addr.is_user() && self.overlapping_addr(addr).is_none()
    }

    fn overlapping_range(&self, range: VRange) -> impl DoubleEndedIterator<Item = &MMArea> + '_ {
        self.areas.range(range.into_bounds())
    }

    fn check_overlapping_range(&self, range: VRange) -> bool {
        range.is_user() && self.overlapping_range(range).next().is_none()
    }

    fn random_start(&self) -> VAddr {
        VAddr::from(0x1234000)
    }

    fn find_available(&self, mut hint: VAddr, len: usize) -> Option<VAddr> {
        let len = len.div_ceil(PAGE_SIZE) * PAGE_SIZE;

        if hint == VAddr::NULL {
            hint = self.random_start();
        } else {
            hint = hint.floor();
        }

        let mut range = VRange::from(hint).grow(len);

        loop {
            if !range.is_user() {
                return None;
            }

            match self.overlapping_range(range).next_back() {
                None => return Some(range.start()),
                Some(area) => {
                    range = VRange::from(area.range().end().ceil()).grow(len);
                }
            }
        }
    }
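
    /// Unmap `[start, start + len)` from this address space.
    ///
    /// Partially covered areas are split and their remaining halves are kept, while the
    /// pages that backed the unmapped range are collected and returned so that the caller
    /// can free them after flushing the TLBs.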
    fn unmap(&mut self, start: VAddr, len: usize) -> KResult<Vec<Page>> {
        assert_eq!(start.floor(), start);
        let end = (start + len).ceil();
        let range_to_unmap = VRange::new(start, end);
        if !range_to_unmap.is_user() {
            return Err(EINVAL);
        }

        let mut left_remaining = None;
        let mut right_remaining = None;
        let mut pages_to_free = Vec::new();

        // TODO: Write back dirty pages.
        self.areas.retain(|area| {
            let Some((left, mid, right)) = area.range().mask_with_checked(&range_to_unmap) else {
                return true;
            };

            for pte in self.page_table.iter_user(mid) {
                let (pfn, _) = pte.take();
                pages_to_free.push(unsafe {
                    // SAFETY: We got the pfn from a valid page table entry, so it should be valid.
                    Page::from_raw(pfn)
                });
            }

            match (left, right) {
                (None, None) => {}
                (Some(left), None) => {
                    assert!(left_remaining.is_none());
                    let (Some(left), _) = area.clone().split(left.end()) else {
                        unreachable!("`left.end()` is within the area");
                    };
                    left_remaining = Some(left);
                }
                (None, Some(right)) => {
                    assert!(right_remaining.is_none());
                    let (_, Some(right)) = area.clone().split(right.start()) else {
                        unreachable!("`right.start()` is within the area");
                    };
                    right_remaining = Some(right);
                }
                (Some(left), Some(right)) => {
                    assert!(left_remaining.is_none());
                    assert!(right_remaining.is_none());
                    let (Some(left), Some(mid)) = area.clone().split(left.end()) else {
                        unreachable!("`left.end()` is within the area");
                    };
                    let (_, Some(right)) = mid.split(right.start()) else {
                        unreachable!("`right.start()` is within the area");
                    };
                    left_remaining = Some(left);
                    right_remaining = Some(right);
                }
            }

            false
        });

        if let Some(front) = left_remaining {
            self.areas.insert(front);
        }
        if let Some(back) = right_remaining {
            self.areas.insert(back);
        }

        Ok(pages_to_free)
    }

    fn mmap(
        &mut self,
        at: VAddr,
        len: usize,
        mapping: Mapping,
        permission: Permission,
    ) -> KResult<()> {
        assert_eq!(at.floor(), at);
        assert_eq!(len & 0xfff, 0);
        let range = VRange::new(at, at + len);

        // When `len == 0`, we are doing an area marker insertion.
        if len == 0 && !self.check_overlapping_addr(at) || !self.check_overlapping_range(range) {
            return Err(EEXIST);
        }

        match &mapping {
            Mapping::Anonymous => self.page_table.set_anonymous(range, permission),
            Mapping::File(_) => self.page_table.set_mmapped(range, permission),
        }

        self.areas.insert(MMArea::new(range, mapping, permission));
        Ok(())
    }
}

impl MMList {
    async fn flush_user_tlbs(&self) {
        match self.user_count.load(Ordering::Relaxed) {
            0 => {
                // If there are currently no users, we don't need to do anything.
            }
            1 => {
                if PAddr::from(arch::get_root_page_table_pfn()).addr()
                    == self.root_page_table.load(Ordering::Relaxed)
                {
                    // If there is only one user and we are using the page table,
                    // flushing the TLB on the local CPU only is enough.
                    arch::flush_tlb_all();
                } else {
                    // Send the TLB flush request to the core that is using the page table.
                    todo!();
                }
            }
            _ => {
                // If there is more than one user, we broadcast the TLB flush
                // to all cores.
                todo!()
            }
        }
    }

    pub fn new() -> Self {
        let page_table = PageTable::new(&KERNEL_ROOT_TABLE_PAGE);

        Self {
            root_page_table: AtomicUsize::from(page_table.addr().addr()),
            user_count: AtomicUsize::new(0),
            inner: ArcSwap::new(Mutex::new(MMListInner {
                areas: BTreeSet::new(),
                page_table,
                break_start: None,
                break_pos: None,
            })),
        }
    }
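
    /// Create a clone of this address space, sharing all mapped pages copy-on-write.
    ///
    /// Every area is copied into the new `MMList`, the present pages are marked
    /// copy-on-write in both page tables, and the TLBs of all current users are
    /// flushed afterwards.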
    pub async fn new_cloned(&self) -> Self {
        let inner = self.inner.borrow();
        let inner = inner.lock().await;

        let page_table = PageTable::new(&KERNEL_ROOT_TABLE_PAGE);
        let list = Self {
            root_page_table: AtomicUsize::from(page_table.addr().addr()),
            user_count: AtomicUsize::new(0),
            inner: ArcSwap::new(Mutex::new(MMListInner {
                areas: inner.areas.clone(),
                page_table,
                break_start: inner.break_start,
                break_pos: inner.break_pos,
            })),
        };

        {
            let list_inner = list.inner.borrow();
            let list_inner = list_inner.lock().await;

            for area in list_inner.areas.iter() {
                list_inner
                    .page_table
                    .set_copy_on_write(&inner.page_table, area.range());
            }
        }

        // We've set some pages as CoW, so we need to invalidate all our users' TLBs.
        self.flush_user_tlbs().await;

        list
    }

    pub fn activate(&self) {
        self.user_count.fetch_add(1, Ordering::Acquire);

        let root_page_table = self.root_page_table.load(Ordering::Relaxed);
        assert_ne!(root_page_table, 0);

        arch::set_root_page_table_pfn(PFN::from(PAddr::from(root_page_table)));
    }

    pub fn deactivate(&self) {
        arch::set_root_page_table_pfn(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN);

        let old_user_count = self.user_count.fetch_sub(1, Ordering::Release);
        assert_ne!(old_user_count, 0);
    }

    /// Activate `self` and deactivate `to` with the root page table changed only once.
    /// This might reduce the overhead of switching page tables twice.
    #[allow(dead_code)]
    pub fn switch(&self, to: &Self) {
        self.user_count.fetch_add(1, Ordering::Acquire);

        let root_page_table = self.root_page_table.load(Ordering::Relaxed);
        assert_ne!(root_page_table, 0);

        arch::set_root_page_table_pfn(PFN::from(PAddr::from(root_page_table)));

        let old_user_count = to.user_count.fetch_sub(1, Ordering::Release);
        assert_ne!(old_user_count, 0);
    }

    /// Replace the current page table with a new one.
    ///
    /// # Safety
    /// This function should be called only when we are sure that the `MMList` is not
    /// being used by any other thread.
    pub unsafe fn replace(&self, new: Option<Self>) {
        eonix_preempt::disable();

        assert_eq!(
            self.user_count.load(Ordering::Relaxed),
            1,
            "We should be the only user"
        );
        assert_eq!(
            new.as_ref()
                .map(|new_mm| new_mm.user_count.load(Ordering::Relaxed))
                .unwrap_or(0),
            0,
            "`new` must not be used by anyone"
        );

        let old_root_page_table = self.root_page_table.load(Ordering::Relaxed);
        let current_root_page_table = arch::get_root_page_table_pfn();
        assert_eq!(
            PAddr::from(current_root_page_table).addr(),
            old_root_page_table,
            "We should be the only user"
        );

        let new_root_page_table = match &new {
            Some(new_mm) => new_mm.root_page_table.load(Ordering::Relaxed),
            None => PAddr::from(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN).addr(),
        };

        arch::set_root_page_table_pfn(PFN::from(PAddr::from(new_root_page_table)));
        self.root_page_table
            .store(new_root_page_table, Ordering::Relaxed);

        // TODO: Check whether we should wake someone up if they've been put
        // to sleep when calling `vfork`.
        self.inner
            .swap(new.map(|new_mm| new_mm.inner.swap(None)).flatten());

        eonix_preempt::enable();
    }

    /// No need to do invalidation manually; `PageTable` already does it.
    pub async fn unmap(&self, start: VAddr, len: usize) -> KResult<()> {
        let pages_to_free = self.inner.borrow().lock().await.unmap(start, len)?;

        // We need to ensure that the pages are not accessed anymore.
        // Users that still have these pages in their TLBs could read from or write to them.
        // So flush the TLBs first for all our users.
        self.flush_user_tlbs().await;

        // Then free the pages.
        drop(pages_to_free);

        Ok(())
    }
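
    /// Map `len` bytes with the given mapping and permission, using `hint` as the
    /// preferred address. If `hint` is `VAddr::NULL` or the hinted range is already
    /// occupied, the first available user range is used instead. Returns the address
    /// the mapping was placed at.
    ///
    /// # Example
    ///
    /// A minimal sketch of an anonymous, writable mapping (`mm_list` is hypothetical):
    ///
    /// ```ignore
    /// let addr = mm_list.mmap_hint(
    ///     VAddr::NULL,
    ///     PAGE_SIZE,
    ///     Mapping::Anonymous,
    ///     Permission { write: true, execute: false },
    /// )?;
    /// ```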
    pub fn mmap_hint(
        &self,
        hint: VAddr,
        len: usize,
        mapping: Mapping,
        permission: Permission,
    ) -> KResult<VAddr> {
        let inner = self.inner.borrow();
        let mut inner = Task::block_on(inner.lock());

        if hint == VAddr::NULL {
            let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
            inner.mmap(at, len, mapping, permission)?;
            return Ok(at);
        }

        match inner.mmap(hint, len, mapping.clone(), permission) {
            Ok(()) => Ok(hint),
            Err(EEXIST) => {
                let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
                inner.mmap(at, len, mapping, permission)?;
                Ok(at)
            }
            Err(err) => Err(err),
        }
    }

    pub fn mmap_fixed(
        &self,
        at: VAddr,
        len: usize,
        mapping: Mapping,
        permission: Permission,
    ) -> KResult<VAddr> {
        Task::block_on(self.inner.borrow().lock())
            .mmap(at, len, mapping.clone(), permission)
            .map(|_| at)
    }

    pub fn set_break(&self, pos: Option<VAddr>) -> VAddr {
        let inner = self.inner.borrow();
        let mut inner = Task::block_on(inner.lock());

        // SAFETY: `set_break` is only called in syscalls, where the program break should be valid.
        assert!(inner.break_start.is_some() && inner.break_pos.is_some());
        let break_start = inner.break_start.unwrap();
        let current_break = inner.break_pos.unwrap();

        let pos = match pos {
            None => return current_break,
            Some(pos) => pos.ceil(),
        };

        let range = VRange::new(current_break, pos);
        if !inner.check_overlapping_range(range) {
            return current_break;
        }

        if !inner.areas.contains(&break_start) {
            inner.areas.insert(MMArea::new(
                break_start,
                Mapping::Anonymous,
                Permission {
                    write: true,
                    execute: false,
                },
            ));
        }

        let program_break = inner
            .areas
            .get(&break_start)
            .expect("Program break area should be valid");

        let len = pos - current_break;
        let range_to_grow = VRange::from(program_break.range().end()).grow(len);

        program_break.grow(len);

        inner.page_table.set_anonymous(
            range_to_grow,
            Permission {
                write: true,
                execute: false,
            },
        );

        inner.break_pos = Some(pos);
        pos
    }

    /// This should be called only **once** for every thread.
    pub fn register_break(&self, start: VAddr) {
        let inner = self.inner.borrow();
        let mut inner = Task::block_on(inner.lock());

        assert!(inner.break_start.is_none() && inner.break_pos.is_none());
        inner.break_start = Some(start.into());
        inner.break_pos = Some(start);
    }

    /// Access the memory area with the given function.
    /// The function will be called with the offset into the accessed range and a mutable
    /// slice of the corresponding data within each page.
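    ///
    /// # Example
    ///
    /// A minimal sketch that zeroes part of a mapped range (the address and length
    /// are hypothetical):
    ///
    /// ```ignore
    /// mm_list.access_mut(VAddr::from(0x1235000), 16, |_offset, bytes| {
    ///     // `bytes` covers the part of the page that falls inside the accessed range.
    ///     bytes.fill(0);
    /// })?;
    /// ```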
    pub fn access_mut<F>(&self, start: VAddr, len: usize, func: F) -> KResult<()>
    where
        F: Fn(usize, &mut [u8]),
    {
        // First, validate the address range.
        let end = start + len;
        if !start.is_user() || !end.is_user() {
            return Err(EINVAL);
        }

        let inner = self.inner.borrow();
        let inner = Task::block_on(inner.lock());

        let mut offset = 0;
        let mut remaining = len;
        let mut current = start;

        while remaining > 0 {
            let area = inner.overlapping_addr(current).ok_or(EFAULT)?;

            let area_start = area.range().start();
            let area_end = area.range().end();
            let area_remaining = area_end - current;
            let access_len = remaining.min(area_remaining);
            let access_end = current + access_len;

            for (idx, pte) in inner
                .page_table
                .iter_user(VRange::new(current, access_end))
                .enumerate()
            {
                let page_start = current.floor() + idx * 0x1000;
                let page_end = page_start + 0x1000;

                area.handle(pte, page_start - area_start)?;

                let start_offset = if page_start < current {
                    current - page_start
                } else {
                    0
                };

                let end_offset = if page_end > access_end {
                    access_end - page_start
                } else {
                    0x1000
                };

                unsafe {
                    // SAFETY: We are sure that the page is valid and we have the right to access it.
                    Page::with_raw(pte.get_pfn(), |page| {
                        // SAFETY: The caller guarantees that no one else is using the page.
                        let page_data = page.as_memblk().as_bytes_mut();
                        func(
                            offset + idx * 0x1000,
                            &mut page_data[start_offset..end_offset],
                        );
                    });
                }
            }

            offset += access_len;
            remaining -= access_len;
            current = access_end;
        }

        Ok(())
    }
}

impl fmt::Debug for MMList {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("MMList").finish()
    }
}

trait PageTableExt {
    fn set_anonymous(&self, range: VRange, permission: Permission);
    fn set_mmapped(&self, range: VRange, permission: Permission);
    fn set_copy_on_write(&self, from: &Self, range: VRange);
}

impl PageTableExt for PageTable<'_, DefaultPagingMode, GlobalPageAlloc, KernelPageAccess> {
    fn set_anonymous(&self, range: VRange, permission: Permission) {
        for pte in self.iter_user(range) {
            pte.set_anonymous(permission.execute);
        }
    }

    fn set_mmapped(&self, range: VRange, permission: Permission) {
        for pte in self.iter_user(range) {
            pte.set_mapped(permission.execute);
        }
    }

    fn set_copy_on_write(&self, from: &Self, range: VRange) {
        let to_iter = self.iter_user(range);
        let from_iter = from.iter_user(range);

        for (to, from) in to_iter.zip(from_iter) {
            to.set_copy_on_write(from);
        }
    }
}

trait PTEExt {
    fn set_anonymous(&mut self, execute: bool);
    fn set_mapped(&mut self, execute: bool);
    fn set_copy_on_write(&mut self, from: &mut Self);
}

impl<T> PTEExt for T
where
    T: PTE,
{
    fn set_anonymous(&mut self, execute: bool) {
        // The writable flag is set during page fault handling, while the executable flag
        // is preserved across page faults, so we set the executable flag now.
        let mut attr = PageAttribute::PRESENT | PageAttribute::USER | PageAttribute::COPY_ON_WRITE;
        attr.set(PageAttribute::EXECUTE, execute);

        self.set(EMPTY_PAGE.clone().into_raw(), T::Attr::from_page_attr(attr));
    }

    fn set_mapped(&mut self, execute: bool) {
        // The writable flag is set during page fault handling, while the executable flag
        // is preserved across page faults, so we set the executable flag now.
        let mut attr = PageAttribute::MAPPED | PageAttribute::USER | PageAttribute::COPY_ON_WRITE;
        attr.set(PageAttribute::EXECUTE, execute);

        self.set(EMPTY_PAGE.clone().into_raw(), T::Attr::from_page_attr(attr));
    }

    fn set_copy_on_write(&mut self, from: &mut Self) {
        let mut from_attr = from
            .get_attr()
            .as_page_attr()
            .expect("Not a page attribute");

        if !from_attr.contains(PageAttribute::PRESENT) {
            return;
        }

        from_attr.remove(PageAttribute::WRITE);
        from_attr.insert(PageAttribute::COPY_ON_WRITE);

        let pfn = unsafe {
            // SAFETY: We get the pfn from a valid page table entry, so it should be valid as well.
            Page::with_raw(from.get_pfn(), |page| page.clone().into_raw())
        };

        self.set(
            pfn,
            T::Attr::from_page_attr(from_attr & !PageAttribute::ACCESSED),
        );
        from.set_attr(T::Attr::from_page_attr(from_attr));
    }
}