mm_list.rs

mod mapping;
mod page_fault;

use super::access::KernelPageAccess;
use super::address::{VAddrExt as _, VRangeExt as _};
use super::page_alloc::GlobalPageAlloc;
use super::paging::{AllocZeroed as _, PageUnmanaged};
use super::{AsMemoryBlock, MMArea, Page};
use crate::{prelude::*, sync::ArcSwap};
use alloc::collections::btree_set::BTreeSet;
use arch::DefaultPagingMode;
use bindings::{EEXIST, EFAULT, EINVAL, ENOMEM};
use core::fmt;
use core::sync::atomic::{AtomicUsize, Ordering};
use eonix_mm::address::{Addr as _, PAddr};
use eonix_mm::page_table::{PageAttribute, PagingMode};
use eonix_mm::paging::PFN;
use eonix_mm::{
    address::{AddrOps as _, VAddr, VRange},
    page_table::{PageTable, RawAttribute, PTE},
    paging::PAGE_SIZE,
};
use eonix_runtime::task::Task;
use eonix_sync::{LazyLock, Mutex};

pub use mapping::{FileMapping, Mapping};
pub use page_fault::handle_page_fault;
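
/// A shared, zero-filled page used to back freshly created anonymous and
/// file-mapped PTEs until a page fault provides real backing.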
pub static EMPTY_PAGE: LazyLock<Page> = LazyLock::new(|| Page::zeroed());

static KERNEL_ROOT_TABLE_PAGE: LazyLock<PageUnmanaged> = LazyLock::new(|| unsafe {
    // SAFETY: The kernel page table is always valid.
    PageUnmanaged::from_raw_unchecked(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN)
});
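
/// Access permission requested for a mapping; only the write and execute
/// bits are tracked here.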
#[derive(Debug, Clone, Copy)]
pub struct Permission {
    pub write: bool,
    pub execute: bool,
}
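
/// The lock-protected part of an [`MMList`]: the set of mapped areas, the
/// page table backing them, and the program break bookkeeping.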
struct MMListInner<'a> {
    areas: BTreeSet<MMArea>,
    page_table: PageTable<'a, DefaultPagingMode, GlobalPageAlloc, KernelPageAccess>,
    break_start: Option<VRange>,
    break_pos: Option<VAddr>,
}
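
/// A user address space: the set of memory mappings together with the page
/// table that backs them.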
pub struct MMList {
    inner: ArcSwap<Mutex<MMListInner<'static>>>,
    user_count: AtomicUsize,
    /// Only used in kernel space to switch page tables on context switch.
    root_page_table: AtomicUsize,
}

impl MMListInner<'_> {
    fn overlapping_addr(&self, addr: VAddr) -> Option<&MMArea> {
        self.areas.get(&VRange::from(addr))
    }

    fn check_overlapping_addr(&self, addr: VAddr) -> bool {
        addr.is_user() && self.overlapping_addr(addr).is_none()
    }

    fn overlapping_range(&self, range: VRange) -> impl DoubleEndedIterator<Item = &MMArea> + '_ {
        self.areas.range(range.into_bounds())
    }

    fn check_overlapping_range(&self, range: VRange) -> bool {
        range.is_user() && self.overlapping_range(range).next().is_none()
    }

    fn random_start(&self) -> VAddr {
        VAddr::from(0x1234000)
    }
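
    /// Find a free, page-aligned range of `len` bytes in user space, searching
    /// upwards from `hint` (or from a default start address if `hint` is null).
    /// Returns `None` if no such range exists.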
    fn find_available(&self, mut hint: VAddr, len: usize) -> Option<VAddr> {
        let len = len.div_ceil(PAGE_SIZE) * PAGE_SIZE;

        if hint == VAddr::NULL {
            hint = self.random_start();
        } else {
            hint = hint.floor();
        }

        let mut range = VRange::from(hint).grow(len);
        loop {
            if !range.is_user() {
                return None;
            }

            match self.overlapping_range(range).next_back() {
                None => return Some(range.start()),
                Some(area) => {
                    range = VRange::from(area.range().end().ceil()).grow(len);
                }
            }
        }
    }
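
    /// Unmap `[start, start + len)` from the area list and the page table,
    /// splitting any partially covered areas. Returns the pages that were
    /// backing the range so the caller can free them after flushing TLBs.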
    fn unmap(&mut self, start: VAddr, len: usize) -> KResult<Vec<Page>> {
        assert_eq!(start.floor(), start);
        let end = (start + len).ceil();
        let range_to_unmap = VRange::new(start, end);

        if !range_to_unmap.is_user() {
            return Err(EINVAL);
        }

        let mut left_remaining = None;
        let mut right_remaining = None;
        let mut pages_to_free = Vec::new();

        // TODO: Write back dirty pages.
        self.areas.retain(|area| {
            let Some((left, mid, right)) = area.range().mask_with_checked(&range_to_unmap) else {
                return true;
            };

            for pte in self.page_table.iter_user(mid) {
                let (pfn, _) = pte.take();
                pages_to_free.push(unsafe {
                    // SAFETY: We got the pfn from a valid page table entry, so it should be valid.
                    Page::from_raw(pfn)
                });
            }

            match (left, right) {
                (None, None) => {}
                (Some(left), None) => {
                    assert!(left_remaining.is_none());
                    let (Some(left), _) = area.clone().split(left.end()) else {
                        unreachable!("`left.end()` is within the area");
                    };
                    left_remaining = Some(left);
                }
                (None, Some(right)) => {
                    assert!(right_remaining.is_none());
                    let (_, Some(right)) = area.clone().split(right.start()) else {
                        unreachable!("`right.start()` is within the area");
                    };
                    right_remaining = Some(right);
                }
                (Some(left), Some(right)) => {
                    assert!(left_remaining.is_none());
                    assert!(right_remaining.is_none());
                    let (Some(left), Some(mid)) = area.clone().split(left.end()) else {
                        unreachable!("`left.end()` is within the area");
                    };
                    let (_, Some(right)) = mid.split(right.start()) else {
                        unreachable!("`right.start()` is within the area");
                    };
                    left_remaining = Some(left);
                    right_remaining = Some(right);
                }
            }

            false
        });

        if let Some(front) = left_remaining {
            self.areas.insert(front);
        }
        if let Some(back) = right_remaining {
            self.areas.insert(back);
        }

        Ok(pages_to_free)
    }
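
    /// Record a mapping of `len` bytes at the page-aligned address `at` and
    /// prepare the corresponding page table entries. Fails with `EEXIST` if
    /// the range overlaps an existing area.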
    fn mmap(
        &mut self,
        at: VAddr,
        len: usize,
        mapping: Mapping,
        permission: Permission,
    ) -> KResult<()> {
        assert_eq!(at.floor(), at);
        assert_eq!(len & 0xfff, 0);
        let range = VRange::new(at, at + len);

        // We are doing an area marker insertion.
        if len == 0 && !self.check_overlapping_addr(at) || !self.check_overlapping_range(range) {
            return Err(EEXIST);
        }

        match &mapping {
            Mapping::Anonymous => self.page_table.set_anonymous(range, permission),
            Mapping::File(_) => self.page_table.set_mmapped(range, permission),
        }

        self.areas.insert(MMArea::new(range, mapping, permission));
        Ok(())
    }
}

impl MMList {
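    /// Flush stale TLB entries for this address space on every CPU that is
    /// currently using it. With no users this is a no-op; with a single user
    /// that is the current CPU, a local flush suffices; remote and multi-user
    /// flushes are not implemented yet.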
    async fn flush_user_tlbs(&self) {
        match self.user_count.load(Ordering::Relaxed) {
            0 => {
                // If there are currently no users, we don't need to do anything.
            }
            1 => {
                if PAddr::from(arch::get_root_page_table_pfn()).addr()
                    == self.root_page_table.load(Ordering::Relaxed)
                {
                    // If there is only one user and we are the one using the page
                    // table, flushing the TLB on the local CPU is enough.
                    arch::flush_tlb_all();
                } else {
                    // Send the TLB flush request to the core that is using the page table.
                    todo!();
                }
            }
            _ => {
                // If there is more than one user, we broadcast the TLB flush
                // to all cores.
                todo!()
            }
        }
    }

    pub fn new() -> Self {
        let page_table = PageTable::new(&KERNEL_ROOT_TABLE_PAGE);

        Self {
            root_page_table: AtomicUsize::from(page_table.addr().addr()),
            user_count: AtomicUsize::new(0),
            inner: ArcSwap::new(Mutex::new(MMListInner {
                areas: BTreeSet::new(),
                page_table,
                break_start: None,
                break_pos: None,
            })),
        }
    }
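
    /// Create a copy of this address space: the area list and break state are
    /// cloned, and every mapped page is marked copy-on-write in both the old
    /// and the new page table.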
    pub async fn new_cloned(&self) -> Self {
        let inner = self.inner.borrow();
        let inner = inner.lock().await;

        let page_table = PageTable::new(&KERNEL_ROOT_TABLE_PAGE);
        let list = Self {
            root_page_table: AtomicUsize::from(page_table.addr().addr()),
            user_count: AtomicUsize::new(0),
            inner: ArcSwap::new(Mutex::new(MMListInner {
                areas: inner.areas.clone(),
                page_table,
                break_start: inner.break_start,
                break_pos: inner.break_pos,
            })),
        };

        {
            let list_inner = list.inner.borrow();
            let list_inner = list_inner.lock().await;

            for area in list_inner.areas.iter() {
                list_inner
                    .page_table
                    .set_copy_on_write(&inner.page_table, area.range());
            }
        }

        // We've set some pages as CoW, so we need to invalidate all our users' TLBs.
        self.flush_user_tlbs().await;

        list
    }
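
    /// Switch the current CPU to this address space's root page table and
    /// count it as a user. Expected to be balanced by a later call to
    /// `deactivate` or `switch`.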
    pub fn activate(&self) {
        self.user_count.fetch_add(1, Ordering::Acquire);
        let root_page_table = self.root_page_table.load(Ordering::Relaxed);
        assert_ne!(root_page_table, 0);
        arch::set_root_page_table_pfn(PFN::from(PAddr::from(root_page_table)));
    }

    pub fn deactivate(&self) {
        arch::set_root_page_table_pfn(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN);
        let old_user_count = self.user_count.fetch_sub(1, Ordering::Release);
        assert_ne!(old_user_count, 0);
    }

    /// Activate `self` and deactivate `to` with the root page table changed only once.
    /// This might reduce the overhead of switching page tables twice.
    #[allow(dead_code)]
    pub fn switch(&self, to: &Self) {
        self.user_count.fetch_add(1, Ordering::Acquire);
        let root_page_table = self.root_page_table.load(Ordering::Relaxed);
        assert_ne!(root_page_table, 0);
        arch::set_root_page_table_pfn(PFN::from(PAddr::from(root_page_table)));

        let old_user_count = to.user_count.fetch_sub(1, Ordering::Release);
        assert_ne!(old_user_count, 0);
    }

    /// Replace the current address space (areas and page table) with a new one.
    ///
    /// # Safety
    /// This function should be called only when we are sure that the `MMList` is not
    /// being used by any other thread.
    pub unsafe fn replace(&self, new: Option<Self>) {
        eonix_preempt::disable();

        assert_eq!(
            self.user_count.load(Ordering::Relaxed),
            1,
            "We should be the only user"
        );
        assert_eq!(
            new.as_ref()
                .map(|new_mm| new_mm.user_count.load(Ordering::Relaxed))
                .unwrap_or(0),
            0,
            "`new` must not be used by anyone"
        );

        let old_root_page_table = self.root_page_table.load(Ordering::Relaxed);
        let current_root_page_table = arch::get_root_page_table_pfn();
        assert_eq!(
            PAddr::from(current_root_page_table).addr(),
            old_root_page_table,
            "We should be the only user"
        );

        let new_root_page_table = match &new {
            Some(new_mm) => new_mm.root_page_table.load(Ordering::Relaxed),
            None => PAddr::from(DefaultPagingMode::KERNEL_ROOT_TABLE_PFN).addr(),
        };

        arch::set_root_page_table_pfn(PFN::from(PAddr::from(new_root_page_table)));
        self.root_page_table
            .store(new_root_page_table, Ordering::Relaxed);

        // TODO: Check whether we should wake someone up if they've been put
        // to sleep when calling `vfork`.
        self.inner
            .swap(new.map(|new_mm| new_mm.inner.swap(None)).flatten());

        eonix_preempt::enable();
    }

    /// No need to do invalidation manually, `PageTable` already does it.
    pub async fn unmap(&self, start: VAddr, len: usize) -> KResult<()> {
        let pages_to_free = self.inner.borrow().lock().await.unmap(start, len)?;

        // We need to ensure that the pages are not accessed anymore: anyone who
        // still has them in their TLB could read from or write to them.
        // So flush the TLBs of all our users first.
        self.flush_user_tlbs().await;

        // Then free the pages.
        drop(pages_to_free);

        Ok(())
    }
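
    /// Map `len` bytes at `hint` if that range is free; otherwise (or when
    /// `hint` is null) fall back to the first available range. Returns the
    /// address actually used.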
    pub fn mmap_hint(
        &self,
        hint: VAddr,
        len: usize,
        mapping: Mapping,
        permission: Permission,
    ) -> KResult<VAddr> {
        let inner = self.inner.borrow();
        let mut inner = Task::block_on(inner.lock());

        if hint == VAddr::NULL {
            let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
            inner.mmap(at, len, mapping, permission)?;
            return Ok(at);
        }

        match inner.mmap(hint, len, mapping.clone(), permission) {
            Ok(()) => Ok(hint),
            Err(EEXIST) => {
                let at = inner.find_available(hint, len).ok_or(ENOMEM)?;
                inner.mmap(at, len, mapping, permission)?;
                Ok(at)
            }
            Err(err) => Err(err),
        }
    }

    pub fn mmap_fixed(
        &self,
        at: VAddr,
        len: usize,
        mapping: Mapping,
        permission: Permission,
    ) -> KResult<VAddr> {
        Task::block_on(self.inner.borrow().lock())
            .mmap(at, len, mapping.clone(), permission)
            .map(|_| at)
    }
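
    /// Move the program break to `pos` (rounded up to a page boundary) and
    /// back the new region with anonymous, writable pages. Passing `None`
    /// queries the current break. Returns the resulting break position.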
    pub fn set_break(&self, pos: Option<VAddr>) -> VAddr {
        let inner = self.inner.borrow();
        let mut inner = Task::block_on(inner.lock());

        // `set_break` is only called in syscalls, where the program break
        // should already have been registered.
        assert!(inner.break_start.is_some() && inner.break_pos.is_some());
        let break_start = inner.break_start.unwrap();
        let current_break = inner.break_pos.unwrap();

        let pos = match pos {
            None => return current_break,
            Some(pos) => pos.ceil(),
        };

        let range = VRange::new(current_break, pos);
        if !inner.check_overlapping_range(range) {
            return current_break;
        }

        if !inner.areas.contains(&break_start) {
            inner.areas.insert(MMArea::new(
                break_start,
                Mapping::Anonymous,
                Permission {
                    write: true,
                    execute: false,
                },
            ));
        }

        let program_break = inner
            .areas
            .get(&break_start)
            .expect("Program break area should be valid");

        let len = pos - current_break;
        let range_to_grow = VRange::from(program_break.range().end()).grow(len);

        program_break.grow(len);

        inner.page_table.set_anonymous(
            range_to_grow,
            Permission {
                write: true,
                execute: false,
            },
        );

        inner.break_pos = Some(pos);
        pos
    }

    /// This should be called only **once** per `MMList`.
    pub fn register_break(&self, start: VAddr) {
        let inner = self.inner.borrow();
        let mut inner = Task::block_on(inner.lock());
        assert!(inner.break_start.is_none() && inner.break_pos.is_none());

        inner.break_start = Some(start.into());
        inner.break_pos = Some(start);
    }

    /// Access the memory range `[start, start + len)` with the given function.
    /// The function will be called for each page in the range, with an offset
    /// and a mutable slice of the page data to access.
    pub fn access_mut<F>(&self, start: VAddr, len: usize, func: F) -> KResult<()>
    where
        F: Fn(usize, &mut [u8]),
    {
        // First, validate the address range.
        let end = start + len;
        if !start.is_user() || !end.is_user() {
            return Err(EINVAL);
        }

        let inner = self.inner.borrow();
        let inner = Task::block_on(inner.lock());

        let mut offset = 0;
        let mut remaining = len;
        let mut current = start;

        while remaining > 0 {
            let area = inner.overlapping_addr(current).ok_or(EFAULT)?;

            let area_start = area.range().start();
            let area_end = area.range().end();
            let area_remaining = area_end - current;
            let access_len = remaining.min(area_remaining);
            let access_end = current + access_len;

            for (idx, pte) in inner
                .page_table
                .iter_user(VRange::new(current, access_end))
                .enumerate()
            {
                let page_start = current.floor() + idx * 0x1000;
                let page_end = page_start + 0x1000;

                area.handle(pte, page_start - area_start)?;

                let start_offset = if page_start < current {
                    current - page_start
                } else {
                    0
                };
                let end_offset = if page_end > access_end {
                    access_end - page_start
                } else {
                    0x1000
                };

                unsafe {
                    // SAFETY: We are sure that the page is valid and we have the right to access it.
                    Page::with_raw(pte.get_pfn(), |page| {
                        // SAFETY: The caller guarantees that no one else is using the page.
                        let page_data = page.as_memblk().as_bytes_mut();
                        func(
                            offset + idx * 0x1000,
                            &mut page_data[start_offset..end_offset],
                        );
                    });
                }
            }

            offset += access_len;
            remaining -= access_len;
            current = access_end;
        }

        Ok(())
    }
}

impl fmt::Debug for MMList {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("MMList").finish()
    }
}
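
/// Range-level helpers on the user page table used by the mapping code above.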
trait PageTableExt {
    fn set_anonymous(&self, range: VRange, permission: Permission);
    fn set_mmapped(&self, range: VRange, permission: Permission);
    fn set_copy_on_write(&self, from: &Self, range: VRange);
}

impl PageTableExt for PageTable<'_, DefaultPagingMode, GlobalPageAlloc, KernelPageAccess> {
    fn set_anonymous(&self, range: VRange, permission: Permission) {
        for pte in self.iter_user(range) {
            pte.set_anonymous(permission.execute);
        }
    }

    fn set_mmapped(&self, range: VRange, permission: Permission) {
        for pte in self.iter_user(range) {
            pte.set_mapped(permission.execute);
        }
    }

    fn set_copy_on_write(&self, from: &Self, range: VRange) {
        let to_iter = self.iter_user(range);
        let from_iter = from.iter_user(range);

        for (to, from) in to_iter.zip(from_iter) {
            to.set_copy_on_write(from);
        }
    }
}
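
/// Entry-level helpers for initializing PTEs and sharing them copy-on-write.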
trait PTEExt {
    fn set_anonymous(&mut self, execute: bool);
    fn set_mapped(&mut self, execute: bool);
    fn set_copy_on_write(&mut self, from: &mut Self);
}

impl<T> PTEExt for T
where
    T: PTE,
{
    fn set_anonymous(&mut self, execute: bool) {
        // Writable flag is set during page fault handling while executable flag is
        // preserved across page faults, so we set executable flag now.
        let mut attr = PageAttribute::PRESENT | PageAttribute::USER | PageAttribute::COPY_ON_WRITE;
        attr.set(PageAttribute::EXECUTE, execute);

        self.set(EMPTY_PAGE.clone().into_raw(), T::Attr::from_page_attr(attr));
    }

    fn set_mapped(&mut self, execute: bool) {
        // Writable flag is set during page fault handling while executable flag is
        // preserved across page faults, so we set executable flag now.
        let mut attr = PageAttribute::MAPPED | PageAttribute::USER | PageAttribute::COPY_ON_WRITE;
        attr.set(PageAttribute::EXECUTE, execute);

        self.set(EMPTY_PAGE.clone().into_raw(), T::Attr::from_page_attr(attr));
    }

    fn set_copy_on_write(&mut self, from: &mut Self) {
        let mut from_attr = from
            .get_attr()
            .as_page_attr()
            .expect("Not a page attribute");

        if !from_attr.contains(PageAttribute::PRESENT) {
            return;
        }

        from_attr.remove(PageAttribute::WRITE);
        from_attr.insert(PageAttribute::COPY_ON_WRITE);

        let pfn = unsafe {
            // SAFETY: We get the pfn from a valid page table entry, so it should be valid as well.
            Page::with_raw(from.get_pfn(), |page| page.clone().into_raw())
        };

        self.set(
            pfn,
            T::Attr::from_page_attr(from_attr & !PageAttribute::ACCESSED),
        );
        from.set_attr(T::Attr::from_page_attr(from_attr));
    }
}